danieldk (HF Staff) committed 25 days ago

Commit

f9dfc57

verified ·

1 Parent(s): 578605e

Build uploaded using `kernels`.

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

build/torch210-cxx11-cpu-x86_64-linux/{_megablocks_cpu_6e04dec.abi3.so → _megablocks_cpu_a45325d.abi3.so} +1 -1
build/torch210-cxx11-cpu-x86_64-linux/_ops.py +3 -3
build/torch210-cxx11-cpu-x86_64-linux/megablocks/__init__.py +2 -2
build/torch210-cxx11-cpu-x86_64-linux/metadata.json +4 -1
build/torch210-cxx11-cpu-x86_64-linux/ops/histogram_benchmark.py +26 -26
build/torch210-cxx11-cpu-x86_64-linux/ops/matmul_benchmark.py +362 -362
build/torch210-cxx11-cpu-x86_64-linux/ops/padded_scatter_benchmark.py +45 -45
build/torch210-cxx11-cpu-x86_64-linux/ops/permute_benchmark.py +118 -118
build/torch210-cxx11-cpu-x86_64-linux/ops/sort_benchmark.py +27 -27
build/torch210-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops_test.py +35 -35
build/torch210-cxx11-cpu-x86_64-linux/stk/ops/linear_ops_test.py +116 -116
build/torch210-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops_test.py +52 -52
build/torch210-cxx11-cpu-x86_64-linux/stk/random/random_ops_test.py +63 -63
build/torch210-cxx11-cu126-x86_64-linux/{_megablocks_cuda_6e04dec.abi3.so → _megablocks_cuda_a45325d.abi3.so} +1 -1
build/torch210-cxx11-cu126-x86_64-linux/_ops.py +3 -3
build/torch210-cxx11-cu126-x86_64-linux/megablocks/__init__.py +2 -2
build/torch210-cxx11-cu126-x86_64-linux/ops/histogram_benchmark.py +26 -26
build/torch210-cxx11-cu126-x86_64-linux/ops/matmul_benchmark.py +362 -362
build/torch210-cxx11-cu126-x86_64-linux/ops/padded_scatter_benchmark.py +45 -45
build/torch210-cxx11-cu126-x86_64-linux/ops/permute_benchmark.py +118 -118
build/torch210-cxx11-cu126-x86_64-linux/ops/sort_benchmark.py +27 -27
build/torch210-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops_test.py +35 -35
build/torch210-cxx11-cu126-x86_64-linux/stk/ops/linear_ops_test.py +116 -116
build/torch210-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops_test.py +52 -52
build/torch210-cxx11-cu126-x86_64-linux/stk/random/random_ops_test.py +63 -63
build/torch210-cxx11-cu128-x86_64-linux/{_megablocks_cuda_6e04dec.abi3.so → _megablocks_cuda_a45325d.abi3.so} +1 -1
build/torch210-cxx11-cu128-x86_64-linux/_ops.py +3 -3
build/torch210-cxx11-cu128-x86_64-linux/megablocks/__init__.py +2 -2
build/torch210-cxx11-cu128-x86_64-linux/ops/histogram_benchmark.py +26 -26
build/torch210-cxx11-cu128-x86_64-linux/ops/matmul_benchmark.py +362 -362
build/torch210-cxx11-cu128-x86_64-linux/ops/padded_scatter_benchmark.py +45 -45
build/torch210-cxx11-cu128-x86_64-linux/ops/permute_benchmark.py +118 -118
build/torch210-cxx11-cu128-x86_64-linux/ops/sort_benchmark.py +27 -27
build/torch210-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops_test.py +35 -35
build/torch210-cxx11-cu128-x86_64-linux/stk/ops/linear_ops_test.py +116 -116
build/torch210-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops_test.py +52 -52
build/torch210-cxx11-cu128-x86_64-linux/stk/random/random_ops_test.py +63 -63
build/torch210-cxx11-cu130-x86_64-linux/{_megablocks_cuda_6e04dec.abi3.so → _megablocks_cuda_a45325d.abi3.so} +1 -1
build/torch210-cxx11-cu130-x86_64-linux/_ops.py +3 -3
build/torch210-cxx11-cu130-x86_64-linux/megablocks/__init__.py +2 -2
build/torch210-cxx11-cu130-x86_64-linux/ops/histogram_benchmark.py +26 -26
build/torch210-cxx11-cu130-x86_64-linux/ops/matmul_benchmark.py +362 -362
build/torch210-cxx11-cu130-x86_64-linux/ops/padded_scatter_benchmark.py +45 -45
build/torch210-cxx11-cu130-x86_64-linux/ops/permute_benchmark.py +118 -118
build/torch210-cxx11-cu130-x86_64-linux/ops/sort_benchmark.py +27 -27
build/torch210-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops_test.py +35 -35
build/torch210-cxx11-cu130-x86_64-linux/stk/ops/linear_ops_test.py +116 -116
build/torch210-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops_test.py +52 -52
build/torch210-cxx11-cu130-x86_64-linux/stk/random/random_ops_test.py +63 -63
build/torch210-cxx11-xpu20253-x86_64-linux/{_megablocks_xpu_6e04dec.abi3.so → _megablocks_xpu_a45325d.abi3.so} +1 -1

build/torch210-cxx11-cpu-x86_64-linux/{_megablocks_cpu_6e04dec.abi3.so → _megablocks_cpu_a45325d.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:70b79b772262fee7ee79153a54dc208c9166f4c34680f752b7bc2ce8d8ae1f74
 size 2219080

 version https://git-lfs.github.com/spec/v1
+oid sha256:ef67276bfac31793120c3afb0e2579d50f8af875102a253f92ac8f170eec604b
 size 2219080

build/torch210-cxx11-cpu-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cpu_6e04dec
-ops = torch.ops._megablocks_cpu_6e04dec
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cpu_6e04dec::{op_name}"

 import torch
+from . import _megablocks_cpu_a45325d
+ops = torch.ops._megablocks_cpu_a45325d
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_cpu_a45325d::{op_name}"

build/torch210-cxx11-cpu-x86_64-linux/megablocks/__init__.py CHANGED Viewed

@@ -1,10 +1,10 @@
 import ctypes
 import sys
-import importlib
 from pathlib import Path
 from types import ModuleType
 def _import_from_path(file_path: Path) -> ModuleType:
     # We cannot use the module name as-is, after adding it to `sys.modules`,
     # it would also be used for other imports. So, we make a module name that

 import ctypes
+import importlib.util
 import sys
 from pathlib import Path
 from types import ModuleType
 def _import_from_path(file_path: Path) -> ModuleType:
     # We cannot use the module name as-is, after adding it to `sys.modules`,
     # it would also be used for other imports. So, we make a module name that

build/torch210-cxx11-cpu-x86_64-linux/metadata.json CHANGED Viewed

@@ -1,5 +1,8 @@
 {
   "version": 1,
   "license": "Apache-2.0",
-  "python-depends": []
 }

 {
   "version": 1,
   "license": "Apache-2.0",
+  "python-depends": [],
+  "backend": {
+    "type": "cpu"
+  }
 }

build/torch210-cxx11-cpu-x86_64-linux/ops/histogram_benchmark.py CHANGED Viewed

@@ -5,7 +5,7 @@ import unittest
 import numpy as np
 import torch
-from absl.testing import parameterized
 from .. import ops
@@ -47,31 +47,31 @@ def log_benchmark(arguments, mean_t, std_t):
     print('=' * 60)
-class HistogramBenchmark(parameterized.TestCase):
-    @parameterized.parameters(*_HISTOGRAM_TESTS)
-    def testHistogram(self, n, dtype, max_val):
-        x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
-        mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
-        arguments = {
-            'n': n,
-            'dtype': dtype,
-            'max_val': max_val,
-        }
-        log_benchmark(arguments, mean_t, std_t)
-    @parameterized.parameters(*_HISTOGRAM_TESTS)
-    def testTorchHistogram(self, n, dtype, max_val):
-        x = torch.randint(0, 128, (n,)).cuda().to(dtype)
-        mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
-        arguments = {
-            'n': n,
-            'dtype': dtype,
-            'max_val': max_val,
-        }
-        log_benchmark(arguments, mean_t, std_t)
 if __name__ == '__main__':

 import numpy as np
 import torch
+# from absl.testing import parameterized
 from .. import ops
     print('=' * 60)
+# class HistogramBenchmark(parameterized.TestCase):
+#
+#     @parameterized.parameters(*_HISTOGRAM_TESTS)
+#     def testHistogram(self, n, dtype, max_val):
+#         x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+#         mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+#         arguments = {
+#             'n': n,
+#             'dtype': dtype,
+#             'max_val': max_val,
+#         }
+#         log_benchmark(arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_HISTOGRAM_TESTS)
+#     def testTorchHistogram(self, n, dtype, max_val):
+#         x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+#         mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+#         arguments = {
+#             'n': n,
+#             'dtype': dtype,
+#             'max_val': max_val,
+#         }
+#         log_benchmark(arguments, mean_t, std_t)
 if __name__ == '__main__':

build/torch210-cxx11-cpu-x86_64-linux/ops/matmul_benchmark.py CHANGED Viewed

@@ -17,7 +17,7 @@ import unittest
 from .. import stk
 import torch
-from absl.testing import parameterized
 from .. import benchmark_util, ops
@@ -48,367 +48,367 @@ def log_benchmark(name, arguments, time, std, flops):
     print('=' * 60)
-class MatmulBenchmark(parameterized.TestCase):
-    def build_sparse_matrix(self, x, padded_bins, fhs, ne):
-        blocking = 128
-        padded_tokens, _ = x.size()
-        assert padded_tokens % blocking == 0
-        assert fhs % blocking == 0
-        # Offsets for the sparse matrix. All rows have the
-        # same number of nonzero blocks dictated by the
-        # dimensionality of a single expert.
-        block_rows = padded_tokens // blocking
-        blocks_per_row = fhs // blocking
-        offsets = torch.arange(
-            0,
-            block_rows * blocks_per_row + 1,
-            blocks_per_row,
-            dtype=torch.int32,
-            device=x.device,
-        )
-        # Indices for the sparse matrix. The indices for
-        # the intermediate matrix are dynamic depending
-        # on the mapping of tokens to experts.
-        column_indices = ops.topology(
-            padded_bins,
-            blocking,
-            block_rows,
-            blocks_per_row,
-        )
-        data = torch.empty(
-            column_indices.numel(),
-            blocking,
-            blocking,
-            dtype=torch.float16,
-            device=x.device,
-        )
-        shape = (padded_tokens, fhs * ne)
-        row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
-        return stk.Matrix(shape, data, row_indices, column_indices, offsets)
-    def build_input_matrix(self, sl, hs, ne):
-        x = torch.randn((sl, hs)).cuda().half()
-        # Assign tokens to experts uniformly.
-        top_expert = torch.arange(0, sl).cuda().int() % ne
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(top_expert, ne)
-        padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
-        padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
-        return out, padded_bins
-    def build_weight_matrix(self, ne, hs, fhs):
-        return torch.randn((hs, ne * fhs)).cuda().half()
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        w = transpose_view(w)
-        def benchmark():
-            return stk.ops.sdd(x, w, topo)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0::Fwd::SDD::NT',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        def benchmark():
-            return stk.ops.dsd(topo, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0::GradX::DSD::NN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        topo = topo.t()
-        def benchmark():
-            return stk.ops.dsd(topo, x)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0::GradW::DSD::TN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        def benchmark():
-            return stk.ops.dsd(x, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::Fwd::DSD::NN',
-            arguments,
-            mean_t,
-            std_t,
-            x.nnz * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        out = stk.ops.dsd(x, w)
-        w = transpose_view(w)
-        def benchmark():
-            return stk.ops.sdd(out, w, x)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::GradX::SDD::NT',
-            arguments,
-            mean_t,
-            std_t,
-            x.nnz * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        out = stk.ops.dsd(x, w)
-        x = x.t()
-        def benchmark():
-            return stk.ops.dsd(x, out)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::GradW::DSD::TN',
-            arguments,
-            mean_t,
-            std_t,
-            x.nnz * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, hs)).cuda().half()
-        w = torch.randn((ne, hs, fhs)).cuda().half()
-        w = w.transpose(1, 2).contiguous()
-        w = w.transpose(1, 2)
-        def benchmark():
-            return torch.bmm(x, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0::Fwd:DDD::NT',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, hs)).cuda().half()
-        w = torch.randn((ne, hs, fhs)).cuda().half()
-        out = torch.bmm(x, w)
-        w = w.transpose(1, 2).contiguous()
-        def benchmark():
-            return torch.bmm(out, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0:GradX:DDD::NN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, hs)).cuda().half()
-        w = torch.randn((ne, hs, fhs)).cuda().half()
-        out = torch.bmm(x, w)
-        out = out.transpose(1, 2)
-        def benchmark():
-            return torch.bmm(out, x)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0:GradW:DDD::TN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, fhs)).cuda().half()
-        w = torch.randn((ne, fhs, hs)).cuda().half()
-        def benchmark():
-            return torch.bmm(x, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::Fwd::DDD::NN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, fhs)).cuda().half()
-        w = torch.randn((ne, fhs, hs)).cuda().half()
-        out = torch.bmm(x, w)
-        w = torch.transpose(w, 1, 2)
-        def benchmark():
-            return torch.bmm(out, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::GradX::DDD::NT',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, fhs)).cuda().half()
-        w = torch.randn((ne, fhs, hs)).cuda().half()
-        out = torch.bmm(x, w)
-        x = torch.transpose(x, 1, 2)
-        def benchmark():
-            return torch.bmm(x, out)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::GradW::DDD::TN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * hs * 2,
-        )
 if __name__ == '__main__':

 from .. import stk
 import torch
+# from absl.testing import parameterized
 from .. import benchmark_util, ops
     print('=' * 60)
+# class MatmulBenchmark(parameterized.TestCase):
+#
+#     def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+#         blocking = 128
+#         padded_tokens, _ = x.size()
+#         assert padded_tokens % blocking == 0
+#         assert fhs % blocking == 0
+#
+#         # Offsets for the sparse matrix. All rows have the
+#         # same number of nonzero blocks dictated by the
+#         # dimensionality of a single expert.
+#         block_rows = padded_tokens // blocking
+#         blocks_per_row = fhs // blocking
+#         offsets = torch.arange(
+#             0,
+#             block_rows * blocks_per_row + 1,
+#             blocks_per_row,
+#             dtype=torch.int32,
+#             device=x.device,
+#         )
+#
+#         # Indices for the sparse matrix. The indices for
+#         # the intermediate matrix are dynamic depending
+#         # on the mapping of tokens to experts.
+#         column_indices = ops.topology(
+#             padded_bins,
+#             blocking,
+#             block_rows,
+#             blocks_per_row,
+#         )
+#         data = torch.empty(
+#             column_indices.numel(),
+#             blocking,
+#             blocking,
+#             dtype=torch.float16,
+#             device=x.device,
+#         )
+#         shape = (padded_tokens, fhs * ne)
+#         row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+#         return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+#     def build_input_matrix(self, sl, hs, ne):
+#         x = torch.randn((sl, hs)).cuda().half()
+#
+#         # Assign tokens to experts uniformly.
+#         top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+#         padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+#         padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#         out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+#         return out, padded_bins
+#
+#     def build_weight_matrix(self, ne, hs, fhs):
+#         return torch.randn((hs, ne * fhs)).cuda().half()
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#         w = transpose_view(w)
+#
+#         def benchmark():
+#             return stk.ops.sdd(x, w, topo)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0::Fwd::SDD::NT',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+#         def benchmark():
+#             return stk.ops.dsd(topo, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0::GradX::DSD::NN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#         topo = topo.t()
+#
+#         def benchmark():
+#             return stk.ops.dsd(topo, x)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0::GradW::DSD::TN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+#         def benchmark():
+#             return stk.ops.dsd(x, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::Fwd::DSD::NN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.nnz * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#         out = stk.ops.dsd(x, w)
+#         w = transpose_view(w)
+#
+#         def benchmark():
+#             return stk.ops.sdd(out, w, x)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::GradX::SDD::NT',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.nnz * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#         out = stk.ops.dsd(x, w)
+#         x = x.t()
+#
+#         def benchmark():
+#             return stk.ops.dsd(x, out)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::GradW::DSD::TN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.nnz * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, hs)).cuda().half()
+#         w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+#         w = w.transpose(1, 2).contiguous()
+#         w = w.transpose(1, 2)
+#
+#         def benchmark():
+#             return torch.bmm(x, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0::Fwd:DDD::NT',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, hs)).cuda().half()
+#         w = torch.randn((ne, hs, fhs)).cuda().half()
+#         out = torch.bmm(x, w)
+#         w = w.transpose(1, 2).contiguous()
+#
+#         def benchmark():
+#             return torch.bmm(out, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0:GradX:DDD::NN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, hs)).cuda().half()
+#         w = torch.randn((ne, hs, fhs)).cuda().half()
+#         out = torch.bmm(x, w)
+#         out = out.transpose(1, 2)
+#
+#         def benchmark():
+#             return torch.bmm(out, x)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0:GradW:DDD::TN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+#         w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+#         def benchmark():
+#             return torch.bmm(x, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::Fwd::DDD::NN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+#         w = torch.randn((ne, fhs, hs)).cuda().half()
+#         out = torch.bmm(x, w)
+#         w = torch.transpose(w, 1, 2)
+#
+#         def benchmark():
+#             return torch.bmm(out, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::GradX::DDD::NT',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+#         w = torch.randn((ne, fhs, hs)).cuda().half()
+#         out = torch.bmm(x, w)
+#         x = torch.transpose(x, 1, 2)
+#
+#         def benchmark():
+#             return torch.bmm(x, out)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::GradW::DDD::TN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * hs * 2,
+#         )
 if __name__ == '__main__':

build/torch210-cxx11-cpu-x86_64-linux/ops/padded_scatter_benchmark.py CHANGED Viewed

@@ -4,7 +4,7 @@
 import unittest
 import torch
-from absl.testing import parameterized
 from .. import benchmark_util, ops
@@ -16,50 +16,50 @@ _PADDED_SCATTER_BENCHMARK = (
 )
-class PaddedScatterTest(parameterized.TestCase):
-    @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
-    def testPaddedScatter(self, sl, hs, ne, top_k):
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        # Randomly assign tokens to experts.
-        top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(top_expert, ne)
-        padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
-        padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        # Sample weights for the scatter reduce.
-        weights = torch.rand((sl * top_k,)).cuda().half()
-        # Gather the data to prepare for backwards.
-        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
-        def benchmark():
-            return ops.padded_scatter(
-                x,
-                indices,
-                bin_ids,
-                weights,
-                bins,
-                padded_bins,
-                top_k,
-            )
-        time, std = benchmark_util.benchmark_function(benchmark)
-        benchmark_util.log_benchmark(
-            'Padded Scatter',
-            {
-                'sequence_length': sl,
-                'hidden_size': hs,
-                'num_experts': ne,
-                'top_k': top_k,
-            },
-            time,
-            std,
-        )
 if __name__ == '__main__':

 import unittest
 import torch
+# from absl.testing import parameterized
 from .. import benchmark_util, ops
 )
+# class PaddedScatterTest(parameterized.TestCase):
+#
+#     @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+#     def testPaddedScatter(self, sl, hs, ne, top_k):
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#
+#         # Randomly assign tokens to experts.
+#         top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+#         padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+#         padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+#         # Sample weights for the scatter reduce.
+#         weights = torch.rand((sl * top_k,)).cuda().half()
+#
+#         # Gather the data to prepare for backwards.
+#         x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+#         def benchmark():
+#             return ops.padded_scatter(
+#                 x,
+#                 indices,
+#                 bin_ids,
+#                 weights,
+#                 bins,
+#                 padded_bins,
+#                 top_k,
+#             )
+#
+#         time, std = benchmark_util.benchmark_function(benchmark)
+#         benchmark_util.log_benchmark(
+#             'Padded Scatter',
+#             {
+#                 'sequence_length': sl,
+#                 'hidden_size': hs,
+#                 'num_experts': ne,
+#                 'top_k': top_k,
+#             },
+#             time,
+#             std,
+#         )
 if __name__ == '__main__':

build/torch210-cxx11-cpu-x86_64-linux/ops/permute_benchmark.py CHANGED Viewed

@@ -4,7 +4,7 @@
 import unittest
 import torch
-from absl.testing import parameterized
 from .. import benchmark_util, ops
@@ -26,123 +26,123 @@ _PERMUTE_TESTS = (
 )
-class PermuteBenchmark(parameterized.TestCase):
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testBinnedGather(self, sl, hs, ne):
-        # NOTE: Capacity factor == 1.
-        ec = sl // ne
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        top_expert = torch.randint(0, ne, (sl,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(indices, ne)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        def benchmark():
-            return ops.binned_gather(x, indices, bins, ec)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testBinnedScatter(self, sl, hs, ne):
-        # NOTE: Capacity factor == 1.
-        ec = sl // ne
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        top_expert = torch.randint(0, ne, (sl,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(indices, ne)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        x = ops.binned_gather(x, indices, bins, ec)
-        def benchmark():
-            return ops.binned_scatter(x, indices, bins)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testPaddedGather(self, sl, hs, ne):
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        # Randomly assign tokens to experts.
-        top_expert = torch.randint(0, ne, (sl,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(top_expert, ne)
-        padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
-        padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        def benchmark():
-            return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testPaddedScatter(self, sl, hs, ne):
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        # Randomly assign tokens to experts.
-        top_expert = torch.randint(0, ne, (sl,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(top_expert, ne)
-        padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
-        padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
-        def benchmark():
-            return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testCopy(self, sl, hs, ne):
-        # NOTE: Capacity factor == 1.
-        # ec = sl // ne
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        y = x.clone()
-        def benchmark():
-            return y.copy_(x)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
 if __name__ == '__main__':

 import unittest
 import torch
+# from absl.testing import parameterized
 from .. import benchmark_util, ops
 )
+# class PermuteBenchmark(parameterized.TestCase):
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testBinnedGather(self, sl, hs, ne):
+#         # NOTE: Capacity factor == 1.
+#         ec = sl // ne
+#
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#         top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(indices, ne)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+#         def benchmark():
+#             return ops.binned_gather(x, indices, bins, ec)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testBinnedScatter(self, sl, hs, ne):
+#         # NOTE: Capacity factor == 1.
+#         ec = sl // ne
+#
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#         top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(indices, ne)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#         x = ops.binned_gather(x, indices, bins, ec)
+#
+#         def benchmark():
+#             return ops.binned_scatter(x, indices, bins)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testPaddedGather(self, sl, hs, ne):
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#
+#         # Randomly assign tokens to experts.
+#         top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+#         padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+#         padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+#         def benchmark():
+#             return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testPaddedScatter(self, sl, hs, ne):
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#
+#         # Randomly assign tokens to experts.
+#         top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+#         padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+#         padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#         x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+#         def benchmark():
+#             return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testCopy(self, sl, hs, ne):
+#         # NOTE: Capacity factor == 1.
+#         # ec = sl // ne
+#
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#         y = x.clone()
+#
+#         def benchmark():
+#             return y.copy_(x)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
 if __name__ == '__main__':

build/torch210-cxx11-cpu-x86_64-linux/ops/sort_benchmark.py CHANGED Viewed

@@ -5,7 +5,7 @@ import unittest
 import numpy as np
 import torch
-from absl.testing import parameterized
 from .. import ops
@@ -53,32 +53,32 @@ def log_benchmark(arguments, mean_t, std_t):
     print('=' * 60)
-class SortBenchmark(parameterized.TestCase):
-    @parameterized.parameters(*_SORT_TESTS)
-    def testSort(self, n, dtype, max_val):
-        if max_val is None:
-            max_val = np.iinfo(numpy_dtype(dtype)).max
-        end_bit = int(np.ceil(np.log2(max_val)))
-        x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
-        mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
-        arguments = {
-            'n': n,
-            'dtype': dtype,
-            'max_val': max_val,
-        }
-        log_benchmark(arguments, mean_t, std_t)
-    @parameterized.parameters(*_BASELINE_SORT_TESTS)
-    def testTorchSort(self, n):
-        x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
-        mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
-        arguments = {
-            'n': n,
-        }
-        log_benchmark(arguments, mean_t, std_t)
 if __name__ == '__main__':

 import numpy as np
 import torch
+# from absl.testing import parameterized
 from .. import ops
     print('=' * 60)
+# class SortBenchmark(parameterized.TestCase):
+#
+#     @parameterized.parameters(*_SORT_TESTS)
+#     def testSort(self, n, dtype, max_val):
+#         if max_val is None:
+#             max_val = np.iinfo(numpy_dtype(dtype)).max
+#         end_bit = int(np.ceil(np.log2(max_val)))
+#         x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+#         mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+#         arguments = {
+#             'n': n,
+#             'dtype': dtype,
+#             'max_val': max_val,
+#         }
+#         log_benchmark(arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_BASELINE_SORT_TESTS)
+#     def testTorchSort(self, n):
+#         x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+#         mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+#         arguments = {
+#             'n': n,
+#         }
+#         log_benchmark(arguments, mean_t, std_t)
 if __name__ == '__main__':

build/torch210-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops_test.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import unittest
 import itertools
 import torch
-from absl.testing import parameterized
 import stk
 from stk.ops.linear_ops_test import allclose, _dense_and_sparse
@@ -47,40 +47,40 @@ def _dense_and_sparse_like(x, std=0.1):
     return (dense.requires_grad_(True),
             sparse.requires_grad_(True))
-@parameterized.parameters(_ELTWISE_OP_TESTS)
-class EltwiseOpsTest(parameterized.TestCase):
-    def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
-        a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
-        b_dense, b = _dense_and_sparse_like(a)
-        out = stk.ops.mul(a, b)
-        expected_out = torch.mul(a_dense, b_dense)
-        # Compute the gradients w.r.t. the inputs.
-        expected_out.sum().backward()
-        stk.ops.sum(out).backward()
-        # Validate the results.
-        out = stk.ops.to_dense(out)
-        self.assertEqual(out.dim(), 2)
-        self.assertEqual(expected_out.size(), out.size())
-        self.assertTrue(allclose(out, expected_out))
-        # LHS gradient.
-        grad = stk.ops.to_dense(a.grad)
-        expected_grad = a_dense.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size(), grad.size())
-        self.assertTrue(allclose(grad, expected_grad))
-        # RHS gradient.
-        grad =  stk.ops.to_dense(b.grad)
-        expected_grad = b_dense.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size(), grad.size())
-        self.assertTrue(allclose(grad, expected_grad))
 if __name__ == '__main__':
     unittest.main()

 import unittest
 import itertools
 import torch
+# from absl.testing import parameterized
 import stk
 from stk.ops.linear_ops_test import allclose, _dense_and_sparse
     return (dense.requires_grad_(True),
             sparse.requires_grad_(True))
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+#     def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+#         a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#         b_dense, b = _dense_and_sparse_like(a)
+#
+#         out = stk.ops.mul(a, b)
+#         expected_out = torch.mul(a_dense, b_dense)
+#
+#         # Compute the gradients w.r.t. the inputs.
+#         expected_out.sum().backward()
+#         stk.ops.sum(out).backward()
+#
+#         # Validate the results.
+#         out = stk.ops.to_dense(out)
+#         self.assertEqual(out.dim(), 2)
+#         self.assertEqual(expected_out.size(), out.size())
+#         self.assertTrue(allclose(out, expected_out))
+#
+#         # LHS gradient.
+#         grad = stk.ops.to_dense(a.grad)
+#         expected_grad = a_dense.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size(), grad.size())
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#         # RHS gradient.
+#         grad =  stk.ops.to_dense(b.grad)
+#         expected_grad = b_dense.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size(), grad.size())
+#         self.assertTrue(allclose(grad, expected_grad))
 if __name__ == '__main__':
     unittest.main()

build/torch210-cxx11-cpu-x86_64-linux/stk/ops/linear_ops_test.py CHANGED Viewed

@@ -2,7 +2,7 @@ import unittest
 import itertools
 import numpy as np
 import torch
-from absl.testing import parameterized
 import stk
@@ -96,121 +96,121 @@ def _mask(x, mask):
     return x * mask
-@parameterized.parameters(*_LINEAR_OP_TESTS)
-class LinearOpsTest(parameterized.TestCase):
-    def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
-        # Construct the operands.
-        a_shape = (k, m) if trans_a else (m, k)
-        a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
-        b_shape = (n, k) if trans_b else (k, n)
-        b, bcp = _dense_2x(*b_shape, dtype)
-        # Execute the matmul.
-        out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
-        expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
-        # Compute the gradients w.r.t. the inputs.
-        expected_out.sum().backward()
-        out.sum().backward()
-        # Validate the results.
-        self.assertEqual(out.dim(), 2)
-        self.assertEqual(expected_out.size()[0], out.size()[0])
-        self.assertEqual(expected_out.size()[1], out.size()[1])
-        self.assertTrue(allclose(out, expected_out))
-        # LHS gradient.
-        grad = stk.ops.to_dense(a.grad)
-        expected_grad = _mask(a_dense.grad, a.grad)
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-        # RHS gradient.
-        grad = b.grad
-        expected_grad = bcp.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-    def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
-        # Construct the operands.
-        a_shape = (k, m) if trans_a else (m, k)
-        a, acp = _dense_2x(*a_shape, dtype)
-        b_shape = (n, k) if trans_b else (k, n)
-        b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
-        # Execute the matmul.
-        out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
-        expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
-        # Compute the gradients w.r.t. the inputs.
-        expected_out.sum().backward()
-        out.sum().backward()
-        # Validate the results.
-        self.assertEqual(out.dim(), 2)
-        self.assertEqual(expected_out.size()[0], out.size()[0])
-        self.assertEqual(expected_out.size()[1], out.size()[1])
-        self.assertTrue(allclose(out, expected_out))
-        # LHS gradient.
-        grad = a.grad
-        expected_grad = acp.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-        # RHS gradient.
-        grad = stk.ops.to_dense(b.grad)
-        expected_grad = _mask(b_dense.grad, b.grad)
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-    def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
-        # Construct the operands.
-        a_shape = (k, m) if trans_a else (m, k)
-        a, acp = _dense_2x(*a_shape, dtype)
-        b_shape = (n, k) if trans_b else (k, n)
-        b, bcp = _dense_2x(*b_shape, dtype)
-        _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
-        # Execute the matmul.
-        out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
-        expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
-        # Compute the gradients w.r.t. the inputs.
-        expected_out.sum().backward()
-        stk.ops.sum(out).backward()
-        # Validate the results.
-        out = stk.ops.to_dense(out)
-        self.assertEqual(out.dim(), 2)
-        self.assertEqual(expected_out.size()[0], out.size()[0])
-        self.assertEqual(expected_out.size()[1], out.size()[1])
-        self.assertTrue(allclose(out, expected_out))
-        # LHS gradient.
-        grad = a.grad
-        expected_grad = acp.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-        # RHS gradient.
-        grad = b.grad
-        expected_grad = bcp.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
 if __name__ == '__main__':
     unittest.main()

 import itertools
 import numpy as np
 import torch
+# from absl.testing import parameterized
 import stk
     return x * mask
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+#     def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+#         # Construct the operands.
+#         a_shape = (k, m) if trans_a else (m, k)
+#         a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+#         b_shape = (n, k) if trans_b else (k, n)
+#         b, bcp = _dense_2x(*b_shape, dtype)
+#
+#         # Execute the matmul.
+#         out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+#         expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+#         # Compute the gradients w.r.t. the inputs.
+#         expected_out.sum().backward()
+#         out.sum().backward()
+#
+#         # Validate the results.
+#         self.assertEqual(out.dim(), 2)
+#         self.assertEqual(expected_out.size()[0], out.size()[0])
+#         self.assertEqual(expected_out.size()[1], out.size()[1])
+#         self.assertTrue(allclose(out, expected_out))
+#
+#         # LHS gradient.
+#         grad = stk.ops.to_dense(a.grad)
+#         expected_grad = _mask(a_dense.grad, a.grad)
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#         # RHS gradient.
+#         grad = b.grad
+#         expected_grad = bcp.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#     def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+#         # Construct the operands.
+#         a_shape = (k, m) if trans_a else (m, k)
+#         a, acp = _dense_2x(*a_shape, dtype)
+#         b_shape = (n, k) if trans_b else (k, n)
+#         b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+#         # Execute the matmul.
+#         out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+#         expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+#         # Compute the gradients w.r.t. the inputs.
+#         expected_out.sum().backward()
+#         out.sum().backward()
+#
+#         # Validate the results.
+#         self.assertEqual(out.dim(), 2)
+#         self.assertEqual(expected_out.size()[0], out.size()[0])
+#         self.assertEqual(expected_out.size()[1], out.size()[1])
+#         self.assertTrue(allclose(out, expected_out))
+#
+#         # LHS gradient.
+#         grad = a.grad
+#         expected_grad = acp.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#         # RHS gradient.
+#         grad = stk.ops.to_dense(b.grad)
+#         expected_grad = _mask(b_dense.grad, b.grad)
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#     def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+#         # Construct the operands.
+#         a_shape = (k, m) if trans_a else (m, k)
+#         a, acp = _dense_2x(*a_shape, dtype)
+#         b_shape = (n, k) if trans_b else (k, n)
+#         b, bcp = _dense_2x(*b_shape, dtype)
+#         _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+#         # Execute the matmul.
+#         out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+#         expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+#         # Compute the gradients w.r.t. the inputs.
+#         expected_out.sum().backward()
+#         stk.ops.sum(out).backward()
+#
+#         # Validate the results.
+#         out = stk.ops.to_dense(out)
+#         self.assertEqual(out.dim(), 2)
+#         self.assertEqual(expected_out.size()[0], out.size()[0])
+#         self.assertEqual(expected_out.size()[1], out.size()[1])
+#         self.assertTrue(allclose(out, expected_out))
+#
+#         # LHS gradient.
+#         grad = a.grad
+#         expected_grad = acp.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#         # RHS gradient.
+#         grad = b.grad
+#         expected_grad = bcp.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
 if __name__ == '__main__':
     unittest.main()

build/torch210-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops_test.py CHANGED Viewed

@@ -1,61 +1,61 @@
 import unittest
-from absl.testing import parameterized
 import stk
 import torch
-@parameterized.parameters(
-    (8, 16, 0.0, 1),
-    (8, 16, 0.5, 1),
-    (8, 16, .95, 1),
-    (16, 8, 0.0, 1),
-    (16, 8, 0.5, 1),
-    (16, 8, .95, 1),
-    (8, 16, 0.0, 8),
-    (8, 16, 0.5, 8),
-    (8, 16, 1.0, 8),
-    (16, 8, 0.0, 8),
-    (16, 8, 0.5, 8),
-    (16, 8, 1.0, 8),
-    (128, 256, 0.5, 16),
-    (256, 128, 0.75, 32),
-    (512, 512, .875, 128))
-class MatrixOpsTest(parameterized.TestCase):
-    def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
-        mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
-        x = (torch.randn(rows, cols) * mask).type(torch.float16)
-        # Convert the matrix to sparse format.
-        sparse_x = stk.ops.to_sparse(x, blocking)
-        # Validate the matrix.
-        sparse_x.validate()
-        # Validate the shape.
-        self.assertEqual(sparse_x.dim(), 2)
-        self.assertEqual(sparse_x.size()[0], rows)
-        self.assertEqual(sparse_x.size()[1], cols)
-        # Validate the sparsity.
-        numblocks = rows // blocking * cols // blocking
-        nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
-        self.assertEqual(sparse_x.nnz, nnz)
-        # Convert back to dense format.
-        dense_x = stk.ops.to_dense(sparse_x)
-        # Validate the shape.
-        self.assertEqual(dense_x.dim(), 2)
-        self.assertEqual(dense_x.size()[0], rows)
-        self.assertEqual(dense_x.size()[1], cols)
-        # Validate the sparsity
-        self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
-        # Validate the output.
-        self.assertTrue(torch.all(torch.eq(x, dense_x)))
 if __name__ == '__main__':

 import unittest
+# from absl.testing import parameterized
 import stk
 import torch
+# @parameterized.parameters(
+#     (8, 16, 0.0, 1),
+#     (8, 16, 0.5, 1),
+#     (8, 16, .95, 1),
+#     (16, 8, 0.0, 1),
+#     (16, 8, 0.5, 1),
+#     (16, 8, .95, 1),
+#     (8, 16, 0.0, 8),
+#     (8, 16, 0.5, 8),
+#     (8, 16, 1.0, 8),
+#     (16, 8, 0.0, 8),
+#     (16, 8, 0.5, 8),
+#     (16, 8, 1.0, 8),
+#     (128, 256, 0.5, 16),
+#     (256, 128, 0.75, 32),
+#     (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+#     def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+#         mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+#         x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+#         # Convert the matrix to sparse format.
+#         sparse_x = stk.ops.to_sparse(x, blocking)
+#
+#         # Validate the matrix.
+#         sparse_x.validate()
+#
+#         # Validate the shape.
+#         self.assertEqual(sparse_x.dim(), 2)
+#         self.assertEqual(sparse_x.size()[0], rows)
+#         self.assertEqual(sparse_x.size()[1], cols)
+#
+#         # Validate the sparsity.
+#         numblocks = rows // blocking * cols // blocking
+#         nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+#         self.assertEqual(sparse_x.nnz, nnz)
+#
+#         # Convert back to dense format.
+#         dense_x = stk.ops.to_dense(sparse_x)
+#
+#         # Validate the shape.
+#         self.assertEqual(dense_x.dim(), 2)
+#         self.assertEqual(dense_x.size()[0], rows)
+#         self.assertEqual(dense_x.size()[1], cols)
+#
+#         # Validate the sparsity
+#         self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+#         # Validate the output.
+#         self.assertTrue(torch.all(torch.eq(x, dense_x)))
 if __name__ == '__main__':

build/torch210-cxx11-cpu-x86_64-linux/stk/random/random_ops_test.py CHANGED Viewed

@@ -1,72 +1,72 @@
 import unittest
-from absl.testing import parameterized
 from . import random
 import torch
-@parameterized.parameters(
-    (8, 16, 0.0, 1),
-    (8, 16, 0.5, 1),
-    (8, 16, .95, 1),
-    (16, 8, 0.0, 1),
-    (16, 8, 0.5, 1),
-    (16, 8, .95, 1),
-    (8, 16, 0.0, 8),
-    (8, 16, 0.5, 8),
-    (8, 16, 1.0, 8),
-    (16, 8, 0.0, 8),
-    (16, 8, 0.5, 8),
-    (16, 8, 1.0, 8),
-    (128, 256, 0.5, 16),
-    (256, 128, 0.75, 32),
-    (512, 512, .875, 128))
-class RandomOpsTest(parameterized.TestCase):
-    def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
-        mask = random.dense_mask(
-            rows, cols, sparsity, blocking)
-        # Validate the shape.
-        self.assertEqual(mask.dim(), 2)
-        self.assertEqual(mask.size()[0], rows)
-        self.assertEqual(mask.size()[1], cols)
-        # Validate the sparsity
-        numblocks = rows // blocking * cols // blocking
-        nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
-        self.assertEqual(
-            torch.count_nonzero(mask).item(),
-            nnz)
-        # Check values are zero or one.
-        self.assertTrue(
-            torch.all(torch.logical_or(
-                torch.eq(mask, 0),
-                torch.eq(mask, 1))))
-    def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
-        mask = random.mask(
-            rows, cols, sparsity, blocking)
-        # Validate the matrix.
-        mask.validate()
-        # Validate the shape.
-        self.assertEqual(mask.dim(), 2)
-        self.assertEqual(mask.size()[0], rows)
-        self.assertEqual(mask.size()[1], cols)
-        # Validate the sparsity.
-        numblocks = rows // blocking * cols // blocking
-        nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
-        self.assertEqual(mask.nnz, nnz)
-        # Check values are zero or one.
-        self.assertTrue(
-            torch.all(torch.logical_or(
-                torch.eq(mask.data, 0),
-                torch.eq(mask.data, 1))))
 if __name__ == '__main__':

 import unittest
+# from absl.testing import parameterized
 from . import random
 import torch
+# @parameterized.parameters(
+#     (8, 16, 0.0, 1),
+#     (8, 16, 0.5, 1),
+#     (8, 16, .95, 1),
+#     (16, 8, 0.0, 1),
+#     (16, 8, 0.5, 1),
+#     (16, 8, .95, 1),
+#     (8, 16, 0.0, 8),
+#     (8, 16, 0.5, 8),
+#     (8, 16, 1.0, 8),
+#     (16, 8, 0.0, 8),
+#     (16, 8, 0.5, 8),
+#     (16, 8, 1.0, 8),
+#     (128, 256, 0.5, 16),
+#     (256, 128, 0.75, 32),
+#     (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+#     def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+#         mask = random.dense_mask(
+#             rows, cols, sparsity, blocking)
+#
+#         # Validate the shape.
+#         self.assertEqual(mask.dim(), 2)
+#         self.assertEqual(mask.size()[0], rows)
+#         self.assertEqual(mask.size()[1], cols)
+#
+#         # Validate the sparsity
+#         numblocks = rows // blocking * cols // blocking
+#         nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+#         self.assertEqual(
+#             torch.count_nonzero(mask).item(),
+#             nnz)
+#
+#         # Check values are zero or one.
+#         self.assertTrue(
+#             torch.all(torch.logical_or(
+#                 torch.eq(mask, 0),
+#                 torch.eq(mask, 1))))
+#
+#     def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+#         mask = random.mask(
+#             rows, cols, sparsity, blocking)
+#
+#         # Validate the matrix.
+#         mask.validate()
+#
+#         # Validate the shape.
+#         self.assertEqual(mask.dim(), 2)
+#         self.assertEqual(mask.size()[0], rows)
+#         self.assertEqual(mask.size()[1], cols)
+#
+#         # Validate the sparsity.
+#         numblocks = rows // blocking * cols // blocking
+#         nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+#         self.assertEqual(mask.nnz, nnz)
+#
+#         # Check values are zero or one.
+#         self.assertTrue(
+#             torch.all(torch.logical_or(
+#                 torch.eq(mask.data, 0),
+#                 torch.eq(mask.data, 1))))
 if __name__ == '__main__':

build/torch210-cxx11-cu126-x86_64-linux/{_megablocks_cuda_6e04dec.abi3.so → _megablocks_cuda_a45325d.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:55948eae893317a5e500315e47efd66c4482bb67449caef3f512b2cabffb7dc6
 size 15061056

 version https://git-lfs.github.com/spec/v1
+oid sha256:a96ca4ac1ee02742edef4fb7f45497be39d31dc897f35a7c1a3663e1c41e050c
 size 15061056

build/torch210-cxx11-cu126-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cuda_6e04dec
-ops = torch.ops._megablocks_cuda_6e04dec
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cuda_6e04dec::{op_name}"

 import torch
+from . import _megablocks_cuda_a45325d
+ops = torch.ops._megablocks_cuda_a45325d
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_cuda_a45325d::{op_name}"

build/torch210-cxx11-cu126-x86_64-linux/megablocks/__init__.py CHANGED Viewed

@@ -1,10 +1,10 @@
 import ctypes
 import sys
-import importlib
 from pathlib import Path
 from types import ModuleType
 def _import_from_path(file_path: Path) -> ModuleType:
     # We cannot use the module name as-is, after adding it to `sys.modules`,
     # it would also be used for other imports. So, we make a module name that

 import ctypes
+import importlib.util
 import sys
 from pathlib import Path
 from types import ModuleType
 def _import_from_path(file_path: Path) -> ModuleType:
     # We cannot use the module name as-is, after adding it to `sys.modules`,
     # it would also be used for other imports. So, we make a module name that

build/torch210-cxx11-cu126-x86_64-linux/ops/histogram_benchmark.py CHANGED Viewed

@@ -5,7 +5,7 @@ import unittest
 import numpy as np
 import torch
-from absl.testing import parameterized
 from .. import ops
@@ -47,31 +47,31 @@ def log_benchmark(arguments, mean_t, std_t):
     print('=' * 60)
-class HistogramBenchmark(parameterized.TestCase):
-    @parameterized.parameters(*_HISTOGRAM_TESTS)
-    def testHistogram(self, n, dtype, max_val):
-        x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
-        mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
-        arguments = {
-            'n': n,
-            'dtype': dtype,
-            'max_val': max_val,
-        }
-        log_benchmark(arguments, mean_t, std_t)
-    @parameterized.parameters(*_HISTOGRAM_TESTS)
-    def testTorchHistogram(self, n, dtype, max_val):
-        x = torch.randint(0, 128, (n,)).cuda().to(dtype)
-        mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
-        arguments = {
-            'n': n,
-            'dtype': dtype,
-            'max_val': max_val,
-        }
-        log_benchmark(arguments, mean_t, std_t)
 if __name__ == '__main__':

 import numpy as np
 import torch
+# from absl.testing import parameterized
 from .. import ops
     print('=' * 60)
+# class HistogramBenchmark(parameterized.TestCase):
+#
+#     @parameterized.parameters(*_HISTOGRAM_TESTS)
+#     def testHistogram(self, n, dtype, max_val):
+#         x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+#         mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+#         arguments = {
+#             'n': n,
+#             'dtype': dtype,
+#             'max_val': max_val,
+#         }
+#         log_benchmark(arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_HISTOGRAM_TESTS)
+#     def testTorchHistogram(self, n, dtype, max_val):
+#         x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+#         mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+#         arguments = {
+#             'n': n,
+#             'dtype': dtype,
+#             'max_val': max_val,
+#         }
+#         log_benchmark(arguments, mean_t, std_t)
 if __name__ == '__main__':

build/torch210-cxx11-cu126-x86_64-linux/ops/matmul_benchmark.py CHANGED Viewed

@@ -17,7 +17,7 @@ import unittest
 from .. import stk
 import torch
-from absl.testing import parameterized
 from .. import benchmark_util, ops
@@ -48,367 +48,367 @@ def log_benchmark(name, arguments, time, std, flops):
     print('=' * 60)
-class MatmulBenchmark(parameterized.TestCase):
-    def build_sparse_matrix(self, x, padded_bins, fhs, ne):
-        blocking = 128
-        padded_tokens, _ = x.size()
-        assert padded_tokens % blocking == 0
-        assert fhs % blocking == 0
-        # Offsets for the sparse matrix. All rows have the
-        # same number of nonzero blocks dictated by the
-        # dimensionality of a single expert.
-        block_rows = padded_tokens // blocking
-        blocks_per_row = fhs // blocking
-        offsets = torch.arange(
-            0,
-            block_rows * blocks_per_row + 1,
-            blocks_per_row,
-            dtype=torch.int32,
-            device=x.device,
-        )
-        # Indices for the sparse matrix. The indices for
-        # the intermediate matrix are dynamic depending
-        # on the mapping of tokens to experts.
-        column_indices = ops.topology(
-            padded_bins,
-            blocking,
-            block_rows,
-            blocks_per_row,
-        )
-        data = torch.empty(
-            column_indices.numel(),
-            blocking,
-            blocking,
-            dtype=torch.float16,
-            device=x.device,
-        )
-        shape = (padded_tokens, fhs * ne)
-        row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
-        return stk.Matrix(shape, data, row_indices, column_indices, offsets)
-    def build_input_matrix(self, sl, hs, ne):
-        x = torch.randn((sl, hs)).cuda().half()
-        # Assign tokens to experts uniformly.
-        top_expert = torch.arange(0, sl).cuda().int() % ne
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(top_expert, ne)
-        padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
-        padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
-        return out, padded_bins
-    def build_weight_matrix(self, ne, hs, fhs):
-        return torch.randn((hs, ne * fhs)).cuda().half()
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        w = transpose_view(w)
-        def benchmark():
-            return stk.ops.sdd(x, w, topo)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0::Fwd::SDD::NT',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        def benchmark():
-            return stk.ops.dsd(topo, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0::GradX::DSD::NN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        topo = topo.t()
-        def benchmark():
-            return stk.ops.dsd(topo, x)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0::GradW::DSD::TN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        def benchmark():
-            return stk.ops.dsd(x, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::Fwd::DSD::NN',
-            arguments,
-            mean_t,
-            std_t,
-            x.nnz * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        out = stk.ops.dsd(x, w)
-        w = transpose_view(w)
-        def benchmark():
-            return stk.ops.sdd(out, w, x)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::GradX::SDD::NT',
-            arguments,
-            mean_t,
-            std_t,
-            x.nnz * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        out = stk.ops.dsd(x, w)
-        x = x.t()
-        def benchmark():
-            return stk.ops.dsd(x, out)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::GradW::DSD::TN',
-            arguments,
-            mean_t,
-            std_t,
-            x.nnz * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, hs)).cuda().half()
-        w = torch.randn((ne, hs, fhs)).cuda().half()
-        w = w.transpose(1, 2).contiguous()
-        w = w.transpose(1, 2)
-        def benchmark():
-            return torch.bmm(x, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0::Fwd:DDD::NT',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, hs)).cuda().half()
-        w = torch.randn((ne, hs, fhs)).cuda().half()
-        out = torch.bmm(x, w)
-        w = w.transpose(1, 2).contiguous()
-        def benchmark():
-            return torch.bmm(out, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0:GradX:DDD::NN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, hs)).cuda().half()
-        w = torch.randn((ne, hs, fhs)).cuda().half()
-        out = torch.bmm(x, w)
-        out = out.transpose(1, 2)
-        def benchmark():
-            return torch.bmm(out, x)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0:GradW:DDD::TN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, fhs)).cuda().half()
-        w = torch.randn((ne, fhs, hs)).cuda().half()
-        def benchmark():
-            return torch.bmm(x, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::Fwd::DDD::NN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, fhs)).cuda().half()
-        w = torch.randn((ne, fhs, hs)).cuda().half()
-        out = torch.bmm(x, w)
-        w = torch.transpose(w, 1, 2)
-        def benchmark():
-            return torch.bmm(out, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::GradX::DDD::NT',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, fhs)).cuda().half()
-        w = torch.randn((ne, fhs, hs)).cuda().half()
-        out = torch.bmm(x, w)
-        x = torch.transpose(x, 1, 2)
-        def benchmark():
-            return torch.bmm(x, out)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::GradW::DDD::TN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * hs * 2,
-        )
 if __name__ == '__main__':

 from .. import stk
 import torch
+# from absl.testing import parameterized
 from .. import benchmark_util, ops
     print('=' * 60)
+# class MatmulBenchmark(parameterized.TestCase):
+#
+#     def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+#         blocking = 128
+#         padded_tokens, _ = x.size()
+#         assert padded_tokens % blocking == 0
+#         assert fhs % blocking == 0
+#
+#         # Offsets for the sparse matrix. All rows have the
+#         # same number of nonzero blocks dictated by the
+#         # dimensionality of a single expert.
+#         block_rows = padded_tokens // blocking
+#         blocks_per_row = fhs // blocking
+#         offsets = torch.arange(
+#             0,
+#             block_rows * blocks_per_row + 1,
+#             blocks_per_row,
+#             dtype=torch.int32,
+#             device=x.device,
+#         )
+#
+#         # Indices for the sparse matrix. The indices for
+#         # the intermediate matrix are dynamic depending
+#         # on the mapping of tokens to experts.
+#         column_indices = ops.topology(
+#             padded_bins,
+#             blocking,
+#             block_rows,
+#             blocks_per_row,
+#         )
+#         data = torch.empty(
+#             column_indices.numel(),
+#             blocking,
+#             blocking,
+#             dtype=torch.float16,
+#             device=x.device,
+#         )
+#         shape = (padded_tokens, fhs * ne)
+#         row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+#         return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+#     def build_input_matrix(self, sl, hs, ne):
+#         x = torch.randn((sl, hs)).cuda().half()
+#
+#         # Assign tokens to experts uniformly.
+#         top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+#         padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+#         padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#         out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+#         return out, padded_bins
+#
+#     def build_weight_matrix(self, ne, hs, fhs):
+#         return torch.randn((hs, ne * fhs)).cuda().half()
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#         w = transpose_view(w)
+#
+#         def benchmark():
+#             return stk.ops.sdd(x, w, topo)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0::Fwd::SDD::NT',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+#         def benchmark():
+#             return stk.ops.dsd(topo, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0::GradX::DSD::NN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#         topo = topo.t()
+#
+#         def benchmark():
+#             return stk.ops.dsd(topo, x)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0::GradW::DSD::TN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+#         def benchmark():
+#             return stk.ops.dsd(x, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::Fwd::DSD::NN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.nnz * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#         out = stk.ops.dsd(x, w)
+#         w = transpose_view(w)
+#
+#         def benchmark():
+#             return stk.ops.sdd(out, w, x)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::GradX::SDD::NT',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.nnz * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#         out = stk.ops.dsd(x, w)
+#         x = x.t()
+#
+#         def benchmark():
+#             return stk.ops.dsd(x, out)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::GradW::DSD::TN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.nnz * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, hs)).cuda().half()
+#         w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+#         w = w.transpose(1, 2).contiguous()
+#         w = w.transpose(1, 2)
+#
+#         def benchmark():
+#             return torch.bmm(x, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0::Fwd:DDD::NT',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, hs)).cuda().half()
+#         w = torch.randn((ne, hs, fhs)).cuda().half()
+#         out = torch.bmm(x, w)
+#         w = w.transpose(1, 2).contiguous()
+#
+#         def benchmark():
+#             return torch.bmm(out, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0:GradX:DDD::NN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, hs)).cuda().half()
+#         w = torch.randn((ne, hs, fhs)).cuda().half()
+#         out = torch.bmm(x, w)
+#         out = out.transpose(1, 2)
+#
+#         def benchmark():
+#             return torch.bmm(out, x)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0:GradW:DDD::TN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+#         w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+#         def benchmark():
+#             return torch.bmm(x, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::Fwd::DDD::NN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+#         w = torch.randn((ne, fhs, hs)).cuda().half()
+#         out = torch.bmm(x, w)
+#         w = torch.transpose(w, 1, 2)
+#
+#         def benchmark():
+#             return torch.bmm(out, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::GradX::DDD::NT',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+#         w = torch.randn((ne, fhs, hs)).cuda().half()
+#         out = torch.bmm(x, w)
+#         x = torch.transpose(x, 1, 2)
+#
+#         def benchmark():
+#             return torch.bmm(x, out)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::GradW::DDD::TN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * hs * 2,
+#         )
 if __name__ == '__main__':

build/torch210-cxx11-cu126-x86_64-linux/ops/padded_scatter_benchmark.py CHANGED Viewed

@@ -4,7 +4,7 @@
 import unittest
 import torch
-from absl.testing import parameterized
 from .. import benchmark_util, ops
@@ -16,50 +16,50 @@ _PADDED_SCATTER_BENCHMARK = (
 )
-class PaddedScatterTest(parameterized.TestCase):
-    @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
-    def testPaddedScatter(self, sl, hs, ne, top_k):
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        # Randomly assign tokens to experts.
-        top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(top_expert, ne)
-        padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
-        padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        # Sample weights for the scatter reduce.
-        weights = torch.rand((sl * top_k,)).cuda().half()
-        # Gather the data to prepare for backwards.
-        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
-        def benchmark():
-            return ops.padded_scatter(
-                x,
-                indices,
-                bin_ids,
-                weights,
-                bins,
-                padded_bins,
-                top_k,
-            )
-        time, std = benchmark_util.benchmark_function(benchmark)
-        benchmark_util.log_benchmark(
-            'Padded Scatter',
-            {
-                'sequence_length': sl,
-                'hidden_size': hs,
-                'num_experts': ne,
-                'top_k': top_k,
-            },
-            time,
-            std,
-        )
 if __name__ == '__main__':

 import unittest
 import torch
+# from absl.testing import parameterized
 from .. import benchmark_util, ops
 )
+# class PaddedScatterTest(parameterized.TestCase):
+#
+#     @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+#     def testPaddedScatter(self, sl, hs, ne, top_k):
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#
+#         # Randomly assign tokens to experts.
+#         top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+#         padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+#         padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+#         # Sample weights for the scatter reduce.
+#         weights = torch.rand((sl * top_k,)).cuda().half()
+#
+#         # Gather the data to prepare for backwards.
+#         x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+#         def benchmark():
+#             return ops.padded_scatter(
+#                 x,
+#                 indices,
+#                 bin_ids,
+#                 weights,
+#                 bins,
+#                 padded_bins,
+#                 top_k,
+#             )
+#
+#         time, std = benchmark_util.benchmark_function(benchmark)
+#         benchmark_util.log_benchmark(
+#             'Padded Scatter',
+#             {
+#                 'sequence_length': sl,
+#                 'hidden_size': hs,
+#                 'num_experts': ne,
+#                 'top_k': top_k,
+#             },
+#             time,
+#             std,
+#         )
 if __name__ == '__main__':

build/torch210-cxx11-cu126-x86_64-linux/ops/permute_benchmark.py CHANGED Viewed

@@ -4,7 +4,7 @@
 import unittest
 import torch
-from absl.testing import parameterized
 from .. import benchmark_util, ops
@@ -26,123 +26,123 @@ _PERMUTE_TESTS = (
 )
-class PermuteBenchmark(parameterized.TestCase):
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testBinnedGather(self, sl, hs, ne):
-        # NOTE: Capacity factor == 1.
-        ec = sl // ne
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        top_expert = torch.randint(0, ne, (sl,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(indices, ne)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        def benchmark():
-            return ops.binned_gather(x, indices, bins, ec)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testBinnedScatter(self, sl, hs, ne):
-        # NOTE: Capacity factor == 1.
-        ec = sl // ne
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        top_expert = torch.randint(0, ne, (sl,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(indices, ne)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        x = ops.binned_gather(x, indices, bins, ec)
-        def benchmark():
-            return ops.binned_scatter(x, indices, bins)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testPaddedGather(self, sl, hs, ne):
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        # Randomly assign tokens to experts.
-        top_expert = torch.randint(0, ne, (sl,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(top_expert, ne)
-        padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
-        padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        def benchmark():
-            return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testPaddedScatter(self, sl, hs, ne):
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        # Randomly assign tokens to experts.
-        top_expert = torch.randint(0, ne, (sl,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(top_expert, ne)
-        padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
-        padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
-        def benchmark():
-            return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testCopy(self, sl, hs, ne):
-        # NOTE: Capacity factor == 1.
-        # ec = sl // ne
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        y = x.clone()
-        def benchmark():
-            return y.copy_(x)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
 if __name__ == '__main__':

 import unittest
 import torch
+# from absl.testing import parameterized
 from .. import benchmark_util, ops
 )
+# class PermuteBenchmark(parameterized.TestCase):
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testBinnedGather(self, sl, hs, ne):
+#         # NOTE: Capacity factor == 1.
+#         ec = sl // ne
+#
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#         top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(indices, ne)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+#         def benchmark():
+#             return ops.binned_gather(x, indices, bins, ec)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testBinnedScatter(self, sl, hs, ne):
+#         # NOTE: Capacity factor == 1.
+#         ec = sl // ne
+#
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#         top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(indices, ne)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#         x = ops.binned_gather(x, indices, bins, ec)
+#
+#         def benchmark():
+#             return ops.binned_scatter(x, indices, bins)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testPaddedGather(self, sl, hs, ne):
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#
+#         # Randomly assign tokens to experts.
+#         top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+#         padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+#         padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+#         def benchmark():
+#             return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testPaddedScatter(self, sl, hs, ne):
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#
+#         # Randomly assign tokens to experts.
+#         top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+#         padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+#         padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#         x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+#         def benchmark():
+#             return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testCopy(self, sl, hs, ne):
+#         # NOTE: Capacity factor == 1.
+#         # ec = sl // ne
+#
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#         y = x.clone()
+#
+#         def benchmark():
+#             return y.copy_(x)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
 if __name__ == '__main__':

build/torch210-cxx11-cu126-x86_64-linux/ops/sort_benchmark.py CHANGED Viewed

@@ -5,7 +5,7 @@ import unittest
 import numpy as np
 import torch
-from absl.testing import parameterized
 from .. import ops
@@ -53,32 +53,32 @@ def log_benchmark(arguments, mean_t, std_t):
     print('=' * 60)
-class SortBenchmark(parameterized.TestCase):
-    @parameterized.parameters(*_SORT_TESTS)
-    def testSort(self, n, dtype, max_val):
-        if max_val is None:
-            max_val = np.iinfo(numpy_dtype(dtype)).max
-        end_bit = int(np.ceil(np.log2(max_val)))
-        x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
-        mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
-        arguments = {
-            'n': n,
-            'dtype': dtype,
-            'max_val': max_val,
-        }
-        log_benchmark(arguments, mean_t, std_t)
-    @parameterized.parameters(*_BASELINE_SORT_TESTS)
-    def testTorchSort(self, n):
-        x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
-        mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
-        arguments = {
-            'n': n,
-        }
-        log_benchmark(arguments, mean_t, std_t)
 if __name__ == '__main__':

 import numpy as np
 import torch
+# from absl.testing import parameterized
 from .. import ops
     print('=' * 60)
+# class SortBenchmark(parameterized.TestCase):
+#
+#     @parameterized.parameters(*_SORT_TESTS)
+#     def testSort(self, n, dtype, max_val):
+#         if max_val is None:
+#             max_val = np.iinfo(numpy_dtype(dtype)).max
+#         end_bit = int(np.ceil(np.log2(max_val)))
+#         x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+#         mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+#         arguments = {
+#             'n': n,
+#             'dtype': dtype,
+#             'max_val': max_val,
+#         }
+#         log_benchmark(arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_BASELINE_SORT_TESTS)
+#     def testTorchSort(self, n):
+#         x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+#         mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+#         arguments = {
+#             'n': n,
+#         }
+#         log_benchmark(arguments, mean_t, std_t)
 if __name__ == '__main__':

build/torch210-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops_test.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import unittest
 import itertools
 import torch
-from absl.testing import parameterized
 import stk
 from stk.ops.linear_ops_test import allclose, _dense_and_sparse
@@ -47,40 +47,40 @@ def _dense_and_sparse_like(x, std=0.1):
     return (dense.requires_grad_(True),
             sparse.requires_grad_(True))
-@parameterized.parameters(_ELTWISE_OP_TESTS)
-class EltwiseOpsTest(parameterized.TestCase):
-    def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
-        a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
-        b_dense, b = _dense_and_sparse_like(a)
-        out = stk.ops.mul(a, b)
-        expected_out = torch.mul(a_dense, b_dense)
-        # Compute the gradients w.r.t. the inputs.
-        expected_out.sum().backward()
-        stk.ops.sum(out).backward()
-        # Validate the results.
-        out = stk.ops.to_dense(out)
-        self.assertEqual(out.dim(), 2)
-        self.assertEqual(expected_out.size(), out.size())
-        self.assertTrue(allclose(out, expected_out))
-        # LHS gradient.
-        grad = stk.ops.to_dense(a.grad)
-        expected_grad = a_dense.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size(), grad.size())
-        self.assertTrue(allclose(grad, expected_grad))
-        # RHS gradient.
-        grad =  stk.ops.to_dense(b.grad)
-        expected_grad = b_dense.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size(), grad.size())
-        self.assertTrue(allclose(grad, expected_grad))
 if __name__ == '__main__':
     unittest.main()

 import unittest
 import itertools
 import torch
+# from absl.testing import parameterized
 import stk
 from stk.ops.linear_ops_test import allclose, _dense_and_sparse
     return (dense.requires_grad_(True),
             sparse.requires_grad_(True))
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+#     def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+#         a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#         b_dense, b = _dense_and_sparse_like(a)
+#
+#         out = stk.ops.mul(a, b)
+#         expected_out = torch.mul(a_dense, b_dense)
+#
+#         # Compute the gradients w.r.t. the inputs.
+#         expected_out.sum().backward()
+#         stk.ops.sum(out).backward()
+#
+#         # Validate the results.
+#         out = stk.ops.to_dense(out)
+#         self.assertEqual(out.dim(), 2)
+#         self.assertEqual(expected_out.size(), out.size())
+#         self.assertTrue(allclose(out, expected_out))
+#
+#         # LHS gradient.
+#         grad = stk.ops.to_dense(a.grad)
+#         expected_grad = a_dense.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size(), grad.size())
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#         # RHS gradient.
+#         grad =  stk.ops.to_dense(b.grad)
+#         expected_grad = b_dense.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size(), grad.size())
+#         self.assertTrue(allclose(grad, expected_grad))
 if __name__ == '__main__':
     unittest.main()

build/torch210-cxx11-cu126-x86_64-linux/stk/ops/linear_ops_test.py CHANGED Viewed

@@ -2,7 +2,7 @@ import unittest
 import itertools
 import numpy as np
 import torch
-from absl.testing import parameterized
 import stk
@@ -96,121 +96,121 @@ def _mask(x, mask):
     return x * mask
-@parameterized.parameters(*_LINEAR_OP_TESTS)
-class LinearOpsTest(parameterized.TestCase):
-    def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
-        # Construct the operands.
-        a_shape = (k, m) if trans_a else (m, k)
-        a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
-        b_shape = (n, k) if trans_b else (k, n)
-        b, bcp = _dense_2x(*b_shape, dtype)
-        # Execute the matmul.
-        out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
-        expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
-        # Compute the gradients w.r.t. the inputs.
-        expected_out.sum().backward()
-        out.sum().backward()
-        # Validate the results.
-        self.assertEqual(out.dim(), 2)
-        self.assertEqual(expected_out.size()[0], out.size()[0])
-        self.assertEqual(expected_out.size()[1], out.size()[1])
-        self.assertTrue(allclose(out, expected_out))
-        # LHS gradient.
-        grad = stk.ops.to_dense(a.grad)
-        expected_grad = _mask(a_dense.grad, a.grad)
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-        # RHS gradient.
-        grad = b.grad
-        expected_grad = bcp.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-    def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
-        # Construct the operands.
-        a_shape = (k, m) if trans_a else (m, k)
-        a, acp = _dense_2x(*a_shape, dtype)
-        b_shape = (n, k) if trans_b else (k, n)
-        b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
-        # Execute the matmul.
-        out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
-        expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
-        # Compute the gradients w.r.t. the inputs.
-        expected_out.sum().backward()
-        out.sum().backward()
-        # Validate the results.
-        self.assertEqual(out.dim(), 2)
-        self.assertEqual(expected_out.size()[0], out.size()[0])
-        self.assertEqual(expected_out.size()[1], out.size()[1])
-        self.assertTrue(allclose(out, expected_out))
-        # LHS gradient.
-        grad = a.grad
-        expected_grad = acp.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-        # RHS gradient.
-        grad = stk.ops.to_dense(b.grad)
-        expected_grad = _mask(b_dense.grad, b.grad)
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-    def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
-        # Construct the operands.
-        a_shape = (k, m) if trans_a else (m, k)
-        a, acp = _dense_2x(*a_shape, dtype)
-        b_shape = (n, k) if trans_b else (k, n)
-        b, bcp = _dense_2x(*b_shape, dtype)
-        _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
-        # Execute the matmul.
-        out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
-        expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
-        # Compute the gradients w.r.t. the inputs.
-        expected_out.sum().backward()
-        stk.ops.sum(out).backward()
-        # Validate the results.
-        out = stk.ops.to_dense(out)
-        self.assertEqual(out.dim(), 2)
-        self.assertEqual(expected_out.size()[0], out.size()[0])
-        self.assertEqual(expected_out.size()[1], out.size()[1])
-        self.assertTrue(allclose(out, expected_out))
-        # LHS gradient.
-        grad = a.grad
-        expected_grad = acp.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-        # RHS gradient.
-        grad = b.grad
-        expected_grad = bcp.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
 if __name__ == '__main__':
     unittest.main()

 import itertools
 import numpy as np
 import torch
+# from absl.testing import parameterized
 import stk
     return x * mask
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+#     def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+#         # Construct the operands.
+#         a_shape = (k, m) if trans_a else (m, k)
+#         a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+#         b_shape = (n, k) if trans_b else (k, n)
+#         b, bcp = _dense_2x(*b_shape, dtype)
+#
+#         # Execute the matmul.
+#         out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+#         expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+#         # Compute the gradients w.r.t. the inputs.
+#         expected_out.sum().backward()
+#         out.sum().backward()
+#
+#         # Validate the results.
+#         self.assertEqual(out.dim(), 2)
+#         self.assertEqual(expected_out.size()[0], out.size()[0])
+#         self.assertEqual(expected_out.size()[1], out.size()[1])
+#         self.assertTrue(allclose(out, expected_out))
+#
+#         # LHS gradient.
+#         grad = stk.ops.to_dense(a.grad)
+#         expected_grad = _mask(a_dense.grad, a.grad)
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#         # RHS gradient.
+#         grad = b.grad
+#         expected_grad = bcp.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#     def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+#         # Construct the operands.
+#         a_shape = (k, m) if trans_a else (m, k)
+#         a, acp = _dense_2x(*a_shape, dtype)
+#         b_shape = (n, k) if trans_b else (k, n)
+#         b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+#         # Execute the matmul.
+#         out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+#         expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+#         # Compute the gradients w.r.t. the inputs.
+#         expected_out.sum().backward()
+#         out.sum().backward()
+#
+#         # Validate the results.
+#         self.assertEqual(out.dim(), 2)
+#         self.assertEqual(expected_out.size()[0], out.size()[0])
+#         self.assertEqual(expected_out.size()[1], out.size()[1])
+#         self.assertTrue(allclose(out, expected_out))
+#
+#         # LHS gradient.
+#         grad = a.grad
+#         expected_grad = acp.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#         # RHS gradient.
+#         grad = stk.ops.to_dense(b.grad)
+#         expected_grad = _mask(b_dense.grad, b.grad)
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#     def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+#         # Construct the operands.
+#         a_shape = (k, m) if trans_a else (m, k)
+#         a, acp = _dense_2x(*a_shape, dtype)
+#         b_shape = (n, k) if trans_b else (k, n)
+#         b, bcp = _dense_2x(*b_shape, dtype)
+#         _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+#         # Execute the matmul.
+#         out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+#         expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+#         # Compute the gradients w.r.t. the inputs.
+#         expected_out.sum().backward()
+#         stk.ops.sum(out).backward()
+#
+#         # Validate the results.
+#         out = stk.ops.to_dense(out)
+#         self.assertEqual(out.dim(), 2)
+#         self.assertEqual(expected_out.size()[0], out.size()[0])
+#         self.assertEqual(expected_out.size()[1], out.size()[1])
+#         self.assertTrue(allclose(out, expected_out))
+#
+#         # LHS gradient.
+#         grad = a.grad
+#         expected_grad = acp.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#         # RHS gradient.
+#         grad = b.grad
+#         expected_grad = bcp.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
 if __name__ == '__main__':
     unittest.main()

build/torch210-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops_test.py CHANGED Viewed

@@ -1,61 +1,61 @@
 import unittest
-from absl.testing import parameterized
 import stk
 import torch
-@parameterized.parameters(
-    (8, 16, 0.0, 1),
-    (8, 16, 0.5, 1),
-    (8, 16, .95, 1),
-    (16, 8, 0.0, 1),
-    (16, 8, 0.5, 1),
-    (16, 8, .95, 1),
-    (8, 16, 0.0, 8),
-    (8, 16, 0.5, 8),
-    (8, 16, 1.0, 8),
-    (16, 8, 0.0, 8),
-    (16, 8, 0.5, 8),
-    (16, 8, 1.0, 8),
-    (128, 256, 0.5, 16),
-    (256, 128, 0.75, 32),
-    (512, 512, .875, 128))
-class MatrixOpsTest(parameterized.TestCase):
-    def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
-        mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
-        x = (torch.randn(rows, cols) * mask).type(torch.float16)
-        # Convert the matrix to sparse format.
-        sparse_x = stk.ops.to_sparse(x, blocking)
-        # Validate the matrix.
-        sparse_x.validate()
-        # Validate the shape.
-        self.assertEqual(sparse_x.dim(), 2)
-        self.assertEqual(sparse_x.size()[0], rows)
-        self.assertEqual(sparse_x.size()[1], cols)
-        # Validate the sparsity.
-        numblocks = rows // blocking * cols // blocking
-        nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
-        self.assertEqual(sparse_x.nnz, nnz)
-        # Convert back to dense format.
-        dense_x = stk.ops.to_dense(sparse_x)
-        # Validate the shape.
-        self.assertEqual(dense_x.dim(), 2)
-        self.assertEqual(dense_x.size()[0], rows)
-        self.assertEqual(dense_x.size()[1], cols)
-        # Validate the sparsity
-        self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
-        # Validate the output.
-        self.assertTrue(torch.all(torch.eq(x, dense_x)))
 if __name__ == '__main__':

 import unittest
+# from absl.testing import parameterized
 import stk
 import torch
+# @parameterized.parameters(
+#     (8, 16, 0.0, 1),
+#     (8, 16, 0.5, 1),
+#     (8, 16, .95, 1),
+#     (16, 8, 0.0, 1),
+#     (16, 8, 0.5, 1),
+#     (16, 8, .95, 1),
+#     (8, 16, 0.0, 8),
+#     (8, 16, 0.5, 8),
+#     (8, 16, 1.0, 8),
+#     (16, 8, 0.0, 8),
+#     (16, 8, 0.5, 8),
+#     (16, 8, 1.0, 8),
+#     (128, 256, 0.5, 16),
+#     (256, 128, 0.75, 32),
+#     (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+#     def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+#         mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+#         x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+#         # Convert the matrix to sparse format.
+#         sparse_x = stk.ops.to_sparse(x, blocking)
+#
+#         # Validate the matrix.
+#         sparse_x.validate()
+#
+#         # Validate the shape.
+#         self.assertEqual(sparse_x.dim(), 2)
+#         self.assertEqual(sparse_x.size()[0], rows)
+#         self.assertEqual(sparse_x.size()[1], cols)
+#
+#         # Validate the sparsity.
+#         numblocks = rows // blocking * cols // blocking
+#         nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+#         self.assertEqual(sparse_x.nnz, nnz)
+#
+#         # Convert back to dense format.
+#         dense_x = stk.ops.to_dense(sparse_x)
+#
+#         # Validate the shape.
+#         self.assertEqual(dense_x.dim(), 2)
+#         self.assertEqual(dense_x.size()[0], rows)
+#         self.assertEqual(dense_x.size()[1], cols)
+#
+#         # Validate the sparsity
+#         self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+#         # Validate the output.
+#         self.assertTrue(torch.all(torch.eq(x, dense_x)))
 if __name__ == '__main__':

build/torch210-cxx11-cu126-x86_64-linux/stk/random/random_ops_test.py CHANGED Viewed

@@ -1,72 +1,72 @@
 import unittest
-from absl.testing import parameterized
 from . import random
 import torch
-@parameterized.parameters(
-    (8, 16, 0.0, 1),
-    (8, 16, 0.5, 1),
-    (8, 16, .95, 1),
-    (16, 8, 0.0, 1),
-    (16, 8, 0.5, 1),
-    (16, 8, .95, 1),
-    (8, 16, 0.0, 8),
-    (8, 16, 0.5, 8),
-    (8, 16, 1.0, 8),
-    (16, 8, 0.0, 8),
-    (16, 8, 0.5, 8),
-    (16, 8, 1.0, 8),
-    (128, 256, 0.5, 16),
-    (256, 128, 0.75, 32),
-    (512, 512, .875, 128))
-class RandomOpsTest(parameterized.TestCase):
-    def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
-        mask = random.dense_mask(
-            rows, cols, sparsity, blocking)
-        # Validate the shape.
-        self.assertEqual(mask.dim(), 2)
-        self.assertEqual(mask.size()[0], rows)
-        self.assertEqual(mask.size()[1], cols)
-        # Validate the sparsity
-        numblocks = rows // blocking * cols // blocking
-        nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
-        self.assertEqual(
-            torch.count_nonzero(mask).item(),
-            nnz)
-        # Check values are zero or one.
-        self.assertTrue(
-            torch.all(torch.logical_or(
-                torch.eq(mask, 0),
-                torch.eq(mask, 1))))
-    def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
-        mask = random.mask(
-            rows, cols, sparsity, blocking)
-        # Validate the matrix.
-        mask.validate()
-        # Validate the shape.
-        self.assertEqual(mask.dim(), 2)
-        self.assertEqual(mask.size()[0], rows)
-        self.assertEqual(mask.size()[1], cols)
-        # Validate the sparsity.
-        numblocks = rows // blocking * cols // blocking
-        nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
-        self.assertEqual(mask.nnz, nnz)
-        # Check values are zero or one.
-        self.assertTrue(
-            torch.all(torch.logical_or(
-                torch.eq(mask.data, 0),
-                torch.eq(mask.data, 1))))
 if __name__ == '__main__':

 import unittest
+# from absl.testing import parameterized
 from . import random
 import torch
+# @parameterized.parameters(
+#     (8, 16, 0.0, 1),
+#     (8, 16, 0.5, 1),
+#     (8, 16, .95, 1),
+#     (16, 8, 0.0, 1),
+#     (16, 8, 0.5, 1),
+#     (16, 8, .95, 1),
+#     (8, 16, 0.0, 8),
+#     (8, 16, 0.5, 8),
+#     (8, 16, 1.0, 8),
+#     (16, 8, 0.0, 8),
+#     (16, 8, 0.5, 8),
+#     (16, 8, 1.0, 8),
+#     (128, 256, 0.5, 16),
+#     (256, 128, 0.75, 32),
+#     (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+#     def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+#         mask = random.dense_mask(
+#             rows, cols, sparsity, blocking)
+#
+#         # Validate the shape.
+#         self.assertEqual(mask.dim(), 2)
+#         self.assertEqual(mask.size()[0], rows)
+#         self.assertEqual(mask.size()[1], cols)
+#
+#         # Validate the sparsity
+#         numblocks = rows // blocking * cols // blocking
+#         nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+#         self.assertEqual(
+#             torch.count_nonzero(mask).item(),
+#             nnz)
+#
+#         # Check values are zero or one.
+#         self.assertTrue(
+#             torch.all(torch.logical_or(
+#                 torch.eq(mask, 0),
+#                 torch.eq(mask, 1))))
+#
+#     def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+#         mask = random.mask(
+#             rows, cols, sparsity, blocking)
+#
+#         # Validate the matrix.
+#         mask.validate()
+#
+#         # Validate the shape.
+#         self.assertEqual(mask.dim(), 2)
+#         self.assertEqual(mask.size()[0], rows)
+#         self.assertEqual(mask.size()[1], cols)
+#
+#         # Validate the sparsity.
+#         numblocks = rows // blocking * cols // blocking
+#         nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+#         self.assertEqual(mask.nnz, nnz)
+#
+#         # Check values are zero or one.
+#         self.assertTrue(
+#             torch.all(torch.logical_or(
+#                 torch.eq(mask.data, 0),
+#                 torch.eq(mask.data, 1))))
 if __name__ == '__main__':

build/torch210-cxx11-cu128-x86_64-linux/{_megablocks_cuda_6e04dec.abi3.so → _megablocks_cuda_a45325d.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6e66fd44576448dc82e7392db0c935cd8654bfcb51db51ddc044e1c33bc82c60
 size 21009984

 version https://git-lfs.github.com/spec/v1
+oid sha256:e8b110fed233d0db0bef3df539cab1487191a578b89bae5b3fba3f39262f827f
 size 21009984

build/torch210-cxx11-cu128-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cuda_6e04dec
-ops = torch.ops._megablocks_cuda_6e04dec
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cuda_6e04dec::{op_name}"

 import torch
+from . import _megablocks_cuda_a45325d
+ops = torch.ops._megablocks_cuda_a45325d
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_cuda_a45325d::{op_name}"

build/torch210-cxx11-cu128-x86_64-linux/megablocks/__init__.py CHANGED Viewed

@@ -1,10 +1,10 @@
 import ctypes
 import sys
-import importlib
 from pathlib import Path
 from types import ModuleType
 def _import_from_path(file_path: Path) -> ModuleType:
     # We cannot use the module name as-is, after adding it to `sys.modules`,
     # it would also be used for other imports. So, we make a module name that

 import ctypes
+import importlib.util
 import sys
 from pathlib import Path
 from types import ModuleType
 def _import_from_path(file_path: Path) -> ModuleType:
     # We cannot use the module name as-is, after adding it to `sys.modules`,
     # it would also be used for other imports. So, we make a module name that

build/torch210-cxx11-cu128-x86_64-linux/ops/histogram_benchmark.py CHANGED Viewed

@@ -5,7 +5,7 @@ import unittest
 import numpy as np
 import torch
-from absl.testing import parameterized
 from .. import ops
@@ -47,31 +47,31 @@ def log_benchmark(arguments, mean_t, std_t):
     print('=' * 60)
-class HistogramBenchmark(parameterized.TestCase):
-    @parameterized.parameters(*_HISTOGRAM_TESTS)
-    def testHistogram(self, n, dtype, max_val):
-        x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
-        mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
-        arguments = {
-            'n': n,
-            'dtype': dtype,
-            'max_val': max_val,
-        }
-        log_benchmark(arguments, mean_t, std_t)
-    @parameterized.parameters(*_HISTOGRAM_TESTS)
-    def testTorchHistogram(self, n, dtype, max_val):
-        x = torch.randint(0, 128, (n,)).cuda().to(dtype)
-        mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
-        arguments = {
-            'n': n,
-            'dtype': dtype,
-            'max_val': max_val,
-        }
-        log_benchmark(arguments, mean_t, std_t)
 if __name__ == '__main__':

 import numpy as np
 import torch
+# from absl.testing import parameterized
 from .. import ops
     print('=' * 60)
+# class HistogramBenchmark(parameterized.TestCase):
+#
+#     @parameterized.parameters(*_HISTOGRAM_TESTS)
+#     def testHistogram(self, n, dtype, max_val):
+#         x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+#         mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+#         arguments = {
+#             'n': n,
+#             'dtype': dtype,
+#             'max_val': max_val,
+#         }
+#         log_benchmark(arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_HISTOGRAM_TESTS)
+#     def testTorchHistogram(self, n, dtype, max_val):
+#         x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+#         mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+#         arguments = {
+#             'n': n,
+#             'dtype': dtype,
+#             'max_val': max_val,
+#         }
+#         log_benchmark(arguments, mean_t, std_t)
 if __name__ == '__main__':

build/torch210-cxx11-cu128-x86_64-linux/ops/matmul_benchmark.py CHANGED Viewed

@@ -17,7 +17,7 @@ import unittest
 from .. import stk
 import torch
-from absl.testing import parameterized
 from .. import benchmark_util, ops
@@ -48,367 +48,367 @@ def log_benchmark(name, arguments, time, std, flops):
     print('=' * 60)
-class MatmulBenchmark(parameterized.TestCase):
-    def build_sparse_matrix(self, x, padded_bins, fhs, ne):
-        blocking = 128
-        padded_tokens, _ = x.size()
-        assert padded_tokens % blocking == 0
-        assert fhs % blocking == 0
-        # Offsets for the sparse matrix. All rows have the
-        # same number of nonzero blocks dictated by the
-        # dimensionality of a single expert.
-        block_rows = padded_tokens // blocking
-        blocks_per_row = fhs // blocking
-        offsets = torch.arange(
-            0,
-            block_rows * blocks_per_row + 1,
-            blocks_per_row,
-            dtype=torch.int32,
-            device=x.device,
-        )
-        # Indices for the sparse matrix. The indices for
-        # the intermediate matrix are dynamic depending
-        # on the mapping of tokens to experts.
-        column_indices = ops.topology(
-            padded_bins,
-            blocking,
-            block_rows,
-            blocks_per_row,
-        )
-        data = torch.empty(
-            column_indices.numel(),
-            blocking,
-            blocking,
-            dtype=torch.float16,
-            device=x.device,
-        )
-        shape = (padded_tokens, fhs * ne)
-        row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
-        return stk.Matrix(shape, data, row_indices, column_indices, offsets)
-    def build_input_matrix(self, sl, hs, ne):
-        x = torch.randn((sl, hs)).cuda().half()
-        # Assign tokens to experts uniformly.
-        top_expert = torch.arange(0, sl).cuda().int() % ne
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(top_expert, ne)
-        padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
-        padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
-        return out, padded_bins
-    def build_weight_matrix(self, ne, hs, fhs):
-        return torch.randn((hs, ne * fhs)).cuda().half()
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        w = transpose_view(w)
-        def benchmark():
-            return stk.ops.sdd(x, w, topo)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0::Fwd::SDD::NT',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        def benchmark():
-            return stk.ops.dsd(topo, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0::GradX::DSD::NN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        topo = topo.t()
-        def benchmark():
-            return stk.ops.dsd(topo, x)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0::GradW::DSD::TN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        def benchmark():
-            return stk.ops.dsd(x, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::Fwd::DSD::NN',
-            arguments,
-            mean_t,
-            std_t,
-            x.nnz * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        out = stk.ops.dsd(x, w)
-        w = transpose_view(w)
-        def benchmark():
-            return stk.ops.sdd(out, w, x)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::GradX::SDD::NT',
-            arguments,
-            mean_t,
-            std_t,
-            x.nnz * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        out = stk.ops.dsd(x, w)
-        x = x.t()
-        def benchmark():
-            return stk.ops.dsd(x, out)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::GradW::DSD::TN',
-            arguments,
-            mean_t,
-            std_t,
-            x.nnz * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, hs)).cuda().half()
-        w = torch.randn((ne, hs, fhs)).cuda().half()
-        w = w.transpose(1, 2).contiguous()
-        w = w.transpose(1, 2)
-        def benchmark():
-            return torch.bmm(x, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0::Fwd:DDD::NT',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, hs)).cuda().half()
-        w = torch.randn((ne, hs, fhs)).cuda().half()
-        out = torch.bmm(x, w)
-        w = w.transpose(1, 2).contiguous()
-        def benchmark():
-            return torch.bmm(out, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0:GradX:DDD::NN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, hs)).cuda().half()
-        w = torch.randn((ne, hs, fhs)).cuda().half()
-        out = torch.bmm(x, w)
-        out = out.transpose(1, 2)
-        def benchmark():
-            return torch.bmm(out, x)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0:GradW:DDD::TN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, fhs)).cuda().half()
-        w = torch.randn((ne, fhs, hs)).cuda().half()
-        def benchmark():
-            return torch.bmm(x, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::Fwd::DDD::NN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, fhs)).cuda().half()
-        w = torch.randn((ne, fhs, hs)).cuda().half()
-        out = torch.bmm(x, w)
-        w = torch.transpose(w, 1, 2)
-        def benchmark():
-            return torch.bmm(out, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::GradX::DDD::NT',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, fhs)).cuda().half()
-        w = torch.randn((ne, fhs, hs)).cuda().half()
-        out = torch.bmm(x, w)
-        x = torch.transpose(x, 1, 2)
-        def benchmark():
-            return torch.bmm(x, out)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::GradW::DDD::TN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * hs * 2,
-        )
 if __name__ == '__main__':

 from .. import stk
 import torch
+# from absl.testing import parameterized
 from .. import benchmark_util, ops
     print('=' * 60)
+# class MatmulBenchmark(parameterized.TestCase):
+#
+#     def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+#         blocking = 128
+#         padded_tokens, _ = x.size()
+#         assert padded_tokens % blocking == 0
+#         assert fhs % blocking == 0
+#
+#         # Offsets for the sparse matrix. All rows have the
+#         # same number of nonzero blocks dictated by the
+#         # dimensionality of a single expert.
+#         block_rows = padded_tokens // blocking
+#         blocks_per_row = fhs // blocking
+#         offsets = torch.arange(
+#             0,
+#             block_rows * blocks_per_row + 1,
+#             blocks_per_row,
+#             dtype=torch.int32,
+#             device=x.device,
+#         )
+#
+#         # Indices for the sparse matrix. The indices for
+#         # the intermediate matrix are dynamic depending
+#         # on the mapping of tokens to experts.
+#         column_indices = ops.topology(
+#             padded_bins,
+#             blocking,
+#             block_rows,
+#             blocks_per_row,
+#         )
+#         data = torch.empty(
+#             column_indices.numel(),
+#             blocking,
+#             blocking,
+#             dtype=torch.float16,
+#             device=x.device,
+#         )
+#         shape = (padded_tokens, fhs * ne)
+#         row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+#         return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+#     def build_input_matrix(self, sl, hs, ne):
+#         x = torch.randn((sl, hs)).cuda().half()
+#
+#         # Assign tokens to experts uniformly.
+#         top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+#         padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+#         padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#         out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+#         return out, padded_bins
+#
+#     def build_weight_matrix(self, ne, hs, fhs):
+#         return torch.randn((hs, ne * fhs)).cuda().half()
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#         w = transpose_view(w)
+#
+#         def benchmark():
+#             return stk.ops.sdd(x, w, topo)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0::Fwd::SDD::NT',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+#         def benchmark():
+#             return stk.ops.dsd(topo, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0::GradX::DSD::NN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#         topo = topo.t()
+#
+#         def benchmark():
+#             return stk.ops.dsd(topo, x)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0::GradW::DSD::TN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+#         def benchmark():
+#             return stk.ops.dsd(x, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::Fwd::DSD::NN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.nnz * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#         out = stk.ops.dsd(x, w)
+#         w = transpose_view(w)
+#
+#         def benchmark():
+#             return stk.ops.sdd(out, w, x)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::GradX::SDD::NT',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.nnz * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#         out = stk.ops.dsd(x, w)
+#         x = x.t()
+#
+#         def benchmark():
+#             return stk.ops.dsd(x, out)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::GradW::DSD::TN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.nnz * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, hs)).cuda().half()
+#         w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+#         w = w.transpose(1, 2).contiguous()
+#         w = w.transpose(1, 2)
+#
+#         def benchmark():
+#             return torch.bmm(x, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0::Fwd:DDD::NT',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, hs)).cuda().half()
+#         w = torch.randn((ne, hs, fhs)).cuda().half()
+#         out = torch.bmm(x, w)
+#         w = w.transpose(1, 2).contiguous()
+#
+#         def benchmark():
+#             return torch.bmm(out, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0:GradX:DDD::NN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, hs)).cuda().half()
+#         w = torch.randn((ne, hs, fhs)).cuda().half()
+#         out = torch.bmm(x, w)
+#         out = out.transpose(1, 2)
+#
+#         def benchmark():
+#             return torch.bmm(out, x)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0:GradW:DDD::TN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+#         w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+#         def benchmark():
+#             return torch.bmm(x, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::Fwd::DDD::NN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+#         w = torch.randn((ne, fhs, hs)).cuda().half()
+#         out = torch.bmm(x, w)
+#         w = torch.transpose(w, 1, 2)
+#
+#         def benchmark():
+#             return torch.bmm(out, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::GradX::DDD::NT',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+#         w = torch.randn((ne, fhs, hs)).cuda().half()
+#         out = torch.bmm(x, w)
+#         x = torch.transpose(x, 1, 2)
+#
+#         def benchmark():
+#             return torch.bmm(x, out)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::GradW::DDD::TN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * hs * 2,
+#         )
 if __name__ == '__main__':

build/torch210-cxx11-cu128-x86_64-linux/ops/padded_scatter_benchmark.py CHANGED Viewed

@@ -4,7 +4,7 @@
 import unittest
 import torch
-from absl.testing import parameterized
 from .. import benchmark_util, ops
@@ -16,50 +16,50 @@ _PADDED_SCATTER_BENCHMARK = (
 )
-class PaddedScatterTest(parameterized.TestCase):
-    @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
-    def testPaddedScatter(self, sl, hs, ne, top_k):
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        # Randomly assign tokens to experts.
-        top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(top_expert, ne)
-        padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
-        padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        # Sample weights for the scatter reduce.
-        weights = torch.rand((sl * top_k,)).cuda().half()
-        # Gather the data to prepare for backwards.
-        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
-        def benchmark():
-            return ops.padded_scatter(
-                x,
-                indices,
-                bin_ids,
-                weights,
-                bins,
-                padded_bins,
-                top_k,
-            )
-        time, std = benchmark_util.benchmark_function(benchmark)
-        benchmark_util.log_benchmark(
-            'Padded Scatter',
-            {
-                'sequence_length': sl,
-                'hidden_size': hs,
-                'num_experts': ne,
-                'top_k': top_k,
-            },
-            time,
-            std,
-        )
 if __name__ == '__main__':

 import unittest
 import torch
+# from absl.testing import parameterized
 from .. import benchmark_util, ops
 )
+# class PaddedScatterTest(parameterized.TestCase):
+#
+#     @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+#     def testPaddedScatter(self, sl, hs, ne, top_k):
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#
+#         # Randomly assign tokens to experts.
+#         top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+#         padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+#         padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+#         # Sample weights for the scatter reduce.
+#         weights = torch.rand((sl * top_k,)).cuda().half()
+#
+#         # Gather the data to prepare for backwards.
+#         x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+#         def benchmark():
+#             return ops.padded_scatter(
+#                 x,
+#                 indices,
+#                 bin_ids,
+#                 weights,
+#                 bins,
+#                 padded_bins,
+#                 top_k,
+#             )
+#
+#         time, std = benchmark_util.benchmark_function(benchmark)
+#         benchmark_util.log_benchmark(
+#             'Padded Scatter',
+#             {
+#                 'sequence_length': sl,
+#                 'hidden_size': hs,
+#                 'num_experts': ne,
+#                 'top_k': top_k,
+#             },
+#             time,
+#             std,
+#         )
 if __name__ == '__main__':

build/torch210-cxx11-cu128-x86_64-linux/ops/permute_benchmark.py CHANGED Viewed

@@ -4,7 +4,7 @@
 import unittest
 import torch
-from absl.testing import parameterized
 from .. import benchmark_util, ops
@@ -26,123 +26,123 @@ _PERMUTE_TESTS = (
 )
-class PermuteBenchmark(parameterized.TestCase):
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testBinnedGather(self, sl, hs, ne):
-        # NOTE: Capacity factor == 1.
-        ec = sl // ne
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        top_expert = torch.randint(0, ne, (sl,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(indices, ne)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        def benchmark():
-            return ops.binned_gather(x, indices, bins, ec)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testBinnedScatter(self, sl, hs, ne):
-        # NOTE: Capacity factor == 1.
-        ec = sl // ne
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        top_expert = torch.randint(0, ne, (sl,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(indices, ne)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        x = ops.binned_gather(x, indices, bins, ec)
-        def benchmark():
-            return ops.binned_scatter(x, indices, bins)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testPaddedGather(self, sl, hs, ne):
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        # Randomly assign tokens to experts.
-        top_expert = torch.randint(0, ne, (sl,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(top_expert, ne)
-        padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
-        padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        def benchmark():
-            return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testPaddedScatter(self, sl, hs, ne):
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        # Randomly assign tokens to experts.
-        top_expert = torch.randint(0, ne, (sl,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(top_expert, ne)
-        padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
-        padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
-        def benchmark():
-            return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testCopy(self, sl, hs, ne):
-        # NOTE: Capacity factor == 1.
-        # ec = sl // ne
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        y = x.clone()
-        def benchmark():
-            return y.copy_(x)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
 if __name__ == '__main__':

 import unittest
 import torch
+# from absl.testing import parameterized
 from .. import benchmark_util, ops
 )
+# class PermuteBenchmark(parameterized.TestCase):
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testBinnedGather(self, sl, hs, ne):
+#         # NOTE: Capacity factor == 1.
+#         ec = sl // ne
+#
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#         top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(indices, ne)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+#         def benchmark():
+#             return ops.binned_gather(x, indices, bins, ec)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testBinnedScatter(self, sl, hs, ne):
+#         # NOTE: Capacity factor == 1.
+#         ec = sl // ne
+#
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#         top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(indices, ne)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#         x = ops.binned_gather(x, indices, bins, ec)
+#
+#         def benchmark():
+#             return ops.binned_scatter(x, indices, bins)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testPaddedGather(self, sl, hs, ne):
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#
+#         # Randomly assign tokens to experts.
+#         top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+#         padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+#         padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+#         def benchmark():
+#             return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testPaddedScatter(self, sl, hs, ne):
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#
+#         # Randomly assign tokens to experts.
+#         top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+#         padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+#         padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#         x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+#         def benchmark():
+#             return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testCopy(self, sl, hs, ne):
+#         # NOTE: Capacity factor == 1.
+#         # ec = sl // ne
+#
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#         y = x.clone()
+#
+#         def benchmark():
+#             return y.copy_(x)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
 if __name__ == '__main__':

build/torch210-cxx11-cu128-x86_64-linux/ops/sort_benchmark.py CHANGED Viewed

@@ -5,7 +5,7 @@ import unittest
 import numpy as np
 import torch
-from absl.testing import parameterized
 from .. import ops
@@ -53,32 +53,32 @@ def log_benchmark(arguments, mean_t, std_t):
     print('=' * 60)
-class SortBenchmark(parameterized.TestCase):
-    @parameterized.parameters(*_SORT_TESTS)
-    def testSort(self, n, dtype, max_val):
-        if max_val is None:
-            max_val = np.iinfo(numpy_dtype(dtype)).max
-        end_bit = int(np.ceil(np.log2(max_val)))
-        x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
-        mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
-        arguments = {
-            'n': n,
-            'dtype': dtype,
-            'max_val': max_val,
-        }
-        log_benchmark(arguments, mean_t, std_t)
-    @parameterized.parameters(*_BASELINE_SORT_TESTS)
-    def testTorchSort(self, n):
-        x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
-        mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
-        arguments = {
-            'n': n,
-        }
-        log_benchmark(arguments, mean_t, std_t)
 if __name__ == '__main__':

 import numpy as np
 import torch
+# from absl.testing import parameterized
 from .. import ops
     print('=' * 60)
+# class SortBenchmark(parameterized.TestCase):
+#
+#     @parameterized.parameters(*_SORT_TESTS)
+#     def testSort(self, n, dtype, max_val):
+#         if max_val is None:
+#             max_val = np.iinfo(numpy_dtype(dtype)).max
+#         end_bit = int(np.ceil(np.log2(max_val)))
+#         x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+#         mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+#         arguments = {
+#             'n': n,
+#             'dtype': dtype,
+#             'max_val': max_val,
+#         }
+#         log_benchmark(arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_BASELINE_SORT_TESTS)
+#     def testTorchSort(self, n):
+#         x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+#         mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+#         arguments = {
+#             'n': n,
+#         }
+#         log_benchmark(arguments, mean_t, std_t)
 if __name__ == '__main__':

build/torch210-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops_test.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import unittest
 import itertools
 import torch
-from absl.testing import parameterized
 import stk
 from stk.ops.linear_ops_test import allclose, _dense_and_sparse
@@ -47,40 +47,40 @@ def _dense_and_sparse_like(x, std=0.1):
     return (dense.requires_grad_(True),
             sparse.requires_grad_(True))
-@parameterized.parameters(_ELTWISE_OP_TESTS)
-class EltwiseOpsTest(parameterized.TestCase):
-    def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
-        a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
-        b_dense, b = _dense_and_sparse_like(a)
-        out = stk.ops.mul(a, b)
-        expected_out = torch.mul(a_dense, b_dense)
-        # Compute the gradients w.r.t. the inputs.
-        expected_out.sum().backward()
-        stk.ops.sum(out).backward()
-        # Validate the results.
-        out = stk.ops.to_dense(out)
-        self.assertEqual(out.dim(), 2)
-        self.assertEqual(expected_out.size(), out.size())
-        self.assertTrue(allclose(out, expected_out))
-        # LHS gradient.
-        grad = stk.ops.to_dense(a.grad)
-        expected_grad = a_dense.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size(), grad.size())
-        self.assertTrue(allclose(grad, expected_grad))
-        # RHS gradient.
-        grad =  stk.ops.to_dense(b.grad)
-        expected_grad = b_dense.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size(), grad.size())
-        self.assertTrue(allclose(grad, expected_grad))
 if __name__ == '__main__':
     unittest.main()

 import unittest
 import itertools
 import torch
+# from absl.testing import parameterized
 import stk
 from stk.ops.linear_ops_test import allclose, _dense_and_sparse
     return (dense.requires_grad_(True),
             sparse.requires_grad_(True))
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+#     def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+#         a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#         b_dense, b = _dense_and_sparse_like(a)
+#
+#         out = stk.ops.mul(a, b)
+#         expected_out = torch.mul(a_dense, b_dense)
+#
+#         # Compute the gradients w.r.t. the inputs.
+#         expected_out.sum().backward()
+#         stk.ops.sum(out).backward()
+#
+#         # Validate the results.
+#         out = stk.ops.to_dense(out)
+#         self.assertEqual(out.dim(), 2)
+#         self.assertEqual(expected_out.size(), out.size())
+#         self.assertTrue(allclose(out, expected_out))
+#
+#         # LHS gradient.
+#         grad = stk.ops.to_dense(a.grad)
+#         expected_grad = a_dense.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size(), grad.size())
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#         # RHS gradient.
+#         grad =  stk.ops.to_dense(b.grad)
+#         expected_grad = b_dense.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size(), grad.size())
+#         self.assertTrue(allclose(grad, expected_grad))
 if __name__ == '__main__':
     unittest.main()

build/torch210-cxx11-cu128-x86_64-linux/stk/ops/linear_ops_test.py CHANGED Viewed

@@ -2,7 +2,7 @@ import unittest
 import itertools
 import numpy as np
 import torch
-from absl.testing import parameterized
 import stk
@@ -96,121 +96,121 @@ def _mask(x, mask):
     return x * mask
-@parameterized.parameters(*_LINEAR_OP_TESTS)
-class LinearOpsTest(parameterized.TestCase):
-    def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
-        # Construct the operands.
-        a_shape = (k, m) if trans_a else (m, k)
-        a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
-        b_shape = (n, k) if trans_b else (k, n)
-        b, bcp = _dense_2x(*b_shape, dtype)
-        # Execute the matmul.
-        out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
-        expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
-        # Compute the gradients w.r.t. the inputs.
-        expected_out.sum().backward()
-        out.sum().backward()
-        # Validate the results.
-        self.assertEqual(out.dim(), 2)
-        self.assertEqual(expected_out.size()[0], out.size()[0])
-        self.assertEqual(expected_out.size()[1], out.size()[1])
-        self.assertTrue(allclose(out, expected_out))
-        # LHS gradient.
-        grad = stk.ops.to_dense(a.grad)
-        expected_grad = _mask(a_dense.grad, a.grad)
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-        # RHS gradient.
-        grad = b.grad
-        expected_grad = bcp.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-    def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
-        # Construct the operands.
-        a_shape = (k, m) if trans_a else (m, k)
-        a, acp = _dense_2x(*a_shape, dtype)
-        b_shape = (n, k) if trans_b else (k, n)
-        b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
-        # Execute the matmul.
-        out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
-        expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
-        # Compute the gradients w.r.t. the inputs.
-        expected_out.sum().backward()
-        out.sum().backward()
-        # Validate the results.
-        self.assertEqual(out.dim(), 2)
-        self.assertEqual(expected_out.size()[0], out.size()[0])
-        self.assertEqual(expected_out.size()[1], out.size()[1])
-        self.assertTrue(allclose(out, expected_out))
-        # LHS gradient.
-        grad = a.grad
-        expected_grad = acp.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-        # RHS gradient.
-        grad = stk.ops.to_dense(b.grad)
-        expected_grad = _mask(b_dense.grad, b.grad)
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-    def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
-        # Construct the operands.
-        a_shape = (k, m) if trans_a else (m, k)
-        a, acp = _dense_2x(*a_shape, dtype)
-        b_shape = (n, k) if trans_b else (k, n)
-        b, bcp = _dense_2x(*b_shape, dtype)
-        _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
-        # Execute the matmul.
-        out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
-        expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
-        # Compute the gradients w.r.t. the inputs.
-        expected_out.sum().backward()
-        stk.ops.sum(out).backward()
-        # Validate the results.
-        out = stk.ops.to_dense(out)
-        self.assertEqual(out.dim(), 2)
-        self.assertEqual(expected_out.size()[0], out.size()[0])
-        self.assertEqual(expected_out.size()[1], out.size()[1])
-        self.assertTrue(allclose(out, expected_out))
-        # LHS gradient.
-        grad = a.grad
-        expected_grad = acp.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-        # RHS gradient.
-        grad = b.grad
-        expected_grad = bcp.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
 if __name__ == '__main__':
     unittest.main()

 import itertools
 import numpy as np
 import torch
+# from absl.testing import parameterized
 import stk
     return x * mask
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+#     def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+#         # Construct the operands.
+#         a_shape = (k, m) if trans_a else (m, k)
+#         a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+#         b_shape = (n, k) if trans_b else (k, n)
+#         b, bcp = _dense_2x(*b_shape, dtype)
+#
+#         # Execute the matmul.
+#         out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+#         expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+#         # Compute the gradients w.r.t. the inputs.
+#         expected_out.sum().backward()
+#         out.sum().backward()
+#
+#         # Validate the results.
+#         self.assertEqual(out.dim(), 2)
+#         self.assertEqual(expected_out.size()[0], out.size()[0])
+#         self.assertEqual(expected_out.size()[1], out.size()[1])
+#         self.assertTrue(allclose(out, expected_out))
+#
+#         # LHS gradient.
+#         grad = stk.ops.to_dense(a.grad)
+#         expected_grad = _mask(a_dense.grad, a.grad)
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#         # RHS gradient.
+#         grad = b.grad
+#         expected_grad = bcp.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#     def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+#         # Construct the operands.
+#         a_shape = (k, m) if trans_a else (m, k)
+#         a, acp = _dense_2x(*a_shape, dtype)
+#         b_shape = (n, k) if trans_b else (k, n)
+#         b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+#         # Execute the matmul.
+#         out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+#         expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+#         # Compute the gradients w.r.t. the inputs.
+#         expected_out.sum().backward()
+#         out.sum().backward()
+#
+#         # Validate the results.
+#         self.assertEqual(out.dim(), 2)
+#         self.assertEqual(expected_out.size()[0], out.size()[0])
+#         self.assertEqual(expected_out.size()[1], out.size()[1])
+#         self.assertTrue(allclose(out, expected_out))
+#
+#         # LHS gradient.
+#         grad = a.grad
+#         expected_grad = acp.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#         # RHS gradient.
+#         grad = stk.ops.to_dense(b.grad)
+#         expected_grad = _mask(b_dense.grad, b.grad)
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#     def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+#         # Construct the operands.
+#         a_shape = (k, m) if trans_a else (m, k)
+#         a, acp = _dense_2x(*a_shape, dtype)
+#         b_shape = (n, k) if trans_b else (k, n)
+#         b, bcp = _dense_2x(*b_shape, dtype)
+#         _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+#         # Execute the matmul.
+#         out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+#         expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+#         # Compute the gradients w.r.t. the inputs.
+#         expected_out.sum().backward()
+#         stk.ops.sum(out).backward()
+#
+#         # Validate the results.
+#         out = stk.ops.to_dense(out)
+#         self.assertEqual(out.dim(), 2)
+#         self.assertEqual(expected_out.size()[0], out.size()[0])
+#         self.assertEqual(expected_out.size()[1], out.size()[1])
+#         self.assertTrue(allclose(out, expected_out))
+#
+#         # LHS gradient.
+#         grad = a.grad
+#         expected_grad = acp.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#         # RHS gradient.
+#         grad = b.grad
+#         expected_grad = bcp.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
 if __name__ == '__main__':
     unittest.main()

build/torch210-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops_test.py CHANGED Viewed

@@ -1,61 +1,61 @@
 import unittest
-from absl.testing import parameterized
 import stk
 import torch
-@parameterized.parameters(
-    (8, 16, 0.0, 1),
-    (8, 16, 0.5, 1),
-    (8, 16, .95, 1),
-    (16, 8, 0.0, 1),
-    (16, 8, 0.5, 1),
-    (16, 8, .95, 1),
-    (8, 16, 0.0, 8),
-    (8, 16, 0.5, 8),
-    (8, 16, 1.0, 8),
-    (16, 8, 0.0, 8),
-    (16, 8, 0.5, 8),
-    (16, 8, 1.0, 8),
-    (128, 256, 0.5, 16),
-    (256, 128, 0.75, 32),
-    (512, 512, .875, 128))
-class MatrixOpsTest(parameterized.TestCase):
-    def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
-        mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
-        x = (torch.randn(rows, cols) * mask).type(torch.float16)
-        # Convert the matrix to sparse format.
-        sparse_x = stk.ops.to_sparse(x, blocking)
-        # Validate the matrix.
-        sparse_x.validate()
-        # Validate the shape.
-        self.assertEqual(sparse_x.dim(), 2)
-        self.assertEqual(sparse_x.size()[0], rows)
-        self.assertEqual(sparse_x.size()[1], cols)
-        # Validate the sparsity.
-        numblocks = rows // blocking * cols // blocking
-        nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
-        self.assertEqual(sparse_x.nnz, nnz)
-        # Convert back to dense format.
-        dense_x = stk.ops.to_dense(sparse_x)
-        # Validate the shape.
-        self.assertEqual(dense_x.dim(), 2)
-        self.assertEqual(dense_x.size()[0], rows)
-        self.assertEqual(dense_x.size()[1], cols)
-        # Validate the sparsity
-        self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
-        # Validate the output.
-        self.assertTrue(torch.all(torch.eq(x, dense_x)))
 if __name__ == '__main__':

 import unittest
+# from absl.testing import parameterized
 import stk
 import torch
+# @parameterized.parameters(
+#     (8, 16, 0.0, 1),
+#     (8, 16, 0.5, 1),
+#     (8, 16, .95, 1),
+#     (16, 8, 0.0, 1),
+#     (16, 8, 0.5, 1),
+#     (16, 8, .95, 1),
+#     (8, 16, 0.0, 8),
+#     (8, 16, 0.5, 8),
+#     (8, 16, 1.0, 8),
+#     (16, 8, 0.0, 8),
+#     (16, 8, 0.5, 8),
+#     (16, 8, 1.0, 8),
+#     (128, 256, 0.5, 16),
+#     (256, 128, 0.75, 32),
+#     (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+#     def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+#         mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+#         x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+#         # Convert the matrix to sparse format.
+#         sparse_x = stk.ops.to_sparse(x, blocking)
+#
+#         # Validate the matrix.
+#         sparse_x.validate()
+#
+#         # Validate the shape.
+#         self.assertEqual(sparse_x.dim(), 2)
+#         self.assertEqual(sparse_x.size()[0], rows)
+#         self.assertEqual(sparse_x.size()[1], cols)
+#
+#         # Validate the sparsity.
+#         numblocks = rows // blocking * cols // blocking
+#         nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+#         self.assertEqual(sparse_x.nnz, nnz)
+#
+#         # Convert back to dense format.
+#         dense_x = stk.ops.to_dense(sparse_x)
+#
+#         # Validate the shape.
+#         self.assertEqual(dense_x.dim(), 2)
+#         self.assertEqual(dense_x.size()[0], rows)
+#         self.assertEqual(dense_x.size()[1], cols)
+#
+#         # Validate the sparsity
+#         self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+#         # Validate the output.
+#         self.assertTrue(torch.all(torch.eq(x, dense_x)))
 if __name__ == '__main__':

build/torch210-cxx11-cu128-x86_64-linux/stk/random/random_ops_test.py CHANGED Viewed

@@ -1,72 +1,72 @@
 import unittest
-from absl.testing import parameterized
 from . import random
 import torch
-@parameterized.parameters(
-    (8, 16, 0.0, 1),
-    (8, 16, 0.5, 1),
-    (8, 16, .95, 1),
-    (16, 8, 0.0, 1),
-    (16, 8, 0.5, 1),
-    (16, 8, .95, 1),
-    (8, 16, 0.0, 8),
-    (8, 16, 0.5, 8),
-    (8, 16, 1.0, 8),
-    (16, 8, 0.0, 8),
-    (16, 8, 0.5, 8),
-    (16, 8, 1.0, 8),
-    (128, 256, 0.5, 16),
-    (256, 128, 0.75, 32),
-    (512, 512, .875, 128))
-class RandomOpsTest(parameterized.TestCase):
-    def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
-        mask = random.dense_mask(
-            rows, cols, sparsity, blocking)
-        # Validate the shape.
-        self.assertEqual(mask.dim(), 2)
-        self.assertEqual(mask.size()[0], rows)
-        self.assertEqual(mask.size()[1], cols)
-        # Validate the sparsity
-        numblocks = rows // blocking * cols // blocking
-        nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
-        self.assertEqual(
-            torch.count_nonzero(mask).item(),
-            nnz)
-        # Check values are zero or one.
-        self.assertTrue(
-            torch.all(torch.logical_or(
-                torch.eq(mask, 0),
-                torch.eq(mask, 1))))
-    def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
-        mask = random.mask(
-            rows, cols, sparsity, blocking)
-        # Validate the matrix.
-        mask.validate()
-        # Validate the shape.
-        self.assertEqual(mask.dim(), 2)
-        self.assertEqual(mask.size()[0], rows)
-        self.assertEqual(mask.size()[1], cols)
-        # Validate the sparsity.
-        numblocks = rows // blocking * cols // blocking
-        nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
-        self.assertEqual(mask.nnz, nnz)
-        # Check values are zero or one.
-        self.assertTrue(
-            torch.all(torch.logical_or(
-                torch.eq(mask.data, 0),
-                torch.eq(mask.data, 1))))
 if __name__ == '__main__':

 import unittest
+# from absl.testing import parameterized
 from . import random
 import torch
+# @parameterized.parameters(
+#     (8, 16, 0.0, 1),
+#     (8, 16, 0.5, 1),
+#     (8, 16, .95, 1),
+#     (16, 8, 0.0, 1),
+#     (16, 8, 0.5, 1),
+#     (16, 8, .95, 1),
+#     (8, 16, 0.0, 8),
+#     (8, 16, 0.5, 8),
+#     (8, 16, 1.0, 8),
+#     (16, 8, 0.0, 8),
+#     (16, 8, 0.5, 8),
+#     (16, 8, 1.0, 8),
+#     (128, 256, 0.5, 16),
+#     (256, 128, 0.75, 32),
+#     (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+#     def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+#         mask = random.dense_mask(
+#             rows, cols, sparsity, blocking)
+#
+#         # Validate the shape.
+#         self.assertEqual(mask.dim(), 2)
+#         self.assertEqual(mask.size()[0], rows)
+#         self.assertEqual(mask.size()[1], cols)
+#
+#         # Validate the sparsity
+#         numblocks = rows // blocking * cols // blocking
+#         nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+#         self.assertEqual(
+#             torch.count_nonzero(mask).item(),
+#             nnz)
+#
+#         # Check values are zero or one.
+#         self.assertTrue(
+#             torch.all(torch.logical_or(
+#                 torch.eq(mask, 0),
+#                 torch.eq(mask, 1))))
+#
+#     def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+#         mask = random.mask(
+#             rows, cols, sparsity, blocking)
+#
+#         # Validate the matrix.
+#         mask.validate()
+#
+#         # Validate the shape.
+#         self.assertEqual(mask.dim(), 2)
+#         self.assertEqual(mask.size()[0], rows)
+#         self.assertEqual(mask.size()[1], cols)
+#
+#         # Validate the sparsity.
+#         numblocks = rows // blocking * cols // blocking
+#         nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+#         self.assertEqual(mask.nnz, nnz)
+#
+#         # Check values are zero or one.
+#         self.assertTrue(
+#             torch.all(torch.logical_or(
+#                 torch.eq(mask.data, 0),
+#                 torch.eq(mask.data, 1))))
 if __name__ == '__main__':

build/torch210-cxx11-cu130-x86_64-linux/{_megablocks_cuda_6e04dec.abi3.so → _megablocks_cuda_a45325d.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4ed503a781293a9d6150e0362edbe9360ef6e58590b511ee23596649ee9a437d
 size 12041592

 version https://git-lfs.github.com/spec/v1
+oid sha256:391ee51a42c7bf87472426a9291154d2c9cf2f32be7826a24e09a0e7fd192e4c
 size 12041592

build/torch210-cxx11-cu130-x86_64-linux/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_cuda_6e04dec
-ops = torch.ops._megablocks_cuda_6e04dec
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_cuda_6e04dec::{op_name}"

 import torch
+from . import _megablocks_cuda_a45325d
+ops = torch.ops._megablocks_cuda_a45325d
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_megablocks_cuda_a45325d::{op_name}"

build/torch210-cxx11-cu130-x86_64-linux/megablocks/__init__.py CHANGED Viewed

@@ -1,10 +1,10 @@
 import ctypes
 import sys
-import importlib
 from pathlib import Path
 from types import ModuleType
 def _import_from_path(file_path: Path) -> ModuleType:
     # We cannot use the module name as-is, after adding it to `sys.modules`,
     # it would also be used for other imports. So, we make a module name that

 import ctypes
+import importlib.util
 import sys
 from pathlib import Path
 from types import ModuleType
 def _import_from_path(file_path: Path) -> ModuleType:
     # We cannot use the module name as-is, after adding it to `sys.modules`,
     # it would also be used for other imports. So, we make a module name that

build/torch210-cxx11-cu130-x86_64-linux/ops/histogram_benchmark.py CHANGED Viewed

@@ -5,7 +5,7 @@ import unittest
 import numpy as np
 import torch
-from absl.testing import parameterized
 from .. import ops
@@ -47,31 +47,31 @@ def log_benchmark(arguments, mean_t, std_t):
     print('=' * 60)
-class HistogramBenchmark(parameterized.TestCase):
-    @parameterized.parameters(*_HISTOGRAM_TESTS)
-    def testHistogram(self, n, dtype, max_val):
-        x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
-        mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
-        arguments = {
-            'n': n,
-            'dtype': dtype,
-            'max_val': max_val,
-        }
-        log_benchmark(arguments, mean_t, std_t)
-    @parameterized.parameters(*_HISTOGRAM_TESTS)
-    def testTorchHistogram(self, n, dtype, max_val):
-        x = torch.randint(0, 128, (n,)).cuda().to(dtype)
-        mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
-        arguments = {
-            'n': n,
-            'dtype': dtype,
-            'max_val': max_val,
-        }
-        log_benchmark(arguments, mean_t, std_t)
 if __name__ == '__main__':

 import numpy as np
 import torch
+# from absl.testing import parameterized
 from .. import ops
     print('=' * 60)
+# class HistogramBenchmark(parameterized.TestCase):
+#
+#     @parameterized.parameters(*_HISTOGRAM_TESTS)
+#     def testHistogram(self, n, dtype, max_val):
+#         x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+#         mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
+#         arguments = {
+#             'n': n,
+#             'dtype': dtype,
+#             'max_val': max_val,
+#         }
+#         log_benchmark(arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_HISTOGRAM_TESTS)
+#     def testTorchHistogram(self, n, dtype, max_val):
+#         x = torch.randint(0, 128, (n,)).cuda().to(dtype)
+#
+#         mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
+#         arguments = {
+#             'n': n,
+#             'dtype': dtype,
+#             'max_val': max_val,
+#         }
+#         log_benchmark(arguments, mean_t, std_t)
 if __name__ == '__main__':

build/torch210-cxx11-cu130-x86_64-linux/ops/matmul_benchmark.py CHANGED Viewed

@@ -17,7 +17,7 @@ import unittest
 from .. import stk
 import torch
-from absl.testing import parameterized
 from .. import benchmark_util, ops
@@ -48,367 +48,367 @@ def log_benchmark(name, arguments, time, std, flops):
     print('=' * 60)
-class MatmulBenchmark(parameterized.TestCase):
-    def build_sparse_matrix(self, x, padded_bins, fhs, ne):
-        blocking = 128
-        padded_tokens, _ = x.size()
-        assert padded_tokens % blocking == 0
-        assert fhs % blocking == 0
-        # Offsets for the sparse matrix. All rows have the
-        # same number of nonzero blocks dictated by the
-        # dimensionality of a single expert.
-        block_rows = padded_tokens // blocking
-        blocks_per_row = fhs // blocking
-        offsets = torch.arange(
-            0,
-            block_rows * blocks_per_row + 1,
-            blocks_per_row,
-            dtype=torch.int32,
-            device=x.device,
-        )
-        # Indices for the sparse matrix. The indices for
-        # the intermediate matrix are dynamic depending
-        # on the mapping of tokens to experts.
-        column_indices = ops.topology(
-            padded_bins,
-            blocking,
-            block_rows,
-            blocks_per_row,
-        )
-        data = torch.empty(
-            column_indices.numel(),
-            blocking,
-            blocking,
-            dtype=torch.float16,
-            device=x.device,
-        )
-        shape = (padded_tokens, fhs * ne)
-        row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
-        return stk.Matrix(shape, data, row_indices, column_indices, offsets)
-    def build_input_matrix(self, sl, hs, ne):
-        x = torch.randn((sl, hs)).cuda().half()
-        # Assign tokens to experts uniformly.
-        top_expert = torch.arange(0, sl).cuda().int() % ne
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(top_expert, ne)
-        padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
-        padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
-        return out, padded_bins
-    def build_weight_matrix(self, ne, hs, fhs):
-        return torch.randn((hs, ne * fhs)).cuda().half()
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        w = transpose_view(w)
-        def benchmark():
-            return stk.ops.sdd(x, w, topo)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0::Fwd::SDD::NT',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        def benchmark():
-            return stk.ops.dsd(topo, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0::GradX::DSD::NN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        topo = topo.t()
-        def benchmark():
-            return stk.ops.dsd(topo, x)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0::GradW::DSD::TN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        def benchmark():
-            return stk.ops.dsd(x, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::Fwd::DSD::NN',
-            arguments,
-            mean_t,
-            std_t,
-            x.nnz * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        out = stk.ops.dsd(x, w)
-        w = transpose_view(w)
-        def benchmark():
-            return stk.ops.sdd(out, w, x)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::GradX::SDD::NT',
-            arguments,
-            mean_t,
-            std_t,
-            x.nnz * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
-        x, padded_bins = self.build_input_matrix(sl, hs, ne)
-        w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
-        x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
-        out = stk.ops.dsd(x, w)
-        x = x.t()
-        def benchmark():
-            return stk.ops.dsd(x, out)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::GradW::DSD::TN',
-            arguments,
-            mean_t,
-            std_t,
-            x.nnz * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, hs)).cuda().half()
-        w = torch.randn((ne, hs, fhs)).cuda().half()
-        w = w.transpose(1, 2).contiguous()
-        w = w.transpose(1, 2)
-        def benchmark():
-            return torch.bmm(x, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0::Fwd:DDD::NT',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, hs)).cuda().half()
-        w = torch.randn((ne, hs, fhs)).cuda().half()
-        out = torch.bmm(x, w)
-        w = w.transpose(1, 2).contiguous()
-        def benchmark():
-            return torch.bmm(out, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0:GradX:DDD::NN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, hs)).cuda().half()
-        w = torch.randn((ne, hs, fhs)).cuda().half()
-        out = torch.bmm(x, w)
-        out = out.transpose(1, 2)
-        def benchmark():
-            return torch.bmm(out, x)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '0:GradW:DDD::TN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * fhs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, fhs)).cuda().half()
-        w = torch.randn((ne, fhs, hs)).cuda().half()
-        def benchmark():
-            return torch.bmm(x, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::Fwd::DDD::NN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, fhs)).cuda().half()
-        w = torch.randn((ne, fhs, hs)).cuda().half()
-        out = torch.bmm(x, w)
-        w = torch.transpose(w, 1, 2)
-        def benchmark():
-            return torch.bmm(out, w)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::GradX::DDD::NT',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * hs * 2,
-        )
-    @parameterized.parameters(*_MATMUL_TESTS)
-    def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
-        assert (sl % ne) == 0
-        x = torch.randn((ne, sl // ne, fhs)).cuda().half()
-        w = torch.randn((ne, fhs, hs)).cuda().half()
-        out = torch.bmm(x, w)
-        x = torch.transpose(x, 1, 2)
-        def benchmark():
-            return torch.bmm(x, out)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'ffn_hidden_size': fhs,
-            'num_experts': ne,
-        }
-        log_benchmark(
-            '1::GradW::DDD::TN',
-            arguments,
-            mean_t,
-            std_t,
-            x.numel() * hs * 2,
-        )
 if __name__ == '__main__':

 from .. import stk
 import torch
+# from absl.testing import parameterized
 from .. import benchmark_util, ops
     print('=' * 60)
+# class MatmulBenchmark(parameterized.TestCase):
+#
+#     def build_sparse_matrix(self, x, padded_bins, fhs, ne):
+#         blocking = 128
+#         padded_tokens, _ = x.size()
+#         assert padded_tokens % blocking == 0
+#         assert fhs % blocking == 0
+#
+#         # Offsets for the sparse matrix. All rows have the
+#         # same number of nonzero blocks dictated by the
+#         # dimensionality of a single expert.
+#         block_rows = padded_tokens // blocking
+#         blocks_per_row = fhs // blocking
+#         offsets = torch.arange(
+#             0,
+#             block_rows * blocks_per_row + 1,
+#             blocks_per_row,
+#             dtype=torch.int32,
+#             device=x.device,
+#         )
+#
+#         # Indices for the sparse matrix. The indices for
+#         # the intermediate matrix are dynamic depending
+#         # on the mapping of tokens to experts.
+#         column_indices = ops.topology(
+#             padded_bins,
+#             blocking,
+#             block_rows,
+#             blocks_per_row,
+#         )
+#         data = torch.empty(
+#             column_indices.numel(),
+#             blocking,
+#             blocking,
+#             dtype=torch.float16,
+#             device=x.device,
+#         )
+#         shape = (padded_tokens, fhs * ne)
+#         row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
+#         return stk.Matrix(shape, data, row_indices, column_indices, offsets)
+#
+#     def build_input_matrix(self, sl, hs, ne):
+#         x = torch.randn((sl, hs)).cuda().half()
+#
+#         # Assign tokens to experts uniformly.
+#         top_expert = torch.arange(0, sl).cuda().int() % ne
+#
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+#         padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+#         padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#         out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
+#         return out, padded_bins
+#
+#     def build_weight_matrix(self, ne, hs, fhs):
+#         return torch.randn((hs, ne * fhs)).cuda().half()
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#         w = transpose_view(w)
+#
+#         def benchmark():
+#             return stk.ops.sdd(x, w, topo)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0::Fwd::SDD::NT',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+#         def benchmark():
+#             return stk.ops.dsd(topo, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0::GradX::DSD::NN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#         topo = topo.t()
+#
+#         def benchmark():
+#             return stk.ops.dsd(topo, x)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0::GradW::DSD::TN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#
+#         def benchmark():
+#             return stk.ops.dsd(x, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::Fwd::DSD::NN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.nnz * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#         out = stk.ops.dsd(x, w)
+#         w = transpose_view(w)
+#
+#         def benchmark():
+#             return stk.ops.sdd(out, w, x)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::GradX::SDD::NT',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.nnz * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
+#         x, padded_bins = self.build_input_matrix(sl, hs, ne)
+#         w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
+#         x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
+#         out = stk.ops.dsd(x, w)
+#         x = x.t()
+#
+#         def benchmark():
+#             return stk.ops.dsd(x, out)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::GradW::DSD::TN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.nnz * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, hs)).cuda().half()
+#         w = torch.randn((ne, hs, fhs)).cuda().half()
+#
+#         w = w.transpose(1, 2).contiguous()
+#         w = w.transpose(1, 2)
+#
+#         def benchmark():
+#             return torch.bmm(x, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0::Fwd:DDD::NT',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, hs)).cuda().half()
+#         w = torch.randn((ne, hs, fhs)).cuda().half()
+#         out = torch.bmm(x, w)
+#         w = w.transpose(1, 2).contiguous()
+#
+#         def benchmark():
+#             return torch.bmm(out, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0:GradX:DDD::NN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, hs)).cuda().half()
+#         w = torch.randn((ne, hs, fhs)).cuda().half()
+#         out = torch.bmm(x, w)
+#         out = out.transpose(1, 2)
+#
+#         def benchmark():
+#             return torch.bmm(out, x)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '0:GradW:DDD::TN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * fhs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+#         w = torch.randn((ne, fhs, hs)).cuda().half()
+#
+#         def benchmark():
+#             return torch.bmm(x, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::Fwd::DDD::NN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+#         w = torch.randn((ne, fhs, hs)).cuda().half()
+#         out = torch.bmm(x, w)
+#         w = torch.transpose(w, 1, 2)
+#
+#         def benchmark():
+#             return torch.bmm(out, w)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::GradX::DDD::NT',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * hs * 2,
+#         )
+#
+#     @parameterized.parameters(*_MATMUL_TESTS)
+#     def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
+#         assert (sl % ne) == 0
+#         x = torch.randn((ne, sl // ne, fhs)).cuda().half()
+#         w = torch.randn((ne, fhs, hs)).cuda().half()
+#         out = torch.bmm(x, w)
+#         x = torch.transpose(x, 1, 2)
+#
+#         def benchmark():
+#             return torch.bmm(x, out)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'ffn_hidden_size': fhs,
+#             'num_experts': ne,
+#         }
+#         log_benchmark(
+#             '1::GradW::DDD::TN',
+#             arguments,
+#             mean_t,
+#             std_t,
+#             x.numel() * hs * 2,
+#         )
 if __name__ == '__main__':

build/torch210-cxx11-cu130-x86_64-linux/ops/padded_scatter_benchmark.py CHANGED Viewed

@@ -4,7 +4,7 @@
 import unittest
 import torch
-from absl.testing import parameterized
 from .. import benchmark_util, ops
@@ -16,50 +16,50 @@ _PADDED_SCATTER_BENCHMARK = (
 )
-class PaddedScatterTest(parameterized.TestCase):
-    @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
-    def testPaddedScatter(self, sl, hs, ne, top_k):
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        # Randomly assign tokens to experts.
-        top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(top_expert, ne)
-        padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
-        padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        # Sample weights for the scatter reduce.
-        weights = torch.rand((sl * top_k,)).cuda().half()
-        # Gather the data to prepare for backwards.
-        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
-        def benchmark():
-            return ops.padded_scatter(
-                x,
-                indices,
-                bin_ids,
-                weights,
-                bins,
-                padded_bins,
-                top_k,
-            )
-        time, std = benchmark_util.benchmark_function(benchmark)
-        benchmark_util.log_benchmark(
-            'Padded Scatter',
-            {
-                'sequence_length': sl,
-                'hidden_size': hs,
-                'num_experts': ne,
-                'top_k': top_k,
-            },
-            time,
-            std,
-        )
 if __name__ == '__main__':

 import unittest
 import torch
+# from absl.testing import parameterized
 from .. import benchmark_util, ops
 )
+# class PaddedScatterTest(parameterized.TestCase):
+#
+#     @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
+#     def testPaddedScatter(self, sl, hs, ne, top_k):
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#
+#         # Randomly assign tokens to experts.
+#         top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+#         padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+#         padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+#         # Sample weights for the scatter reduce.
+#         weights = torch.rand((sl * top_k,)).cuda().half()
+#
+#         # Gather the data to prepare for backwards.
+#         x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+#
+#         def benchmark():
+#             return ops.padded_scatter(
+#                 x,
+#                 indices,
+#                 bin_ids,
+#                 weights,
+#                 bins,
+#                 padded_bins,
+#                 top_k,
+#             )
+#
+#         time, std = benchmark_util.benchmark_function(benchmark)
+#         benchmark_util.log_benchmark(
+#             'Padded Scatter',
+#             {
+#                 'sequence_length': sl,
+#                 'hidden_size': hs,
+#                 'num_experts': ne,
+#                 'top_k': top_k,
+#             },
+#             time,
+#             std,
+#         )
 if __name__ == '__main__':

build/torch210-cxx11-cu130-x86_64-linux/ops/permute_benchmark.py CHANGED Viewed

@@ -4,7 +4,7 @@
 import unittest
 import torch
-from absl.testing import parameterized
 from .. import benchmark_util, ops
@@ -26,123 +26,123 @@ _PERMUTE_TESTS = (
 )
-class PermuteBenchmark(parameterized.TestCase):
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testBinnedGather(self, sl, hs, ne):
-        # NOTE: Capacity factor == 1.
-        ec = sl // ne
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        top_expert = torch.randint(0, ne, (sl,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(indices, ne)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        def benchmark():
-            return ops.binned_gather(x, indices, bins, ec)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testBinnedScatter(self, sl, hs, ne):
-        # NOTE: Capacity factor == 1.
-        ec = sl // ne
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        top_expert = torch.randint(0, ne, (sl,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(indices, ne)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        x = ops.binned_gather(x, indices, bins, ec)
-        def benchmark():
-            return ops.binned_scatter(x, indices, bins)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testPaddedGather(self, sl, hs, ne):
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        # Randomly assign tokens to experts.
-        top_expert = torch.randint(0, ne, (sl,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(top_expert, ne)
-        padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
-        padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        def benchmark():
-            return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testPaddedScatter(self, sl, hs, ne):
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        # Randomly assign tokens to experts.
-        top_expert = torch.randint(0, ne, (sl,)).cuda().int()
-        bin_ids, indices = ops.sort(top_expert)
-        tokens_per_expert = ops.histogram(top_expert, ne)
-        padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
-        padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
-        x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
-        def benchmark():
-            return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
-    @parameterized.parameters(*_PERMUTE_TESTS)
-    def testCopy(self, sl, hs, ne):
-        # NOTE: Capacity factor == 1.
-        # ec = sl // ne
-        # Create the data and indices.
-        x = torch.randn((sl, hs)).cuda().half()
-        y = x.clone()
-        def benchmark():
-            return y.copy_(x)
-        mean_t, std_t = benchmark_util.benchmark_function(benchmark)
-        arguments = {
-            'sequence_length': sl,
-            'hidden_size': hs,
-            'num_experts': ne,
-        }
-        benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
 if __name__ == '__main__':

 import unittest
 import torch
+# from absl.testing import parameterized
 from .. import benchmark_util, ops
 )
+# class PermuteBenchmark(parameterized.TestCase):
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testBinnedGather(self, sl, hs, ne):
+#         # NOTE: Capacity factor == 1.
+#         ec = sl // ne
+#
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#         top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(indices, ne)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+#         def benchmark():
+#             return ops.binned_gather(x, indices, bins, ec)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testBinnedScatter(self, sl, hs, ne):
+#         # NOTE: Capacity factor == 1.
+#         ec = sl // ne
+#
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#         top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(indices, ne)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#         x = ops.binned_gather(x, indices, bins, ec)
+#
+#         def benchmark():
+#             return ops.binned_scatter(x, indices, bins)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testPaddedGather(self, sl, hs, ne):
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#
+#         # Randomly assign tokens to experts.
+#         top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+#         padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+#         padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#
+#         def benchmark():
+#             return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testPaddedScatter(self, sl, hs, ne):
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#
+#         # Randomly assign tokens to experts.
+#         top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+#         bin_ids, indices = ops.sort(top_expert)
+#         tokens_per_expert = ops.histogram(top_expert, ne)
+#         padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+#         padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+#         bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+#         x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
+#
+#         def benchmark():
+#             return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_PERMUTE_TESTS)
+#     def testCopy(self, sl, hs, ne):
+#         # NOTE: Capacity factor == 1.
+#         # ec = sl // ne
+#
+#         # Create the data and indices.
+#         x = torch.randn((sl, hs)).cuda().half()
+#         y = x.clone()
+#
+#         def benchmark():
+#             return y.copy_(x)
+#
+#         mean_t, std_t = benchmark_util.benchmark_function(benchmark)
+#         arguments = {
+#             'sequence_length': sl,
+#             'hidden_size': hs,
+#             'num_experts': ne,
+#         }
+#         benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
 if __name__ == '__main__':

build/torch210-cxx11-cu130-x86_64-linux/ops/sort_benchmark.py CHANGED Viewed

@@ -5,7 +5,7 @@ import unittest
 import numpy as np
 import torch
-from absl.testing import parameterized
 from .. import ops
@@ -53,32 +53,32 @@ def log_benchmark(arguments, mean_t, std_t):
     print('=' * 60)
-class SortBenchmark(parameterized.TestCase):
-    @parameterized.parameters(*_SORT_TESTS)
-    def testSort(self, n, dtype, max_val):
-        if max_val is None:
-            max_val = np.iinfo(numpy_dtype(dtype)).max
-        end_bit = int(np.ceil(np.log2(max_val)))
-        x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
-        mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
-        arguments = {
-            'n': n,
-            'dtype': dtype,
-            'max_val': max_val,
-        }
-        log_benchmark(arguments, mean_t, std_t)
-    @parameterized.parameters(*_BASELINE_SORT_TESTS)
-    def testTorchSort(self, n):
-        x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
-        mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
-        arguments = {
-            'n': n,
-        }
-        log_benchmark(arguments, mean_t, std_t)
 if __name__ == '__main__':

 import numpy as np
 import torch
+# from absl.testing import parameterized
 from .. import ops
     print('=' * 60)
+# class SortBenchmark(parameterized.TestCase):
+#
+#     @parameterized.parameters(*_SORT_TESTS)
+#     def testSort(self, n, dtype, max_val):
+#         if max_val is None:
+#             max_val = np.iinfo(numpy_dtype(dtype)).max
+#         end_bit = int(np.ceil(np.log2(max_val)))
+#         x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+#
+#         mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
+#         arguments = {
+#             'n': n,
+#             'dtype': dtype,
+#             'max_val': max_val,
+#         }
+#         log_benchmark(arguments, mean_t, std_t)
+#
+#     @parameterized.parameters(*_BASELINE_SORT_TESTS)
+#     def testTorchSort(self, n):
+#         x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
+#
+#         mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
+#         arguments = {
+#             'n': n,
+#         }
+#         log_benchmark(arguments, mean_t, std_t)
 if __name__ == '__main__':

build/torch210-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops_test.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import unittest
 import itertools
 import torch
-from absl.testing import parameterized
 import stk
 from stk.ops.linear_ops_test import allclose, _dense_and_sparse
@@ -47,40 +47,40 @@ def _dense_and_sparse_like(x, std=0.1):
     return (dense.requires_grad_(True),
             sparse.requires_grad_(True))
-@parameterized.parameters(_ELTWISE_OP_TESTS)
-class EltwiseOpsTest(parameterized.TestCase):
-    def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
-        a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
-        b_dense, b = _dense_and_sparse_like(a)
-        out = stk.ops.mul(a, b)
-        expected_out = torch.mul(a_dense, b_dense)
-        # Compute the gradients w.r.t. the inputs.
-        expected_out.sum().backward()
-        stk.ops.sum(out).backward()
-        # Validate the results.
-        out = stk.ops.to_dense(out)
-        self.assertEqual(out.dim(), 2)
-        self.assertEqual(expected_out.size(), out.size())
-        self.assertTrue(allclose(out, expected_out))
-        # LHS gradient.
-        grad = stk.ops.to_dense(a.grad)
-        expected_grad = a_dense.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size(), grad.size())
-        self.assertTrue(allclose(grad, expected_grad))
-        # RHS gradient.
-        grad =  stk.ops.to_dense(b.grad)
-        expected_grad = b_dense.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size(), grad.size())
-        self.assertTrue(allclose(grad, expected_grad))
 if __name__ == '__main__':
     unittest.main()

 import unittest
 import itertools
 import torch
+# from absl.testing import parameterized
 import stk
 from stk.ops.linear_ops_test import allclose, _dense_and_sparse
     return (dense.requires_grad_(True),
             sparse.requires_grad_(True))
+# @parameterized.parameters(_ELTWISE_OP_TESTS)
+# class EltwiseOpsTest(parameterized.TestCase):
+#
+#     def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
+#
+#         a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#         b_dense, b = _dense_and_sparse_like(a)
+#
+#         out = stk.ops.mul(a, b)
+#         expected_out = torch.mul(a_dense, b_dense)
+#
+#         # Compute the gradients w.r.t. the inputs.
+#         expected_out.sum().backward()
+#         stk.ops.sum(out).backward()
+#
+#         # Validate the results.
+#         out = stk.ops.to_dense(out)
+#         self.assertEqual(out.dim(), 2)
+#         self.assertEqual(expected_out.size(), out.size())
+#         self.assertTrue(allclose(out, expected_out))
+#
+#         # LHS gradient.
+#         grad = stk.ops.to_dense(a.grad)
+#         expected_grad = a_dense.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size(), grad.size())
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#         # RHS gradient.
+#         grad =  stk.ops.to_dense(b.grad)
+#         expected_grad = b_dense.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size(), grad.size())
+#         self.assertTrue(allclose(grad, expected_grad))
 if __name__ == '__main__':
     unittest.main()

build/torch210-cxx11-cu130-x86_64-linux/stk/ops/linear_ops_test.py CHANGED Viewed

@@ -2,7 +2,7 @@ import unittest
 import itertools
 import numpy as np
 import torch
-from absl.testing import parameterized
 import stk
@@ -96,121 +96,121 @@ def _mask(x, mask):
     return x * mask
-@parameterized.parameters(*_LINEAR_OP_TESTS)
-class LinearOpsTest(parameterized.TestCase):
-    def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
-        # Construct the operands.
-        a_shape = (k, m) if trans_a else (m, k)
-        a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
-        b_shape = (n, k) if trans_b else (k, n)
-        b, bcp = _dense_2x(*b_shape, dtype)
-        # Execute the matmul.
-        out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
-        expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
-        # Compute the gradients w.r.t. the inputs.
-        expected_out.sum().backward()
-        out.sum().backward()
-        # Validate the results.
-        self.assertEqual(out.dim(), 2)
-        self.assertEqual(expected_out.size()[0], out.size()[0])
-        self.assertEqual(expected_out.size()[1], out.size()[1])
-        self.assertTrue(allclose(out, expected_out))
-        # LHS gradient.
-        grad = stk.ops.to_dense(a.grad)
-        expected_grad = _mask(a_dense.grad, a.grad)
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-        # RHS gradient.
-        grad = b.grad
-        expected_grad = bcp.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-    def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
-        # Construct the operands.
-        a_shape = (k, m) if trans_a else (m, k)
-        a, acp = _dense_2x(*a_shape, dtype)
-        b_shape = (n, k) if trans_b else (k, n)
-        b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
-        # Execute the matmul.
-        out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
-        expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
-        # Compute the gradients w.r.t. the inputs.
-        expected_out.sum().backward()
-        out.sum().backward()
-        # Validate the results.
-        self.assertEqual(out.dim(), 2)
-        self.assertEqual(expected_out.size()[0], out.size()[0])
-        self.assertEqual(expected_out.size()[1], out.size()[1])
-        self.assertTrue(allclose(out, expected_out))
-        # LHS gradient.
-        grad = a.grad
-        expected_grad = acp.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-        # RHS gradient.
-        grad = stk.ops.to_dense(b.grad)
-        expected_grad = _mask(b_dense.grad, b.grad)
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-    def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
-        # Construct the operands.
-        a_shape = (k, m) if trans_a else (m, k)
-        a, acp = _dense_2x(*a_shape, dtype)
-        b_shape = (n, k) if trans_b else (k, n)
-        b, bcp = _dense_2x(*b_shape, dtype)
-        _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
-        # Execute the matmul.
-        out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
-        expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
-        # Compute the gradients w.r.t. the inputs.
-        expected_out.sum().backward()
-        stk.ops.sum(out).backward()
-        # Validate the results.
-        out = stk.ops.to_dense(out)
-        self.assertEqual(out.dim(), 2)
-        self.assertEqual(expected_out.size()[0], out.size()[0])
-        self.assertEqual(expected_out.size()[1], out.size()[1])
-        self.assertTrue(allclose(out, expected_out))
-        # LHS gradient.
-        grad = a.grad
-        expected_grad = acp.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
-        # RHS gradient.
-        grad = b.grad
-        expected_grad = bcp.grad
-        self.assertEqual(grad.dim(), 2)
-        self.assertEqual(expected_grad.size()[0], grad.size()[0])
-        self.assertEqual(expected_grad.size()[1], grad.size()[1])
-        self.assertTrue(allclose(grad, expected_grad))
 if __name__ == '__main__':
     unittest.main()

 import itertools
 import numpy as np
 import torch
+# from absl.testing import parameterized
 import stk
     return x * mask
+# @parameterized.parameters(*_LINEAR_OP_TESTS)
+# class LinearOpsTest(parameterized.TestCase):
+#
+#     def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+#         # Construct the operands.
+#         a_shape = (k, m) if trans_a else (m, k)
+#         a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
+#         b_shape = (n, k) if trans_b else (k, n)
+#         b, bcp = _dense_2x(*b_shape, dtype)
+#
+#         # Execute the matmul.
+#         out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
+#         expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
+#
+#         # Compute the gradients w.r.t. the inputs.
+#         expected_out.sum().backward()
+#         out.sum().backward()
+#
+#         # Validate the results.
+#         self.assertEqual(out.dim(), 2)
+#         self.assertEqual(expected_out.size()[0], out.size()[0])
+#         self.assertEqual(expected_out.size()[1], out.size()[1])
+#         self.assertTrue(allclose(out, expected_out))
+#
+#         # LHS gradient.
+#         grad = stk.ops.to_dense(a.grad)
+#         expected_grad = _mask(a_dense.grad, a.grad)
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#         # RHS gradient.
+#         grad = b.grad
+#         expected_grad = bcp.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#     def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+#         # Construct the operands.
+#         a_shape = (k, m) if trans_a else (m, k)
+#         a, acp = _dense_2x(*a_shape, dtype)
+#         b_shape = (n, k) if trans_b else (k, n)
+#         b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
+#
+#         # Execute the matmul.
+#         out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
+#         expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
+#
+#         # Compute the gradients w.r.t. the inputs.
+#         expected_out.sum().backward()
+#         out.sum().backward()
+#
+#         # Validate the results.
+#         self.assertEqual(out.dim(), 2)
+#         self.assertEqual(expected_out.size()[0], out.size()[0])
+#         self.assertEqual(expected_out.size()[1], out.size()[1])
+#         self.assertTrue(allclose(out, expected_out))
+#
+#         # LHS gradient.
+#         grad = a.grad
+#         expected_grad = acp.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#         # RHS gradient.
+#         grad = stk.ops.to_dense(b.grad)
+#         expected_grad = _mask(b_dense.grad, b.grad)
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#     def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
+#         # Construct the operands.
+#         a_shape = (k, m) if trans_a else (m, k)
+#         a, acp = _dense_2x(*a_shape, dtype)
+#         b_shape = (n, k) if trans_b else (k, n)
+#         b, bcp = _dense_2x(*b_shape, dtype)
+#         _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
+#
+#         # Execute the matmul.
+#         out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
+#         expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
+#
+#         # Compute the gradients w.r.t. the inputs.
+#         expected_out.sum().backward()
+#         stk.ops.sum(out).backward()
+#
+#         # Validate the results.
+#         out = stk.ops.to_dense(out)
+#         self.assertEqual(out.dim(), 2)
+#         self.assertEqual(expected_out.size()[0], out.size()[0])
+#         self.assertEqual(expected_out.size()[1], out.size()[1])
+#         self.assertTrue(allclose(out, expected_out))
+#
+#         # LHS gradient.
+#         grad = a.grad
+#         expected_grad = acp.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
+#
+#         # RHS gradient.
+#         grad = b.grad
+#         expected_grad = bcp.grad
+#         self.assertEqual(grad.dim(), 2)
+#         self.assertEqual(expected_grad.size()[0], grad.size()[0])
+#         self.assertEqual(expected_grad.size()[1], grad.size()[1])
+#         self.assertTrue(allclose(grad, expected_grad))
 if __name__ == '__main__':
     unittest.main()

build/torch210-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops_test.py CHANGED Viewed

@@ -1,61 +1,61 @@
 import unittest
-from absl.testing import parameterized
 import stk
 import torch
-@parameterized.parameters(
-    (8, 16, 0.0, 1),
-    (8, 16, 0.5, 1),
-    (8, 16, .95, 1),
-    (16, 8, 0.0, 1),
-    (16, 8, 0.5, 1),
-    (16, 8, .95, 1),
-    (8, 16, 0.0, 8),
-    (8, 16, 0.5, 8),
-    (8, 16, 1.0, 8),
-    (16, 8, 0.0, 8),
-    (16, 8, 0.5, 8),
-    (16, 8, 1.0, 8),
-    (128, 256, 0.5, 16),
-    (256, 128, 0.75, 32),
-    (512, 512, .875, 128))
-class MatrixOpsTest(parameterized.TestCase):
-    def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
-        mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
-        x = (torch.randn(rows, cols) * mask).type(torch.float16)
-        # Convert the matrix to sparse format.
-        sparse_x = stk.ops.to_sparse(x, blocking)
-        # Validate the matrix.
-        sparse_x.validate()
-        # Validate the shape.
-        self.assertEqual(sparse_x.dim(), 2)
-        self.assertEqual(sparse_x.size()[0], rows)
-        self.assertEqual(sparse_x.size()[1], cols)
-        # Validate the sparsity.
-        numblocks = rows // blocking * cols // blocking
-        nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
-        self.assertEqual(sparse_x.nnz, nnz)
-        # Convert back to dense format.
-        dense_x = stk.ops.to_dense(sparse_x)
-        # Validate the shape.
-        self.assertEqual(dense_x.dim(), 2)
-        self.assertEqual(dense_x.size()[0], rows)
-        self.assertEqual(dense_x.size()[1], cols)
-        # Validate the sparsity
-        self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
-        # Validate the output.
-        self.assertTrue(torch.all(torch.eq(x, dense_x)))
 if __name__ == '__main__':

 import unittest
+# from absl.testing import parameterized
 import stk
 import torch
+# @parameterized.parameters(
+#     (8, 16, 0.0, 1),
+#     (8, 16, 0.5, 1),
+#     (8, 16, .95, 1),
+#     (16, 8, 0.0, 1),
+#     (16, 8, 0.5, 1),
+#     (16, 8, .95, 1),
+#     (8, 16, 0.0, 8),
+#     (8, 16, 0.5, 8),
+#     (8, 16, 1.0, 8),
+#     (16, 8, 0.0, 8),
+#     (16, 8, 0.5, 8),
+#     (16, 8, 1.0, 8),
+#     (128, 256, 0.5, 16),
+#     (256, 128, 0.75, 32),
+#     (512, 512, .875, 128))
+# class MatrixOpsTest(parameterized.TestCase):
+#
+#     def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
+#         mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
+#         x = (torch.randn(rows, cols) * mask).type(torch.float16)
+#
+#         # Convert the matrix to sparse format.
+#         sparse_x = stk.ops.to_sparse(x, blocking)
+#
+#         # Validate the matrix.
+#         sparse_x.validate()
+#
+#         # Validate the shape.
+#         self.assertEqual(sparse_x.dim(), 2)
+#         self.assertEqual(sparse_x.size()[0], rows)
+#         self.assertEqual(sparse_x.size()[1], cols)
+#
+#         # Validate the sparsity.
+#         numblocks = rows // blocking * cols // blocking
+#         nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+#         self.assertEqual(sparse_x.nnz, nnz)
+#
+#         # Convert back to dense format.
+#         dense_x = stk.ops.to_dense(sparse_x)
+#
+#         # Validate the shape.
+#         self.assertEqual(dense_x.dim(), 2)
+#         self.assertEqual(dense_x.size()[0], rows)
+#         self.assertEqual(dense_x.size()[1], cols)
+#
+#         # Validate the sparsity
+#         self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
+#
+#         # Validate the output.
+#         self.assertTrue(torch.all(torch.eq(x, dense_x)))
 if __name__ == '__main__':

build/torch210-cxx11-cu130-x86_64-linux/stk/random/random_ops_test.py CHANGED Viewed

@@ -1,72 +1,72 @@
 import unittest
-from absl.testing import parameterized
 from . import random
 import torch
-@parameterized.parameters(
-    (8, 16, 0.0, 1),
-    (8, 16, 0.5, 1),
-    (8, 16, .95, 1),
-    (16, 8, 0.0, 1),
-    (16, 8, 0.5, 1),
-    (16, 8, .95, 1),
-    (8, 16, 0.0, 8),
-    (8, 16, 0.5, 8),
-    (8, 16, 1.0, 8),
-    (16, 8, 0.0, 8),
-    (16, 8, 0.5, 8),
-    (16, 8, 1.0, 8),
-    (128, 256, 0.5, 16),
-    (256, 128, 0.75, 32),
-    (512, 512, .875, 128))
-class RandomOpsTest(parameterized.TestCase):
-    def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
-        mask = random.dense_mask(
-            rows, cols, sparsity, blocking)
-        # Validate the shape.
-        self.assertEqual(mask.dim(), 2)
-        self.assertEqual(mask.size()[0], rows)
-        self.assertEqual(mask.size()[1], cols)
-        # Validate the sparsity
-        numblocks = rows // blocking * cols // blocking
-        nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
-        self.assertEqual(
-            torch.count_nonzero(mask).item(),
-            nnz)
-        # Check values are zero or one.
-        self.assertTrue(
-            torch.all(torch.logical_or(
-                torch.eq(mask, 0),
-                torch.eq(mask, 1))))
-    def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
-        mask = random.mask(
-            rows, cols, sparsity, blocking)
-        # Validate the matrix.
-        mask.validate()
-        # Validate the shape.
-        self.assertEqual(mask.dim(), 2)
-        self.assertEqual(mask.size()[0], rows)
-        self.assertEqual(mask.size()[1], cols)
-        # Validate the sparsity.
-        numblocks = rows // blocking * cols // blocking
-        nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
-        self.assertEqual(mask.nnz, nnz)
-        # Check values are zero or one.
-        self.assertTrue(
-            torch.all(torch.logical_or(
-                torch.eq(mask.data, 0),
-                torch.eq(mask.data, 1))))
 if __name__ == '__main__':

 import unittest
+# from absl.testing import parameterized
 from . import random
 import torch
+# @parameterized.parameters(
+#     (8, 16, 0.0, 1),
+#     (8, 16, 0.5, 1),
+#     (8, 16, .95, 1),
+#     (16, 8, 0.0, 1),
+#     (16, 8, 0.5, 1),
+#     (16, 8, .95, 1),
+#     (8, 16, 0.0, 8),
+#     (8, 16, 0.5, 8),
+#     (8, 16, 1.0, 8),
+#     (16, 8, 0.0, 8),
+#     (16, 8, 0.5, 8),
+#     (16, 8, 1.0, 8),
+#     (128, 256, 0.5, 16),
+#     (256, 128, 0.75, 32),
+#     (512, 512, .875, 128))
+# class RandomOpsTest(parameterized.TestCase):
+#
+#     def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
+#         mask = random.dense_mask(
+#             rows, cols, sparsity, blocking)
+#
+#         # Validate the shape.
+#         self.assertEqual(mask.dim(), 2)
+#         self.assertEqual(mask.size()[0], rows)
+#         self.assertEqual(mask.size()[1], cols)
+#
+#         # Validate the sparsity
+#         numblocks = rows // blocking * cols // blocking
+#         nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+#         self.assertEqual(
+#             torch.count_nonzero(mask).item(),
+#             nnz)
+#
+#         # Check values are zero or one.
+#         self.assertTrue(
+#             torch.all(torch.logical_or(
+#                 torch.eq(mask, 0),
+#                 torch.eq(mask, 1))))
+#
+#     def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
+#         mask = random.mask(
+#             rows, cols, sparsity, blocking)
+#
+#         # Validate the matrix.
+#         mask.validate()
+#
+#         # Validate the shape.
+#         self.assertEqual(mask.dim(), 2)
+#         self.assertEqual(mask.size()[0], rows)
+#         self.assertEqual(mask.size()[1], cols)
+#
+#         # Validate the sparsity.
+#         numblocks = rows // blocking * cols // blocking
+#         nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
+#         self.assertEqual(mask.nnz, nnz)
+#
+#         # Check values are zero or one.
+#         self.assertTrue(
+#             torch.all(torch.logical_or(
+#                 torch.eq(mask.data, 0),
+#                 torch.eq(mask.data, 1))))
 if __name__ == '__main__':

build/torch210-cxx11-xpu20253-x86_64-linux/{_megablocks_xpu_6e04dec.abi3.so → _megablocks_xpu_a45325d.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:46cfa6050944b0bd6daeaf4848fe5393a68397ae29a5c7f0a04280e287cb0e7d
 size 5381760

 version https://git-lfs.github.com/spec/v1
+oid sha256:929e28d44de28c212187ee2c71b8427c84a7372157ee7bc815e7e0e1941a9f40
 size 5381760