Kernels
danieldk HF Staff committed on
Commit
f9dfc57
·
verified ·
1 Parent(s): 578605e

Build uploaded using `kernels`.

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. build/torch210-cxx11-cpu-x86_64-linux/{_megablocks_cpu_6e04dec.abi3.so → _megablocks_cpu_a45325d.abi3.so} +1 -1
  2. build/torch210-cxx11-cpu-x86_64-linux/_ops.py +3 -3
  3. build/torch210-cxx11-cpu-x86_64-linux/megablocks/__init__.py +2 -2
  4. build/torch210-cxx11-cpu-x86_64-linux/metadata.json +4 -1
  5. build/torch210-cxx11-cpu-x86_64-linux/ops/histogram_benchmark.py +26 -26
  6. build/torch210-cxx11-cpu-x86_64-linux/ops/matmul_benchmark.py +362 -362
  7. build/torch210-cxx11-cpu-x86_64-linux/ops/padded_scatter_benchmark.py +45 -45
  8. build/torch210-cxx11-cpu-x86_64-linux/ops/permute_benchmark.py +118 -118
  9. build/torch210-cxx11-cpu-x86_64-linux/ops/sort_benchmark.py +27 -27
  10. build/torch210-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops_test.py +35 -35
  11. build/torch210-cxx11-cpu-x86_64-linux/stk/ops/linear_ops_test.py +116 -116
  12. build/torch210-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops_test.py +52 -52
  13. build/torch210-cxx11-cpu-x86_64-linux/stk/random/random_ops_test.py +63 -63
  14. build/torch210-cxx11-cu126-x86_64-linux/{_megablocks_cuda_6e04dec.abi3.so → _megablocks_cuda_a45325d.abi3.so} +1 -1
  15. build/torch210-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  16. build/torch210-cxx11-cu126-x86_64-linux/megablocks/__init__.py +2 -2
  17. build/torch210-cxx11-cu126-x86_64-linux/ops/histogram_benchmark.py +26 -26
  18. build/torch210-cxx11-cu126-x86_64-linux/ops/matmul_benchmark.py +362 -362
  19. build/torch210-cxx11-cu126-x86_64-linux/ops/padded_scatter_benchmark.py +45 -45
  20. build/torch210-cxx11-cu126-x86_64-linux/ops/permute_benchmark.py +118 -118
  21. build/torch210-cxx11-cu126-x86_64-linux/ops/sort_benchmark.py +27 -27
  22. build/torch210-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops_test.py +35 -35
  23. build/torch210-cxx11-cu126-x86_64-linux/stk/ops/linear_ops_test.py +116 -116
  24. build/torch210-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops_test.py +52 -52
  25. build/torch210-cxx11-cu126-x86_64-linux/stk/random/random_ops_test.py +63 -63
  26. build/torch210-cxx11-cu128-x86_64-linux/{_megablocks_cuda_6e04dec.abi3.so → _megablocks_cuda_a45325d.abi3.so} +1 -1
  27. build/torch210-cxx11-cu128-x86_64-linux/_ops.py +3 -3
  28. build/torch210-cxx11-cu128-x86_64-linux/megablocks/__init__.py +2 -2
  29. build/torch210-cxx11-cu128-x86_64-linux/ops/histogram_benchmark.py +26 -26
  30. build/torch210-cxx11-cu128-x86_64-linux/ops/matmul_benchmark.py +362 -362
  31. build/torch210-cxx11-cu128-x86_64-linux/ops/padded_scatter_benchmark.py +45 -45
  32. build/torch210-cxx11-cu128-x86_64-linux/ops/permute_benchmark.py +118 -118
  33. build/torch210-cxx11-cu128-x86_64-linux/ops/sort_benchmark.py +27 -27
  34. build/torch210-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops_test.py +35 -35
  35. build/torch210-cxx11-cu128-x86_64-linux/stk/ops/linear_ops_test.py +116 -116
  36. build/torch210-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops_test.py +52 -52
  37. build/torch210-cxx11-cu128-x86_64-linux/stk/random/random_ops_test.py +63 -63
  38. build/torch210-cxx11-cu130-x86_64-linux/{_megablocks_cuda_6e04dec.abi3.so → _megablocks_cuda_a45325d.abi3.so} +1 -1
  39. build/torch210-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  40. build/torch210-cxx11-cu130-x86_64-linux/megablocks/__init__.py +2 -2
  41. build/torch210-cxx11-cu130-x86_64-linux/ops/histogram_benchmark.py +26 -26
  42. build/torch210-cxx11-cu130-x86_64-linux/ops/matmul_benchmark.py +362 -362
  43. build/torch210-cxx11-cu130-x86_64-linux/ops/padded_scatter_benchmark.py +45 -45
  44. build/torch210-cxx11-cu130-x86_64-linux/ops/permute_benchmark.py +118 -118
  45. build/torch210-cxx11-cu130-x86_64-linux/ops/sort_benchmark.py +27 -27
  46. build/torch210-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops_test.py +35 -35
  47. build/torch210-cxx11-cu130-x86_64-linux/stk/ops/linear_ops_test.py +116 -116
  48. build/torch210-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops_test.py +52 -52
  49. build/torch210-cxx11-cu130-x86_64-linux/stk/random/random_ops_test.py +63 -63
  50. build/torch210-cxx11-xpu20253-x86_64-linux/{_megablocks_xpu_6e04dec.abi3.so → _megablocks_xpu_a45325d.abi3.so} +1 -1
build/torch210-cxx11-cpu-x86_64-linux/{_megablocks_cpu_6e04dec.abi3.so → _megablocks_cpu_a45325d.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70b79b772262fee7ee79153a54dc208c9166f4c34680f752b7bc2ce8d8ae1f74
3
  size 2219080
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef67276bfac31793120c3afb0e2579d50f8af875102a253f92ac8f170eec604b
3
  size 2219080
build/torch210-cxx11-cpu-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_cpu_6e04dec
3
- ops = torch.ops._megablocks_cpu_6e04dec
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_cpu_6e04dec::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_cpu_a45325d
3
+ ops = torch.ops._megablocks_cpu_a45325d
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_cpu_a45325d::{op_name}"
build/torch210-cxx11-cpu-x86_64-linux/megablocks/__init__.py CHANGED
@@ -1,10 +1,10 @@
1
  import ctypes
 
2
  import sys
3
-
4
- import importlib
5
  from pathlib import Path
6
  from types import ModuleType
7
 
 
8
  def _import_from_path(file_path: Path) -> ModuleType:
9
  # We cannot use the module name as-is, after adding it to `sys.modules`,
10
  # it would also be used for other imports. So, we make a module name that
 
1
  import ctypes
2
+ import importlib.util
3
  import sys
 
 
4
  from pathlib import Path
5
  from types import ModuleType
6
 
7
+
8
  def _import_from_path(file_path: Path) -> ModuleType:
9
  # We cannot use the module name as-is, after adding it to `sys.modules`,
10
  # it would also be used for other imports. So, we make a module name that
build/torch210-cxx11-cpu-x86_64-linux/metadata.json CHANGED
@@ -1,5 +1,8 @@
1
  {
2
  "version": 1,
3
  "license": "Apache-2.0",
4
- "python-depends": []
 
 
 
5
  }
 
1
  {
2
  "version": 1,
3
  "license": "Apache-2.0",
4
+ "python-depends": [],
5
+ "backend": {
6
+ "type": "cpu"
7
+ }
8
  }
build/torch210-cxx11-cpu-x86_64-linux/ops/histogram_benchmark.py CHANGED
@@ -5,7 +5,7 @@ import unittest
5
 
6
  import numpy as np
7
  import torch
8
- from absl.testing import parameterized
9
 
10
  from .. import ops
11
 
@@ -47,31 +47,31 @@ def log_benchmark(arguments, mean_t, std_t):
47
  print('=' * 60)
48
 
49
 
50
- class HistogramBenchmark(parameterized.TestCase):
51
-
52
- @parameterized.parameters(*_HISTOGRAM_TESTS)
53
- def testHistogram(self, n, dtype, max_val):
54
- x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
55
-
56
- mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
57
- arguments = {
58
- 'n': n,
59
- 'dtype': dtype,
60
- 'max_val': max_val,
61
- }
62
- log_benchmark(arguments, mean_t, std_t)
63
-
64
- @parameterized.parameters(*_HISTOGRAM_TESTS)
65
- def testTorchHistogram(self, n, dtype, max_val):
66
- x = torch.randint(0, 128, (n,)).cuda().to(dtype)
67
-
68
- mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
69
- arguments = {
70
- 'n': n,
71
- 'dtype': dtype,
72
- 'max_val': max_val,
73
- }
74
- log_benchmark(arguments, mean_t, std_t)
75
 
76
 
77
  if __name__ == '__main__':
 
5
 
6
  import numpy as np
7
  import torch
8
+ # from absl.testing import parameterized
9
 
10
  from .. import ops
11
 
 
47
  print('=' * 60)
48
 
49
 
50
+ # class HistogramBenchmark(parameterized.TestCase):
51
+ #
52
+ # @parameterized.parameters(*_HISTOGRAM_TESTS)
53
+ # def testHistogram(self, n, dtype, max_val):
54
+ # x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
55
+ #
56
+ # mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
57
+ # arguments = {
58
+ # 'n': n,
59
+ # 'dtype': dtype,
60
+ # 'max_val': max_val,
61
+ # }
62
+ # log_benchmark(arguments, mean_t, std_t)
63
+ #
64
+ # @parameterized.parameters(*_HISTOGRAM_TESTS)
65
+ # def testTorchHistogram(self, n, dtype, max_val):
66
+ # x = torch.randint(0, 128, (n,)).cuda().to(dtype)
67
+ #
68
+ # mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
69
+ # arguments = {
70
+ # 'n': n,
71
+ # 'dtype': dtype,
72
+ # 'max_val': max_val,
73
+ # }
74
+ # log_benchmark(arguments, mean_t, std_t)
75
 
76
 
77
  if __name__ == '__main__':
build/torch210-cxx11-cpu-x86_64-linux/ops/matmul_benchmark.py CHANGED
@@ -17,7 +17,7 @@ import unittest
17
  from .. import stk
18
 
19
  import torch
20
- from absl.testing import parameterized
21
 
22
  from .. import benchmark_util, ops
23
 
@@ -48,367 +48,367 @@ def log_benchmark(name, arguments, time, std, flops):
48
  print('=' * 60)
49
 
50
 
51
- class MatmulBenchmark(parameterized.TestCase):
52
-
53
- def build_sparse_matrix(self, x, padded_bins, fhs, ne):
54
- blocking = 128
55
- padded_tokens, _ = x.size()
56
- assert padded_tokens % blocking == 0
57
- assert fhs % blocking == 0
58
-
59
- # Offsets for the sparse matrix. All rows have the
60
- # same number of nonzero blocks dictated by the
61
- # dimensionality of a single expert.
62
- block_rows = padded_tokens // blocking
63
- blocks_per_row = fhs // blocking
64
- offsets = torch.arange(
65
- 0,
66
- block_rows * blocks_per_row + 1,
67
- blocks_per_row,
68
- dtype=torch.int32,
69
- device=x.device,
70
- )
71
-
72
- # Indices for the sparse matrix. The indices for
73
- # the intermediate matrix are dynamic depending
74
- # on the mapping of tokens to experts.
75
- column_indices = ops.topology(
76
- padded_bins,
77
- blocking,
78
- block_rows,
79
- blocks_per_row,
80
- )
81
- data = torch.empty(
82
- column_indices.numel(),
83
- blocking,
84
- blocking,
85
- dtype=torch.float16,
86
- device=x.device,
87
- )
88
- shape = (padded_tokens, fhs * ne)
89
- row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
90
- return stk.Matrix(shape, data, row_indices, column_indices, offsets)
91
-
92
- def build_input_matrix(self, sl, hs, ne):
93
- x = torch.randn((sl, hs)).cuda().half()
94
-
95
- # Assign tokens to experts uniformly.
96
- top_expert = torch.arange(0, sl).cuda().int() % ne
97
-
98
- bin_ids, indices = ops.sort(top_expert)
99
- tokens_per_expert = ops.histogram(top_expert, ne)
100
- padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
101
- padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
102
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
103
- out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
104
- return out, padded_bins
105
-
106
- def build_weight_matrix(self, ne, hs, fhs):
107
- return torch.randn((hs, ne * fhs)).cuda().half()
108
-
109
- @parameterized.parameters(*_MATMUL_TESTS)
110
- def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
111
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
112
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
113
- topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
114
- w = transpose_view(w)
115
-
116
- def benchmark():
117
- return stk.ops.sdd(x, w, topo)
118
-
119
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
- arguments = {
121
- 'sequence_length': sl,
122
- 'hidden_size': hs,
123
- 'ffn_hidden_size': fhs,
124
- 'num_experts': ne,
125
- }
126
- log_benchmark(
127
- '0::Fwd::SDD::NT',
128
- arguments,
129
- mean_t,
130
- std_t,
131
- x.numel() * fhs * 2,
132
- )
133
-
134
- @parameterized.parameters(*_MATMUL_TESTS)
135
- def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
136
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
137
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
138
- topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
139
-
140
- def benchmark():
141
- return stk.ops.dsd(topo, w)
142
-
143
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
144
- arguments = {
145
- 'sequence_length': sl,
146
- 'hidden_size': hs,
147
- 'ffn_hidden_size': fhs,
148
- 'num_experts': ne,
149
- }
150
- log_benchmark(
151
- '0::GradX::DSD::NN',
152
- arguments,
153
- mean_t,
154
- std_t,
155
- x.numel() * fhs * 2,
156
- )
157
-
158
- @parameterized.parameters(*_MATMUL_TESTS)
159
- def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
160
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
161
- topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
162
- topo = topo.t()
163
-
164
- def benchmark():
165
- return stk.ops.dsd(topo, x)
166
-
167
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
168
- arguments = {
169
- 'sequence_length': sl,
170
- 'hidden_size': hs,
171
- 'ffn_hidden_size': fhs,
172
- 'num_experts': ne,
173
- }
174
- log_benchmark(
175
- '0::GradW::DSD::TN',
176
- arguments,
177
- mean_t,
178
- std_t,
179
- x.numel() * fhs * 2,
180
- )
181
-
182
- @parameterized.parameters(*_MATMUL_TESTS)
183
- def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
184
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
185
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
186
- x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
187
-
188
- def benchmark():
189
- return stk.ops.dsd(x, w)
190
-
191
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
192
- arguments = {
193
- 'sequence_length': sl,
194
- 'hidden_size': hs,
195
- 'ffn_hidden_size': fhs,
196
- 'num_experts': ne,
197
- }
198
- log_benchmark(
199
- '1::Fwd::DSD::NN',
200
- arguments,
201
- mean_t,
202
- std_t,
203
- x.nnz * hs * 2,
204
- )
205
-
206
- @parameterized.parameters(*_MATMUL_TESTS)
207
- def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
208
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
209
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
210
- x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
211
- out = stk.ops.dsd(x, w)
212
- w = transpose_view(w)
213
-
214
- def benchmark():
215
- return stk.ops.sdd(out, w, x)
216
-
217
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
218
- arguments = {
219
- 'sequence_length': sl,
220
- 'hidden_size': hs,
221
- 'ffn_hidden_size': fhs,
222
- 'num_experts': ne,
223
- }
224
- log_benchmark(
225
- '1::GradX::SDD::NT',
226
- arguments,
227
- mean_t,
228
- std_t,
229
- x.nnz * hs * 2,
230
- )
231
-
232
- @parameterized.parameters(*_MATMUL_TESTS)
233
- def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
234
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
235
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
236
- x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
237
- out = stk.ops.dsd(x, w)
238
- x = x.t()
239
-
240
- def benchmark():
241
- return stk.ops.dsd(x, out)
242
-
243
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
244
- arguments = {
245
- 'sequence_length': sl,
246
- 'hidden_size': hs,
247
- 'ffn_hidden_size': fhs,
248
- 'num_experts': ne,
249
- }
250
- log_benchmark(
251
- '1::GradW::DSD::TN',
252
- arguments,
253
- mean_t,
254
- std_t,
255
- x.nnz * hs * 2,
256
- )
257
-
258
- @parameterized.parameters(*_MATMUL_TESTS)
259
- def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
260
- assert (sl % ne) == 0
261
- x = torch.randn((ne, sl // ne, hs)).cuda().half()
262
- w = torch.randn((ne, hs, fhs)).cuda().half()
263
-
264
- w = w.transpose(1, 2).contiguous()
265
- w = w.transpose(1, 2)
266
-
267
- def benchmark():
268
- return torch.bmm(x, w)
269
-
270
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
271
- arguments = {
272
- 'sequence_length': sl,
273
- 'hidden_size': hs,
274
- 'ffn_hidden_size': fhs,
275
- 'num_experts': ne,
276
- }
277
- log_benchmark(
278
- '0::Fwd:DDD::NT',
279
- arguments,
280
- mean_t,
281
- std_t,
282
- x.numel() * fhs * 2,
283
- )
284
-
285
- @parameterized.parameters(*_MATMUL_TESTS)
286
- def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
287
- assert (sl % ne) == 0
288
- x = torch.randn((ne, sl // ne, hs)).cuda().half()
289
- w = torch.randn((ne, hs, fhs)).cuda().half()
290
- out = torch.bmm(x, w)
291
- w = w.transpose(1, 2).contiguous()
292
-
293
- def benchmark():
294
- return torch.bmm(out, w)
295
-
296
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
297
- arguments = {
298
- 'sequence_length': sl,
299
- 'hidden_size': hs,
300
- 'ffn_hidden_size': fhs,
301
- 'num_experts': ne,
302
- }
303
- log_benchmark(
304
- '0:GradX:DDD::NN',
305
- arguments,
306
- mean_t,
307
- std_t,
308
- x.numel() * fhs * 2,
309
- )
310
-
311
- @parameterized.parameters(*_MATMUL_TESTS)
312
- def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
313
- assert (sl % ne) == 0
314
- x = torch.randn((ne, sl // ne, hs)).cuda().half()
315
- w = torch.randn((ne, hs, fhs)).cuda().half()
316
- out = torch.bmm(x, w)
317
- out = out.transpose(1, 2)
318
-
319
- def benchmark():
320
- return torch.bmm(out, x)
321
-
322
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
323
- arguments = {
324
- 'sequence_length': sl,
325
- 'hidden_size': hs,
326
- 'ffn_hidden_size': fhs,
327
- 'num_experts': ne,
328
- }
329
- log_benchmark(
330
- '0:GradW:DDD::TN',
331
- arguments,
332
- mean_t,
333
- std_t,
334
- x.numel() * fhs * 2,
335
- )
336
-
337
- @parameterized.parameters(*_MATMUL_TESTS)
338
- def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
339
- assert (sl % ne) == 0
340
- x = torch.randn((ne, sl // ne, fhs)).cuda().half()
341
- w = torch.randn((ne, fhs, hs)).cuda().half()
342
-
343
- def benchmark():
344
- return torch.bmm(x, w)
345
-
346
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
347
- arguments = {
348
- 'sequence_length': sl,
349
- 'hidden_size': hs,
350
- 'ffn_hidden_size': fhs,
351
- 'num_experts': ne,
352
- }
353
- log_benchmark(
354
- '1::Fwd::DDD::NN',
355
- arguments,
356
- mean_t,
357
- std_t,
358
- x.numel() * hs * 2,
359
- )
360
-
361
- @parameterized.parameters(*_MATMUL_TESTS)
362
- def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
363
- assert (sl % ne) == 0
364
- x = torch.randn((ne, sl // ne, fhs)).cuda().half()
365
- w = torch.randn((ne, fhs, hs)).cuda().half()
366
- out = torch.bmm(x, w)
367
- w = torch.transpose(w, 1, 2)
368
-
369
- def benchmark():
370
- return torch.bmm(out, w)
371
-
372
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
373
- arguments = {
374
- 'sequence_length': sl,
375
- 'hidden_size': hs,
376
- 'ffn_hidden_size': fhs,
377
- 'num_experts': ne,
378
- }
379
- log_benchmark(
380
- '1::GradX::DDD::NT',
381
- arguments,
382
- mean_t,
383
- std_t,
384
- x.numel() * hs * 2,
385
- )
386
-
387
- @parameterized.parameters(*_MATMUL_TESTS)
388
- def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
389
- assert (sl % ne) == 0
390
- x = torch.randn((ne, sl // ne, fhs)).cuda().half()
391
- w = torch.randn((ne, fhs, hs)).cuda().half()
392
- out = torch.bmm(x, w)
393
- x = torch.transpose(x, 1, 2)
394
-
395
- def benchmark():
396
- return torch.bmm(x, out)
397
-
398
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
399
- arguments = {
400
- 'sequence_length': sl,
401
- 'hidden_size': hs,
402
- 'ffn_hidden_size': fhs,
403
- 'num_experts': ne,
404
- }
405
- log_benchmark(
406
- '1::GradW::DDD::TN',
407
- arguments,
408
- mean_t,
409
- std_t,
410
- x.numel() * hs * 2,
411
- )
412
 
413
 
414
  if __name__ == '__main__':
 
17
  from .. import stk
18
 
19
  import torch
20
+ # from absl.testing import parameterized
21
 
22
  from .. import benchmark_util, ops
23
 
 
48
  print('=' * 60)
49
 
50
 
51
+ # class MatmulBenchmark(parameterized.TestCase):
52
+ #
53
+ # def build_sparse_matrix(self, x, padded_bins, fhs, ne):
54
+ # blocking = 128
55
+ # padded_tokens, _ = x.size()
56
+ # assert padded_tokens % blocking == 0
57
+ # assert fhs % blocking == 0
58
+ #
59
+ # # Offsets for the sparse matrix. All rows have the
60
+ # # same number of nonzero blocks dictated by the
61
+ # # dimensionality of a single expert.
62
+ # block_rows = padded_tokens // blocking
63
+ # blocks_per_row = fhs // blocking
64
+ # offsets = torch.arange(
65
+ # 0,
66
+ # block_rows * blocks_per_row + 1,
67
+ # blocks_per_row,
68
+ # dtype=torch.int32,
69
+ # device=x.device,
70
+ # )
71
+ #
72
+ # # Indices for the sparse matrix. The indices for
73
+ # # the intermediate matrix are dynamic depending
74
+ # # on the mapping of tokens to experts.
75
+ # column_indices = ops.topology(
76
+ # padded_bins,
77
+ # blocking,
78
+ # block_rows,
79
+ # blocks_per_row,
80
+ # )
81
+ # data = torch.empty(
82
+ # column_indices.numel(),
83
+ # blocking,
84
+ # blocking,
85
+ # dtype=torch.float16,
86
+ # device=x.device,
87
+ # )
88
+ # shape = (padded_tokens, fhs * ne)
89
+ # row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
90
+ # return stk.Matrix(shape, data, row_indices, column_indices, offsets)
91
+ #
92
+ # def build_input_matrix(self, sl, hs, ne):
93
+ # x = torch.randn((sl, hs)).cuda().half()
94
+ #
95
+ # # Assign tokens to experts uniformly.
96
+ # top_expert = torch.arange(0, sl).cuda().int() % ne
97
+ #
98
+ # bin_ids, indices = ops.sort(top_expert)
99
+ # tokens_per_expert = ops.histogram(top_expert, ne)
100
+ # padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
101
+ # padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
102
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
103
+ # out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
104
+ # return out, padded_bins
105
+ #
106
+ # def build_weight_matrix(self, ne, hs, fhs):
107
+ # return torch.randn((hs, ne * fhs)).cuda().half()
108
+ #
109
+ # @parameterized.parameters(*_MATMUL_TESTS)
110
+ # def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
111
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
112
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
113
+ # topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
114
+ # w = transpose_view(w)
115
+ #
116
+ # def benchmark():
117
+ # return stk.ops.sdd(x, w, topo)
118
+ #
119
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
+ # arguments = {
121
+ # 'sequence_length': sl,
122
+ # 'hidden_size': hs,
123
+ # 'ffn_hidden_size': fhs,
124
+ # 'num_experts': ne,
125
+ # }
126
+ # log_benchmark(
127
+ # '0::Fwd::SDD::NT',
128
+ # arguments,
129
+ # mean_t,
130
+ # std_t,
131
+ # x.numel() * fhs * 2,
132
+ # )
133
+ #
134
+ # @parameterized.parameters(*_MATMUL_TESTS)
135
+ # def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
136
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
137
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
138
+ # topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
139
+ #
140
+ # def benchmark():
141
+ # return stk.ops.dsd(topo, w)
142
+ #
143
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
144
+ # arguments = {
145
+ # 'sequence_length': sl,
146
+ # 'hidden_size': hs,
147
+ # 'ffn_hidden_size': fhs,
148
+ # 'num_experts': ne,
149
+ # }
150
+ # log_benchmark(
151
+ # '0::GradX::DSD::NN',
152
+ # arguments,
153
+ # mean_t,
154
+ # std_t,
155
+ # x.numel() * fhs * 2,
156
+ # )
157
+ #
158
+ # @parameterized.parameters(*_MATMUL_TESTS)
159
+ # def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
160
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
161
+ # topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
162
+ # topo = topo.t()
163
+ #
164
+ # def benchmark():
165
+ # return stk.ops.dsd(topo, x)
166
+ #
167
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
168
+ # arguments = {
169
+ # 'sequence_length': sl,
170
+ # 'hidden_size': hs,
171
+ # 'ffn_hidden_size': fhs,
172
+ # 'num_experts': ne,
173
+ # }
174
+ # log_benchmark(
175
+ # '0::GradW::DSD::TN',
176
+ # arguments,
177
+ # mean_t,
178
+ # std_t,
179
+ # x.numel() * fhs * 2,
180
+ # )
181
+ #
182
+ # @parameterized.parameters(*_MATMUL_TESTS)
183
+ # def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
184
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
185
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
186
+ # x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
187
+ #
188
+ # def benchmark():
189
+ # return stk.ops.dsd(x, w)
190
+ #
191
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
192
+ # arguments = {
193
+ # 'sequence_length': sl,
194
+ # 'hidden_size': hs,
195
+ # 'ffn_hidden_size': fhs,
196
+ # 'num_experts': ne,
197
+ # }
198
+ # log_benchmark(
199
+ # '1::Fwd::DSD::NN',
200
+ # arguments,
201
+ # mean_t,
202
+ # std_t,
203
+ # x.nnz * hs * 2,
204
+ # )
205
+ #
206
+ # @parameterized.parameters(*_MATMUL_TESTS)
207
+ # def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
208
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
209
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
210
+ # x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
211
+ # out = stk.ops.dsd(x, w)
212
+ # w = transpose_view(w)
213
+ #
214
+ # def benchmark():
215
+ # return stk.ops.sdd(out, w, x)
216
+ #
217
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
218
+ # arguments = {
219
+ # 'sequence_length': sl,
220
+ # 'hidden_size': hs,
221
+ # 'ffn_hidden_size': fhs,
222
+ # 'num_experts': ne,
223
+ # }
224
+ # log_benchmark(
225
+ # '1::GradX::SDD::NT',
226
+ # arguments,
227
+ # mean_t,
228
+ # std_t,
229
+ # x.nnz * hs * 2,
230
+ # )
231
+ #
232
+ # @parameterized.parameters(*_MATMUL_TESTS)
233
+ # def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
234
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
235
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
236
+ # x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
237
+ # out = stk.ops.dsd(x, w)
238
+ # x = x.t()
239
+ #
240
+ # def benchmark():
241
+ # return stk.ops.dsd(x, out)
242
+ #
243
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
244
+ # arguments = {
245
+ # 'sequence_length': sl,
246
+ # 'hidden_size': hs,
247
+ # 'ffn_hidden_size': fhs,
248
+ # 'num_experts': ne,
249
+ # }
250
+ # log_benchmark(
251
+ # '1::GradW::DSD::TN',
252
+ # arguments,
253
+ # mean_t,
254
+ # std_t,
255
+ # x.nnz * hs * 2,
256
+ # )
257
+ #
258
+ # @parameterized.parameters(*_MATMUL_TESTS)
259
+ # def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
260
+ # assert (sl % ne) == 0
261
+ # x = torch.randn((ne, sl // ne, hs)).cuda().half()
262
+ # w = torch.randn((ne, hs, fhs)).cuda().half()
263
+ #
264
+ # w = w.transpose(1, 2).contiguous()
265
+ # w = w.transpose(1, 2)
266
+ #
267
+ # def benchmark():
268
+ # return torch.bmm(x, w)
269
+ #
270
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
271
+ # arguments = {
272
+ # 'sequence_length': sl,
273
+ # 'hidden_size': hs,
274
+ # 'ffn_hidden_size': fhs,
275
+ # 'num_experts': ne,
276
+ # }
277
+ # log_benchmark(
278
+ # '0::Fwd:DDD::NT',
279
+ # arguments,
280
+ # mean_t,
281
+ # std_t,
282
+ # x.numel() * fhs * 2,
283
+ # )
284
+ #
285
+ # @parameterized.parameters(*_MATMUL_TESTS)
286
+ # def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
287
+ # assert (sl % ne) == 0
288
+ # x = torch.randn((ne, sl // ne, hs)).cuda().half()
289
+ # w = torch.randn((ne, hs, fhs)).cuda().half()
290
+ # out = torch.bmm(x, w)
291
+ # w = w.transpose(1, 2).contiguous()
292
+ #
293
+ # def benchmark():
294
+ # return torch.bmm(out, w)
295
+ #
296
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
297
+ # arguments = {
298
+ # 'sequence_length': sl,
299
+ # 'hidden_size': hs,
300
+ # 'ffn_hidden_size': fhs,
301
+ # 'num_experts': ne,
302
+ # }
303
+ # log_benchmark(
304
+ # '0:GradX:DDD::NN',
305
+ # arguments,
306
+ # mean_t,
307
+ # std_t,
308
+ # x.numel() * fhs * 2,
309
+ # )
310
+ #
311
+ # @parameterized.parameters(*_MATMUL_TESTS)
312
+ # def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
313
+ # assert (sl % ne) == 0
314
+ # x = torch.randn((ne, sl // ne, hs)).cuda().half()
315
+ # w = torch.randn((ne, hs, fhs)).cuda().half()
316
+ # out = torch.bmm(x, w)
317
+ # out = out.transpose(1, 2)
318
+ #
319
+ # def benchmark():
320
+ # return torch.bmm(out, x)
321
+ #
322
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
323
+ # arguments = {
324
+ # 'sequence_length': sl,
325
+ # 'hidden_size': hs,
326
+ # 'ffn_hidden_size': fhs,
327
+ # 'num_experts': ne,
328
+ # }
329
+ # log_benchmark(
330
+ # '0:GradW:DDD::TN',
331
+ # arguments,
332
+ # mean_t,
333
+ # std_t,
334
+ # x.numel() * fhs * 2,
335
+ # )
336
+ #
337
+ # @parameterized.parameters(*_MATMUL_TESTS)
338
+ # def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
339
+ # assert (sl % ne) == 0
340
+ # x = torch.randn((ne, sl // ne, fhs)).cuda().half()
341
+ # w = torch.randn((ne, fhs, hs)).cuda().half()
342
+ #
343
+ # def benchmark():
344
+ # return torch.bmm(x, w)
345
+ #
346
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
347
+ # arguments = {
348
+ # 'sequence_length': sl,
349
+ # 'hidden_size': hs,
350
+ # 'ffn_hidden_size': fhs,
351
+ # 'num_experts': ne,
352
+ # }
353
+ # log_benchmark(
354
+ # '1::Fwd::DDD::NN',
355
+ # arguments,
356
+ # mean_t,
357
+ # std_t,
358
+ # x.numel() * hs * 2,
359
+ # )
360
+ #
361
+ # @parameterized.parameters(*_MATMUL_TESTS)
362
+ # def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
363
+ # assert (sl % ne) == 0
364
+ # x = torch.randn((ne, sl // ne, fhs)).cuda().half()
365
+ # w = torch.randn((ne, fhs, hs)).cuda().half()
366
+ # out = torch.bmm(x, w)
367
+ # w = torch.transpose(w, 1, 2)
368
+ #
369
+ # def benchmark():
370
+ # return torch.bmm(out, w)
371
+ #
372
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
373
+ # arguments = {
374
+ # 'sequence_length': sl,
375
+ # 'hidden_size': hs,
376
+ # 'ffn_hidden_size': fhs,
377
+ # 'num_experts': ne,
378
+ # }
379
+ # log_benchmark(
380
+ # '1::GradX::DDD::NT',
381
+ # arguments,
382
+ # mean_t,
383
+ # std_t,
384
+ # x.numel() * hs * 2,
385
+ # )
386
+ #
387
+ # @parameterized.parameters(*_MATMUL_TESTS)
388
+ # def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
389
+ # assert (sl % ne) == 0
390
+ # x = torch.randn((ne, sl // ne, fhs)).cuda().half()
391
+ # w = torch.randn((ne, fhs, hs)).cuda().half()
392
+ # out = torch.bmm(x, w)
393
+ # x = torch.transpose(x, 1, 2)
394
+ #
395
+ # def benchmark():
396
+ # return torch.bmm(x, out)
397
+ #
398
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
399
+ # arguments = {
400
+ # 'sequence_length': sl,
401
+ # 'hidden_size': hs,
402
+ # 'ffn_hidden_size': fhs,
403
+ # 'num_experts': ne,
404
+ # }
405
+ # log_benchmark(
406
+ # '1::GradW::DDD::TN',
407
+ # arguments,
408
+ # mean_t,
409
+ # std_t,
410
+ # x.numel() * hs * 2,
411
+ # )
412
 
413
 
414
  if __name__ == '__main__':
build/torch210-cxx11-cpu-x86_64-linux/ops/padded_scatter_benchmark.py CHANGED
@@ -4,7 +4,7 @@
4
  import unittest
5
 
6
  import torch
7
- from absl.testing import parameterized
8
 
9
  from .. import benchmark_util, ops
10
 
@@ -16,50 +16,50 @@ _PADDED_SCATTER_BENCHMARK = (
16
  )
17
 
18
 
19
- class PaddedScatterTest(parameterized.TestCase):
20
-
21
- @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
22
- def testPaddedScatter(self, sl, hs, ne, top_k):
23
- # Create the data and indices.
24
- x = torch.randn((sl, hs)).cuda().half()
25
-
26
- # Randomly assign tokens to experts.
27
- top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
28
- bin_ids, indices = ops.sort(top_expert)
29
- tokens_per_expert = ops.histogram(top_expert, ne)
30
- padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
31
- padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
32
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
33
-
34
- # Sample weights for the scatter reduce.
35
- weights = torch.rand((sl * top_k,)).cuda().half()
36
-
37
- # Gather the data to prepare for backwards.
38
- x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
39
-
40
- def benchmark():
41
- return ops.padded_scatter(
42
- x,
43
- indices,
44
- bin_ids,
45
- weights,
46
- bins,
47
- padded_bins,
48
- top_k,
49
- )
50
-
51
- time, std = benchmark_util.benchmark_function(benchmark)
52
- benchmark_util.log_benchmark(
53
- 'Padded Scatter',
54
- {
55
- 'sequence_length': sl,
56
- 'hidden_size': hs,
57
- 'num_experts': ne,
58
- 'top_k': top_k,
59
- },
60
- time,
61
- std,
62
- )
63
 
64
 
65
  if __name__ == '__main__':
 
4
  import unittest
5
 
6
  import torch
7
+ # from absl.testing import parameterized
8
 
9
  from .. import benchmark_util, ops
10
 
 
16
  )
17
 
18
 
19
+ # class PaddedScatterTest(parameterized.TestCase):
20
+ #
21
+ # @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
22
+ # def testPaddedScatter(self, sl, hs, ne, top_k):
23
+ # # Create the data and indices.
24
+ # x = torch.randn((sl, hs)).cuda().half()
25
+ #
26
+ # # Randomly assign tokens to experts.
27
+ # top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
28
+ # bin_ids, indices = ops.sort(top_expert)
29
+ # tokens_per_expert = ops.histogram(top_expert, ne)
30
+ # padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
31
+ # padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
32
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
33
+ #
34
+ # # Sample weights for the scatter reduce.
35
+ # weights = torch.rand((sl * top_k,)).cuda().half()
36
+ #
37
+ # # Gather the data to prepare for backwards.
38
+ # x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
39
+ #
40
+ # def benchmark():
41
+ # return ops.padded_scatter(
42
+ # x,
43
+ # indices,
44
+ # bin_ids,
45
+ # weights,
46
+ # bins,
47
+ # padded_bins,
48
+ # top_k,
49
+ # )
50
+ #
51
+ # time, std = benchmark_util.benchmark_function(benchmark)
52
+ # benchmark_util.log_benchmark(
53
+ # 'Padded Scatter',
54
+ # {
55
+ # 'sequence_length': sl,
56
+ # 'hidden_size': hs,
57
+ # 'num_experts': ne,
58
+ # 'top_k': top_k,
59
+ # },
60
+ # time,
61
+ # std,
62
+ # )
63
 
64
 
65
  if __name__ == '__main__':
build/torch210-cxx11-cpu-x86_64-linux/ops/permute_benchmark.py CHANGED
@@ -4,7 +4,7 @@
4
  import unittest
5
 
6
  import torch
7
- from absl.testing import parameterized
8
 
9
  from .. import benchmark_util, ops
10
 
@@ -26,123 +26,123 @@ _PERMUTE_TESTS = (
26
  )
27
 
28
 
29
- class PermuteBenchmark(parameterized.TestCase):
30
-
31
- @parameterized.parameters(*_PERMUTE_TESTS)
32
- def testBinnedGather(self, sl, hs, ne):
33
- # NOTE: Capacity factor == 1.
34
- ec = sl // ne
35
-
36
- # Create the data and indices.
37
- x = torch.randn((sl, hs)).cuda().half()
38
- top_expert = torch.randint(0, ne, (sl,)).cuda().int()
39
- bin_ids, indices = ops.sort(top_expert)
40
- tokens_per_expert = ops.histogram(indices, ne)
41
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
42
-
43
- def benchmark():
44
- return ops.binned_gather(x, indices, bins, ec)
45
-
46
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
47
- arguments = {
48
- 'sequence_length': sl,
49
- 'hidden_size': hs,
50
- 'num_experts': ne,
51
- }
52
- benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
53
-
54
- @parameterized.parameters(*_PERMUTE_TESTS)
55
- def testBinnedScatter(self, sl, hs, ne):
56
- # NOTE: Capacity factor == 1.
57
- ec = sl // ne
58
-
59
- # Create the data and indices.
60
- x = torch.randn((sl, hs)).cuda().half()
61
- top_expert = torch.randint(0, ne, (sl,)).cuda().int()
62
- bin_ids, indices = ops.sort(top_expert)
63
- tokens_per_expert = ops.histogram(indices, ne)
64
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
65
- x = ops.binned_gather(x, indices, bins, ec)
66
-
67
- def benchmark():
68
- return ops.binned_scatter(x, indices, bins)
69
-
70
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
71
- arguments = {
72
- 'sequence_length': sl,
73
- 'hidden_size': hs,
74
- 'num_experts': ne,
75
- }
76
- benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
77
-
78
- @parameterized.parameters(*_PERMUTE_TESTS)
79
- def testPaddedGather(self, sl, hs, ne):
80
- # Create the data and indices.
81
- x = torch.randn((sl, hs)).cuda().half()
82
-
83
- # Randomly assign tokens to experts.
84
- top_expert = torch.randint(0, ne, (sl,)).cuda().int()
85
- bin_ids, indices = ops.sort(top_expert)
86
- tokens_per_expert = ops.histogram(top_expert, ne)
87
- padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
88
- padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
89
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
90
-
91
- def benchmark():
92
- return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
93
-
94
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
95
- arguments = {
96
- 'sequence_length': sl,
97
- 'hidden_size': hs,
98
- 'num_experts': ne,
99
- }
100
- benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
101
-
102
- @parameterized.parameters(*_PERMUTE_TESTS)
103
- def testPaddedScatter(self, sl, hs, ne):
104
- # Create the data and indices.
105
- x = torch.randn((sl, hs)).cuda().half()
106
-
107
- # Randomly assign tokens to experts.
108
- top_expert = torch.randint(0, ne, (sl,)).cuda().int()
109
- bin_ids, indices = ops.sort(top_expert)
110
- tokens_per_expert = ops.histogram(top_expert, ne)
111
- padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
112
- padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
113
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
114
- x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
115
-
116
- def benchmark():
117
- return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
118
-
119
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
- arguments = {
121
- 'sequence_length': sl,
122
- 'hidden_size': hs,
123
- 'num_experts': ne,
124
- }
125
- benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
126
-
127
- @parameterized.parameters(*_PERMUTE_TESTS)
128
- def testCopy(self, sl, hs, ne):
129
- # NOTE: Capacity factor == 1.
130
- # ec = sl // ne
131
-
132
- # Create the data and indices.
133
- x = torch.randn((sl, hs)).cuda().half()
134
- y = x.clone()
135
-
136
- def benchmark():
137
- return y.copy_(x)
138
-
139
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
140
- arguments = {
141
- 'sequence_length': sl,
142
- 'hidden_size': hs,
143
- 'num_experts': ne,
144
- }
145
- benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
146
 
147
 
148
  if __name__ == '__main__':
 
4
  import unittest
5
 
6
  import torch
7
+ # from absl.testing import parameterized
8
 
9
  from .. import benchmark_util, ops
10
 
 
26
  )
27
 
28
 
29
+ # class PermuteBenchmark(parameterized.TestCase):
30
+ #
31
+ # @parameterized.parameters(*_PERMUTE_TESTS)
32
+ # def testBinnedGather(self, sl, hs, ne):
33
+ # # NOTE: Capacity factor == 1.
34
+ # ec = sl // ne
35
+ #
36
+ # # Create the data and indices.
37
+ # x = torch.randn((sl, hs)).cuda().half()
38
+ # top_expert = torch.randint(0, ne, (sl,)).cuda().int()
39
+ # bin_ids, indices = ops.sort(top_expert)
40
+ # tokens_per_expert = ops.histogram(indices, ne)
41
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
42
+ #
43
+ # def benchmark():
44
+ # return ops.binned_gather(x, indices, bins, ec)
45
+ #
46
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
47
+ # arguments = {
48
+ # 'sequence_length': sl,
49
+ # 'hidden_size': hs,
50
+ # 'num_experts': ne,
51
+ # }
52
+ # benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
53
+ #
54
+ # @parameterized.parameters(*_PERMUTE_TESTS)
55
+ # def testBinnedScatter(self, sl, hs, ne):
56
+ # # NOTE: Capacity factor == 1.
57
+ # ec = sl // ne
58
+ #
59
+ # # Create the data and indices.
60
+ # x = torch.randn((sl, hs)).cuda().half()
61
+ # top_expert = torch.randint(0, ne, (sl,)).cuda().int()
62
+ # bin_ids, indices = ops.sort(top_expert)
63
+ # tokens_per_expert = ops.histogram(indices, ne)
64
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
65
+ # x = ops.binned_gather(x, indices, bins, ec)
66
+ #
67
+ # def benchmark():
68
+ # return ops.binned_scatter(x, indices, bins)
69
+ #
70
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
71
+ # arguments = {
72
+ # 'sequence_length': sl,
73
+ # 'hidden_size': hs,
74
+ # 'num_experts': ne,
75
+ # }
76
+ # benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
77
+ #
78
+ # @parameterized.parameters(*_PERMUTE_TESTS)
79
+ # def testPaddedGather(self, sl, hs, ne):
80
+ # # Create the data and indices.
81
+ # x = torch.randn((sl, hs)).cuda().half()
82
+ #
83
+ # # Randomly assign tokens to experts.
84
+ # top_expert = torch.randint(0, ne, (sl,)).cuda().int()
85
+ # bin_ids, indices = ops.sort(top_expert)
86
+ # tokens_per_expert = ops.histogram(top_expert, ne)
87
+ # padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
88
+ # padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
89
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
90
+ #
91
+ # def benchmark():
92
+ # return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
93
+ #
94
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
95
+ # arguments = {
96
+ # 'sequence_length': sl,
97
+ # 'hidden_size': hs,
98
+ # 'num_experts': ne,
99
+ # }
100
+ # benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
101
+ #
102
+ # @parameterized.parameters(*_PERMUTE_TESTS)
103
+ # def testPaddedScatter(self, sl, hs, ne):
104
+ # # Create the data and indices.
105
+ # x = torch.randn((sl, hs)).cuda().half()
106
+ #
107
+ # # Randomly assign tokens to experts.
108
+ # top_expert = torch.randint(0, ne, (sl,)).cuda().int()
109
+ # bin_ids, indices = ops.sort(top_expert)
110
+ # tokens_per_expert = ops.histogram(top_expert, ne)
111
+ # padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
112
+ # padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
113
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
114
+ # x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
115
+ #
116
+ # def benchmark():
117
+ # return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
118
+ #
119
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
+ # arguments = {
121
+ # 'sequence_length': sl,
122
+ # 'hidden_size': hs,
123
+ # 'num_experts': ne,
124
+ # }
125
+ # benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
126
+ #
127
+ # @parameterized.parameters(*_PERMUTE_TESTS)
128
+ # def testCopy(self, sl, hs, ne):
129
+ # # NOTE: Capacity factor == 1.
130
+ # # ec = sl // ne
131
+ #
132
+ # # Create the data and indices.
133
+ # x = torch.randn((sl, hs)).cuda().half()
134
+ # y = x.clone()
135
+ #
136
+ # def benchmark():
137
+ # return y.copy_(x)
138
+ #
139
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
140
+ # arguments = {
141
+ # 'sequence_length': sl,
142
+ # 'hidden_size': hs,
143
+ # 'num_experts': ne,
144
+ # }
145
+ # benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
146
 
147
 
148
  if __name__ == '__main__':
build/torch210-cxx11-cpu-x86_64-linux/ops/sort_benchmark.py CHANGED
@@ -5,7 +5,7 @@ import unittest
5
 
6
  import numpy as np
7
  import torch
8
- from absl.testing import parameterized
9
 
10
  from .. import ops
11
 
@@ -53,32 +53,32 @@ def log_benchmark(arguments, mean_t, std_t):
53
  print('=' * 60)
54
 
55
 
56
- class SortBenchmark(parameterized.TestCase):
57
-
58
- @parameterized.parameters(*_SORT_TESTS)
59
- def testSort(self, n, dtype, max_val):
60
- if max_val is None:
61
- max_val = np.iinfo(numpy_dtype(dtype)).max
62
- end_bit = int(np.ceil(np.log2(max_val)))
63
- x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
64
-
65
- mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
66
- arguments = {
67
- 'n': n,
68
- 'dtype': dtype,
69
- 'max_val': max_val,
70
- }
71
- log_benchmark(arguments, mean_t, std_t)
72
-
73
- @parameterized.parameters(*_BASELINE_SORT_TESTS)
74
- def testTorchSort(self, n):
75
- x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
76
-
77
- mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
78
- arguments = {
79
- 'n': n,
80
- }
81
- log_benchmark(arguments, mean_t, std_t)
82
 
83
 
84
  if __name__ == '__main__':
 
5
 
6
  import numpy as np
7
  import torch
8
+ # from absl.testing import parameterized
9
 
10
  from .. import ops
11
 
 
53
  print('=' * 60)
54
 
55
 
56
+ # class SortBenchmark(parameterized.TestCase):
57
+ #
58
+ # @parameterized.parameters(*_SORT_TESTS)
59
+ # def testSort(self, n, dtype, max_val):
60
+ # if max_val is None:
61
+ # max_val = np.iinfo(numpy_dtype(dtype)).max
62
+ # end_bit = int(np.ceil(np.log2(max_val)))
63
+ # x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
64
+ #
65
+ # mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
66
+ # arguments = {
67
+ # 'n': n,
68
+ # 'dtype': dtype,
69
+ # 'max_val': max_val,
70
+ # }
71
+ # log_benchmark(arguments, mean_t, std_t)
72
+ #
73
+ # @parameterized.parameters(*_BASELINE_SORT_TESTS)
74
+ # def testTorchSort(self, n):
75
+ # x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
76
+ #
77
+ # mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
78
+ # arguments = {
79
+ # 'n': n,
80
+ # }
81
+ # log_benchmark(arguments, mean_t, std_t)
82
 
83
 
84
  if __name__ == '__main__':
build/torch210-cxx11-cpu-x86_64-linux/stk/ops/eltwise_ops_test.py CHANGED
@@ -1,7 +1,7 @@
1
  import unittest
2
  import itertools
3
  import torch
4
- from absl.testing import parameterized
5
 
6
  import stk
7
  from stk.ops.linear_ops_test import allclose, _dense_and_sparse
@@ -47,40 +47,40 @@ def _dense_and_sparse_like(x, std=0.1):
47
  return (dense.requires_grad_(True),
48
  sparse.requires_grad_(True))
49
 
50
- @parameterized.parameters(_ELTWISE_OP_TESTS)
51
- class EltwiseOpsTest(parameterized.TestCase):
52
-
53
- def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
54
-
55
- a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
56
- b_dense, b = _dense_and_sparse_like(a)
57
-
58
- out = stk.ops.mul(a, b)
59
- expected_out = torch.mul(a_dense, b_dense)
60
-
61
- # Compute the gradients w.r.t. the inputs.
62
- expected_out.sum().backward()
63
- stk.ops.sum(out).backward()
64
-
65
- # Validate the results.
66
- out = stk.ops.to_dense(out)
67
- self.assertEqual(out.dim(), 2)
68
- self.assertEqual(expected_out.size(), out.size())
69
- self.assertTrue(allclose(out, expected_out))
70
-
71
- # LHS gradient.
72
- grad = stk.ops.to_dense(a.grad)
73
- expected_grad = a_dense.grad
74
- self.assertEqual(grad.dim(), 2)
75
- self.assertEqual(expected_grad.size(), grad.size())
76
- self.assertTrue(allclose(grad, expected_grad))
77
-
78
- # RHS gradient.
79
- grad = stk.ops.to_dense(b.grad)
80
- expected_grad = b_dense.grad
81
- self.assertEqual(grad.dim(), 2)
82
- self.assertEqual(expected_grad.size(), grad.size())
83
- self.assertTrue(allclose(grad, expected_grad))
84
 
85
  if __name__ == '__main__':
86
  unittest.main()
 
1
  import unittest
2
  import itertools
3
  import torch
4
+ # from absl.testing import parameterized
5
 
6
  import stk
7
  from stk.ops.linear_ops_test import allclose, _dense_and_sparse
 
47
  return (dense.requires_grad_(True),
48
  sparse.requires_grad_(True))
49
 
50
+ # @parameterized.parameters(_ELTWISE_OP_TESTS)
51
+ # class EltwiseOpsTest(parameterized.TestCase):
52
+ #
53
+ # def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
54
+ #
55
+ # a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
56
+ # b_dense, b = _dense_and_sparse_like(a)
57
+ #
58
+ # out = stk.ops.mul(a, b)
59
+ # expected_out = torch.mul(a_dense, b_dense)
60
+ #
61
+ # # Compute the gradients w.r.t. the inputs.
62
+ # expected_out.sum().backward()
63
+ # stk.ops.sum(out).backward()
64
+ #
65
+ # # Validate the results.
66
+ # out = stk.ops.to_dense(out)
67
+ # self.assertEqual(out.dim(), 2)
68
+ # self.assertEqual(expected_out.size(), out.size())
69
+ # self.assertTrue(allclose(out, expected_out))
70
+ #
71
+ # # LHS gradient.
72
+ # grad = stk.ops.to_dense(a.grad)
73
+ # expected_grad = a_dense.grad
74
+ # self.assertEqual(grad.dim(), 2)
75
+ # self.assertEqual(expected_grad.size(), grad.size())
76
+ # self.assertTrue(allclose(grad, expected_grad))
77
+ #
78
+ # # RHS gradient.
79
+ # grad = stk.ops.to_dense(b.grad)
80
+ # expected_grad = b_dense.grad
81
+ # self.assertEqual(grad.dim(), 2)
82
+ # self.assertEqual(expected_grad.size(), grad.size())
83
+ # self.assertTrue(allclose(grad, expected_grad))
84
 
85
  if __name__ == '__main__':
86
  unittest.main()
build/torch210-cxx11-cpu-x86_64-linux/stk/ops/linear_ops_test.py CHANGED
@@ -2,7 +2,7 @@ import unittest
2
  import itertools
3
  import numpy as np
4
  import torch
5
- from absl.testing import parameterized
6
 
7
  import stk
8
 
@@ -96,121 +96,121 @@ def _mask(x, mask):
96
  return x * mask
97
 
98
 
99
- @parameterized.parameters(*_LINEAR_OP_TESTS)
100
- class LinearOpsTest(parameterized.TestCase):
101
-
102
- def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
103
- # Construct the operands.
104
- a_shape = (k, m) if trans_a else (m, k)
105
- a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
106
- b_shape = (n, k) if trans_b else (k, n)
107
- b, bcp = _dense_2x(*b_shape, dtype)
108
-
109
- # Execute the matmul.
110
- out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
111
- expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
112
-
113
- # Compute the gradients w.r.t. the inputs.
114
- expected_out.sum().backward()
115
- out.sum().backward()
116
-
117
- # Validate the results.
118
- self.assertEqual(out.dim(), 2)
119
- self.assertEqual(expected_out.size()[0], out.size()[0])
120
- self.assertEqual(expected_out.size()[1], out.size()[1])
121
- self.assertTrue(allclose(out, expected_out))
122
-
123
- # LHS gradient.
124
- grad = stk.ops.to_dense(a.grad)
125
- expected_grad = _mask(a_dense.grad, a.grad)
126
- self.assertEqual(grad.dim(), 2)
127
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
128
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
129
- self.assertTrue(allclose(grad, expected_grad))
130
-
131
- # RHS gradient.
132
- grad = b.grad
133
- expected_grad = bcp.grad
134
- self.assertEqual(grad.dim(), 2)
135
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
136
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
137
- self.assertTrue(allclose(grad, expected_grad))
138
-
139
- def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
140
- # Construct the operands.
141
- a_shape = (k, m) if trans_a else (m, k)
142
- a, acp = _dense_2x(*a_shape, dtype)
143
- b_shape = (n, k) if trans_b else (k, n)
144
- b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
145
-
146
- # Execute the matmul.
147
- out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
148
- expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
149
-
150
- # Compute the gradients w.r.t. the inputs.
151
- expected_out.sum().backward()
152
- out.sum().backward()
153
-
154
- # Validate the results.
155
- self.assertEqual(out.dim(), 2)
156
- self.assertEqual(expected_out.size()[0], out.size()[0])
157
- self.assertEqual(expected_out.size()[1], out.size()[1])
158
- self.assertTrue(allclose(out, expected_out))
159
-
160
- # LHS gradient.
161
- grad = a.grad
162
- expected_grad = acp.grad
163
- self.assertEqual(grad.dim(), 2)
164
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
165
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
166
- self.assertTrue(allclose(grad, expected_grad))
167
-
168
- # RHS gradient.
169
- grad = stk.ops.to_dense(b.grad)
170
- expected_grad = _mask(b_dense.grad, b.grad)
171
- self.assertEqual(grad.dim(), 2)
172
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
173
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
174
- self.assertTrue(allclose(grad, expected_grad))
175
-
176
- def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
177
- # Construct the operands.
178
- a_shape = (k, m) if trans_a else (m, k)
179
- a, acp = _dense_2x(*a_shape, dtype)
180
- b_shape = (n, k) if trans_b else (k, n)
181
- b, bcp = _dense_2x(*b_shape, dtype)
182
- _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
183
-
184
- # Execute the matmul.
185
- out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
186
- expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
187
-
188
- # Compute the gradients w.r.t. the inputs.
189
- expected_out.sum().backward()
190
- stk.ops.sum(out).backward()
191
-
192
- # Validate the results.
193
- out = stk.ops.to_dense(out)
194
- self.assertEqual(out.dim(), 2)
195
- self.assertEqual(expected_out.size()[0], out.size()[0])
196
- self.assertEqual(expected_out.size()[1], out.size()[1])
197
- self.assertTrue(allclose(out, expected_out))
198
-
199
- # LHS gradient.
200
- grad = a.grad
201
- expected_grad = acp.grad
202
- self.assertEqual(grad.dim(), 2)
203
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
204
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
205
- self.assertTrue(allclose(grad, expected_grad))
206
-
207
- # RHS gradient.
208
- grad = b.grad
209
- expected_grad = bcp.grad
210
- self.assertEqual(grad.dim(), 2)
211
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
212
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
213
- self.assertTrue(allclose(grad, expected_grad))
214
 
215
  if __name__ == '__main__':
216
  unittest.main()
 
2
  import itertools
3
  import numpy as np
4
  import torch
5
+ # from absl.testing import parameterized
6
 
7
  import stk
8
 
 
96
  return x * mask
97
 
98
 
99
+ # @parameterized.parameters(*_LINEAR_OP_TESTS)
100
+ # class LinearOpsTest(parameterized.TestCase):
101
+ #
102
+ # def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
103
+ # # Construct the operands.
104
+ # a_shape = (k, m) if trans_a else (m, k)
105
+ # a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
106
+ # b_shape = (n, k) if trans_b else (k, n)
107
+ # b, bcp = _dense_2x(*b_shape, dtype)
108
+ #
109
+ # # Execute the matmul.
110
+ # out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
111
+ # expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
112
+ #
113
+ # # Compute the gradients w.r.t. the inputs.
114
+ # expected_out.sum().backward()
115
+ # out.sum().backward()
116
+ #
117
+ # # Validate the results.
118
+ # self.assertEqual(out.dim(), 2)
119
+ # self.assertEqual(expected_out.size()[0], out.size()[0])
120
+ # self.assertEqual(expected_out.size()[1], out.size()[1])
121
+ # self.assertTrue(allclose(out, expected_out))
122
+ #
123
+ # # LHS gradient.
124
+ # grad = stk.ops.to_dense(a.grad)
125
+ # expected_grad = _mask(a_dense.grad, a.grad)
126
+ # self.assertEqual(grad.dim(), 2)
127
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
128
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
129
+ # self.assertTrue(allclose(grad, expected_grad))
130
+ #
131
+ # # RHS gradient.
132
+ # grad = b.grad
133
+ # expected_grad = bcp.grad
134
+ # self.assertEqual(grad.dim(), 2)
135
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
136
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
137
+ # self.assertTrue(allclose(grad, expected_grad))
138
+ #
139
+ # def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
140
+ # # Construct the operands.
141
+ # a_shape = (k, m) if trans_a else (m, k)
142
+ # a, acp = _dense_2x(*a_shape, dtype)
143
+ # b_shape = (n, k) if trans_b else (k, n)
144
+ # b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
145
+ #
146
+ # # Execute the matmul.
147
+ # out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
148
+ # expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
149
+ #
150
+ # # Compute the gradients w.r.t. the inputs.
151
+ # expected_out.sum().backward()
152
+ # out.sum().backward()
153
+ #
154
+ # # Validate the results.
155
+ # self.assertEqual(out.dim(), 2)
156
+ # self.assertEqual(expected_out.size()[0], out.size()[0])
157
+ # self.assertEqual(expected_out.size()[1], out.size()[1])
158
+ # self.assertTrue(allclose(out, expected_out))
159
+ #
160
+ # # LHS gradient.
161
+ # grad = a.grad
162
+ # expected_grad = acp.grad
163
+ # self.assertEqual(grad.dim(), 2)
164
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
165
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
166
+ # self.assertTrue(allclose(grad, expected_grad))
167
+ #
168
+ # # RHS gradient.
169
+ # grad = stk.ops.to_dense(b.grad)
170
+ # expected_grad = _mask(b_dense.grad, b.grad)
171
+ # self.assertEqual(grad.dim(), 2)
172
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
173
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
174
+ # self.assertTrue(allclose(grad, expected_grad))
175
+ #
176
+ # def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
177
+ # # Construct the operands.
178
+ # a_shape = (k, m) if trans_a else (m, k)
179
+ # a, acp = _dense_2x(*a_shape, dtype)
180
+ # b_shape = (n, k) if trans_b else (k, n)
181
+ # b, bcp = _dense_2x(*b_shape, dtype)
182
+ # _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
183
+ #
184
+ # # Execute the matmul.
185
+ # out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
186
+ # expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
187
+ #
188
+ # # Compute the gradients w.r.t. the inputs.
189
+ # expected_out.sum().backward()
190
+ # stk.ops.sum(out).backward()
191
+ #
192
+ # # Validate the results.
193
+ # out = stk.ops.to_dense(out)
194
+ # self.assertEqual(out.dim(), 2)
195
+ # self.assertEqual(expected_out.size()[0], out.size()[0])
196
+ # self.assertEqual(expected_out.size()[1], out.size()[1])
197
+ # self.assertTrue(allclose(out, expected_out))
198
+ #
199
+ # # LHS gradient.
200
+ # grad = a.grad
201
+ # expected_grad = acp.grad
202
+ # self.assertEqual(grad.dim(), 2)
203
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
204
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
205
+ # self.assertTrue(allclose(grad, expected_grad))
206
+ #
207
+ # # RHS gradient.
208
+ # grad = b.grad
209
+ # expected_grad = bcp.grad
210
+ # self.assertEqual(grad.dim(), 2)
211
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
212
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
213
+ # self.assertTrue(allclose(grad, expected_grad))
214
 
215
  if __name__ == '__main__':
216
  unittest.main()
build/torch210-cxx11-cpu-x86_64-linux/stk/ops/matrix_ops_test.py CHANGED
@@ -1,61 +1,61 @@
1
  import unittest
2
 
3
- from absl.testing import parameterized
4
  import stk
5
  import torch
6
 
7
 
8
- @parameterized.parameters(
9
- (8, 16, 0.0, 1),
10
- (8, 16, 0.5, 1),
11
- (8, 16, .95, 1),
12
- (16, 8, 0.0, 1),
13
- (16, 8, 0.5, 1),
14
- (16, 8, .95, 1),
15
- (8, 16, 0.0, 8),
16
- (8, 16, 0.5, 8),
17
- (8, 16, 1.0, 8),
18
- (16, 8, 0.0, 8),
19
- (16, 8, 0.5, 8),
20
- (16, 8, 1.0, 8),
21
- (128, 256, 0.5, 16),
22
- (256, 128, 0.75, 32),
23
- (512, 512, .875, 128))
24
- class MatrixOpsTest(parameterized.TestCase):
25
-
26
- def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
27
- mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
28
- x = (torch.randn(rows, cols) * mask).type(torch.float16)
29
-
30
- # Convert the matrix to sparse format.
31
- sparse_x = stk.ops.to_sparse(x, blocking)
32
-
33
- # Validate the matrix.
34
- sparse_x.validate()
35
-
36
- # Validate the shape.
37
- self.assertEqual(sparse_x.dim(), 2)
38
- self.assertEqual(sparse_x.size()[0], rows)
39
- self.assertEqual(sparse_x.size()[1], cols)
40
-
41
- # Validate the sparsity.
42
- numblocks = rows // blocking * cols // blocking
43
- nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
44
- self.assertEqual(sparse_x.nnz, nnz)
45
-
46
- # Convert back to dense format.
47
- dense_x = stk.ops.to_dense(sparse_x)
48
-
49
- # Validate the shape.
50
- self.assertEqual(dense_x.dim(), 2)
51
- self.assertEqual(dense_x.size()[0], rows)
52
- self.assertEqual(dense_x.size()[1], cols)
53
-
54
- # Validate the sparsity
55
- self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
56
-
57
- # Validate the output.
58
- self.assertTrue(torch.all(torch.eq(x, dense_x)))
59
 
60
 
61
  if __name__ == '__main__':
 
1
  import unittest
2
 
3
+ # from absl.testing import parameterized
4
  import stk
5
  import torch
6
 
7
 
8
+ # @parameterized.parameters(
9
+ # (8, 16, 0.0, 1),
10
+ # (8, 16, 0.5, 1),
11
+ # (8, 16, .95, 1),
12
+ # (16, 8, 0.0, 1),
13
+ # (16, 8, 0.5, 1),
14
+ # (16, 8, .95, 1),
15
+ # (8, 16, 0.0, 8),
16
+ # (8, 16, 0.5, 8),
17
+ # (8, 16, 1.0, 8),
18
+ # (16, 8, 0.0, 8),
19
+ # (16, 8, 0.5, 8),
20
+ # (16, 8, 1.0, 8),
21
+ # (128, 256, 0.5, 16),
22
+ # (256, 128, 0.75, 32),
23
+ # (512, 512, .875, 128))
24
+ # class MatrixOpsTest(parameterized.TestCase):
25
+ #
26
+ # def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
27
+ # mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
28
+ # x = (torch.randn(rows, cols) * mask).type(torch.float16)
29
+ #
30
+ # # Convert the matrix to sparse format.
31
+ # sparse_x = stk.ops.to_sparse(x, blocking)
32
+ #
33
+ # # Validate the matrix.
34
+ # sparse_x.validate()
35
+ #
36
+ # # Validate the shape.
37
+ # self.assertEqual(sparse_x.dim(), 2)
38
+ # self.assertEqual(sparse_x.size()[0], rows)
39
+ # self.assertEqual(sparse_x.size()[1], cols)
40
+ #
41
+ # # Validate the sparsity.
42
+ # numblocks = rows // blocking * cols // blocking
43
+ # nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
44
+ # self.assertEqual(sparse_x.nnz, nnz)
45
+ #
46
+ # # Convert back to dense format.
47
+ # dense_x = stk.ops.to_dense(sparse_x)
48
+ #
49
+ # # Validate the shape.
50
+ # self.assertEqual(dense_x.dim(), 2)
51
+ # self.assertEqual(dense_x.size()[0], rows)
52
+ # self.assertEqual(dense_x.size()[1], cols)
53
+ #
54
+ # # Validate the sparsity
55
+ # self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
56
+ #
57
+ # # Validate the output.
58
+ # self.assertTrue(torch.all(torch.eq(x, dense_x)))
59
 
60
 
61
  if __name__ == '__main__':
build/torch210-cxx11-cpu-x86_64-linux/stk/random/random_ops_test.py CHANGED
@@ -1,72 +1,72 @@
1
  import unittest
2
 
3
- from absl.testing import parameterized
4
  from . import random
5
  import torch
6
 
7
 
8
- @parameterized.parameters(
9
- (8, 16, 0.0, 1),
10
- (8, 16, 0.5, 1),
11
- (8, 16, .95, 1),
12
- (16, 8, 0.0, 1),
13
- (16, 8, 0.5, 1),
14
- (16, 8, .95, 1),
15
- (8, 16, 0.0, 8),
16
- (8, 16, 0.5, 8),
17
- (8, 16, 1.0, 8),
18
- (16, 8, 0.0, 8),
19
- (16, 8, 0.5, 8),
20
- (16, 8, 1.0, 8),
21
- (128, 256, 0.5, 16),
22
- (256, 128, 0.75, 32),
23
- (512, 512, .875, 128))
24
- class RandomOpsTest(parameterized.TestCase):
25
-
26
- def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
27
- mask = random.dense_mask(
28
- rows, cols, sparsity, blocking)
29
-
30
- # Validate the shape.
31
- self.assertEqual(mask.dim(), 2)
32
- self.assertEqual(mask.size()[0], rows)
33
- self.assertEqual(mask.size()[1], cols)
34
-
35
- # Validate the sparsity
36
- numblocks = rows // blocking * cols // blocking
37
- nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
38
- self.assertEqual(
39
- torch.count_nonzero(mask).item(),
40
- nnz)
41
-
42
- # Check values are zero or one.
43
- self.assertTrue(
44
- torch.all(torch.logical_or(
45
- torch.eq(mask, 0),
46
- torch.eq(mask, 1))))
47
-
48
- def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
49
- mask = random.mask(
50
- rows, cols, sparsity, blocking)
51
-
52
- # Validate the matrix.
53
- mask.validate()
54
-
55
- # Validate the shape.
56
- self.assertEqual(mask.dim(), 2)
57
- self.assertEqual(mask.size()[0], rows)
58
- self.assertEqual(mask.size()[1], cols)
59
-
60
- # Validate the sparsity.
61
- numblocks = rows // blocking * cols // blocking
62
- nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
63
- self.assertEqual(mask.nnz, nnz)
64
-
65
- # Check values are zero or one.
66
- self.assertTrue(
67
- torch.all(torch.logical_or(
68
- torch.eq(mask.data, 0),
69
- torch.eq(mask.data, 1))))
70
 
71
 
72
  if __name__ == '__main__':
 
1
  import unittest
2
 
3
+ # from absl.testing import parameterized
4
  from . import random
5
  import torch
6
 
7
 
8
+ # @parameterized.parameters(
9
+ # (8, 16, 0.0, 1),
10
+ # (8, 16, 0.5, 1),
11
+ # (8, 16, .95, 1),
12
+ # (16, 8, 0.0, 1),
13
+ # (16, 8, 0.5, 1),
14
+ # (16, 8, .95, 1),
15
+ # (8, 16, 0.0, 8),
16
+ # (8, 16, 0.5, 8),
17
+ # (8, 16, 1.0, 8),
18
+ # (16, 8, 0.0, 8),
19
+ # (16, 8, 0.5, 8),
20
+ # (16, 8, 1.0, 8),
21
+ # (128, 256, 0.5, 16),
22
+ # (256, 128, 0.75, 32),
23
+ # (512, 512, .875, 128))
24
+ # class RandomOpsTest(parameterized.TestCase):
25
+ #
26
+ # def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
27
+ # mask = random.dense_mask(
28
+ # rows, cols, sparsity, blocking)
29
+ #
30
+ # # Validate the shape.
31
+ # self.assertEqual(mask.dim(), 2)
32
+ # self.assertEqual(mask.size()[0], rows)
33
+ # self.assertEqual(mask.size()[1], cols)
34
+ #
35
+ # # Validate the sparsity
36
+ # numblocks = rows // blocking * cols // blocking
37
+ # nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
38
+ # self.assertEqual(
39
+ # torch.count_nonzero(mask).item(),
40
+ # nnz)
41
+ #
42
+ # # Check values are zero or one.
43
+ # self.assertTrue(
44
+ # torch.all(torch.logical_or(
45
+ # torch.eq(mask, 0),
46
+ # torch.eq(mask, 1))))
47
+ #
48
+ # def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
49
+ # mask = random.mask(
50
+ # rows, cols, sparsity, blocking)
51
+ #
52
+ # # Validate the matrix.
53
+ # mask.validate()
54
+ #
55
+ # # Validate the shape.
56
+ # self.assertEqual(mask.dim(), 2)
57
+ # self.assertEqual(mask.size()[0], rows)
58
+ # self.assertEqual(mask.size()[1], cols)
59
+ #
60
+ # # Validate the sparsity.
61
+ # numblocks = rows // blocking * cols // blocking
62
+ # nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
63
+ # self.assertEqual(mask.nnz, nnz)
64
+ #
65
+ # # Check values are zero or one.
66
+ # self.assertTrue(
67
+ # torch.all(torch.logical_or(
68
+ # torch.eq(mask.data, 0),
69
+ # torch.eq(mask.data, 1))))
70
 
71
 
72
  if __name__ == '__main__':
build/torch210-cxx11-cu126-x86_64-linux/{_megablocks_cuda_6e04dec.abi3.so → _megablocks_cuda_a45325d.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:55948eae893317a5e500315e47efd66c4482bb67449caef3f512b2cabffb7dc6
3
  size 15061056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a96ca4ac1ee02742edef4fb7f45497be39d31dc897f35a7c1a3663e1c41e050c
3
  size 15061056
build/torch210-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_cuda_6e04dec
3
- ops = torch.ops._megablocks_cuda_6e04dec
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_cuda_6e04dec::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_cuda_a45325d
3
+ ops = torch.ops._megablocks_cuda_a45325d
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_cuda_a45325d::{op_name}"
build/torch210-cxx11-cu126-x86_64-linux/megablocks/__init__.py CHANGED
@@ -1,10 +1,10 @@
1
  import ctypes
 
2
  import sys
3
-
4
- import importlib
5
  from pathlib import Path
6
  from types import ModuleType
7
 
 
8
  def _import_from_path(file_path: Path) -> ModuleType:
9
  # We cannot use the module name as-is, after adding it to `sys.modules`,
10
  # it would also be used for other imports. So, we make a module name that
 
1
  import ctypes
2
+ import importlib.util
3
  import sys
 
 
4
  from pathlib import Path
5
  from types import ModuleType
6
 
7
+
8
  def _import_from_path(file_path: Path) -> ModuleType:
9
  # We cannot use the module name as-is, after adding it to `sys.modules`,
10
  # it would also be used for other imports. So, we make a module name that
build/torch210-cxx11-cu126-x86_64-linux/ops/histogram_benchmark.py CHANGED
@@ -5,7 +5,7 @@ import unittest
5
 
6
  import numpy as np
7
  import torch
8
- from absl.testing import parameterized
9
 
10
  from .. import ops
11
 
@@ -47,31 +47,31 @@ def log_benchmark(arguments, mean_t, std_t):
47
  print('=' * 60)
48
 
49
 
50
- class HistogramBenchmark(parameterized.TestCase):
51
-
52
- @parameterized.parameters(*_HISTOGRAM_TESTS)
53
- def testHistogram(self, n, dtype, max_val):
54
- x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
55
-
56
- mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
57
- arguments = {
58
- 'n': n,
59
- 'dtype': dtype,
60
- 'max_val': max_val,
61
- }
62
- log_benchmark(arguments, mean_t, std_t)
63
-
64
- @parameterized.parameters(*_HISTOGRAM_TESTS)
65
- def testTorchHistogram(self, n, dtype, max_val):
66
- x = torch.randint(0, 128, (n,)).cuda().to(dtype)
67
-
68
- mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
69
- arguments = {
70
- 'n': n,
71
- 'dtype': dtype,
72
- 'max_val': max_val,
73
- }
74
- log_benchmark(arguments, mean_t, std_t)
75
 
76
 
77
  if __name__ == '__main__':
 
5
 
6
  import numpy as np
7
  import torch
8
+ # from absl.testing import parameterized
9
 
10
  from .. import ops
11
 
 
47
  print('=' * 60)
48
 
49
 
50
+ # class HistogramBenchmark(parameterized.TestCase):
51
+ #
52
+ # @parameterized.parameters(*_HISTOGRAM_TESTS)
53
+ # def testHistogram(self, n, dtype, max_val):
54
+ # x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
55
+ #
56
+ # mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
57
+ # arguments = {
58
+ # 'n': n,
59
+ # 'dtype': dtype,
60
+ # 'max_val': max_val,
61
+ # }
62
+ # log_benchmark(arguments, mean_t, std_t)
63
+ #
64
+ # @parameterized.parameters(*_HISTOGRAM_TESTS)
65
+ # def testTorchHistogram(self, n, dtype, max_val):
66
+ # x = torch.randint(0, 128, (n,)).cuda().to(dtype)
67
+ #
68
+ # mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
69
+ # arguments = {
70
+ # 'n': n,
71
+ # 'dtype': dtype,
72
+ # 'max_val': max_val,
73
+ # }
74
+ # log_benchmark(arguments, mean_t, std_t)
75
 
76
 
77
  if __name__ == '__main__':
build/torch210-cxx11-cu126-x86_64-linux/ops/matmul_benchmark.py CHANGED
@@ -17,7 +17,7 @@ import unittest
17
  from .. import stk
18
 
19
  import torch
20
- from absl.testing import parameterized
21
 
22
  from .. import benchmark_util, ops
23
 
@@ -48,367 +48,367 @@ def log_benchmark(name, arguments, time, std, flops):
48
  print('=' * 60)
49
 
50
 
51
- class MatmulBenchmark(parameterized.TestCase):
52
-
53
- def build_sparse_matrix(self, x, padded_bins, fhs, ne):
54
- blocking = 128
55
- padded_tokens, _ = x.size()
56
- assert padded_tokens % blocking == 0
57
- assert fhs % blocking == 0
58
-
59
- # Offsets for the sparse matrix. All rows have the
60
- # same number of nonzero blocks dictated by the
61
- # dimensionality of a single expert.
62
- block_rows = padded_tokens // blocking
63
- blocks_per_row = fhs // blocking
64
- offsets = torch.arange(
65
- 0,
66
- block_rows * blocks_per_row + 1,
67
- blocks_per_row,
68
- dtype=torch.int32,
69
- device=x.device,
70
- )
71
-
72
- # Indices for the sparse matrix. The indices for
73
- # the intermediate matrix are dynamic depending
74
- # on the mapping of tokens to experts.
75
- column_indices = ops.topology(
76
- padded_bins,
77
- blocking,
78
- block_rows,
79
- blocks_per_row,
80
- )
81
- data = torch.empty(
82
- column_indices.numel(),
83
- blocking,
84
- blocking,
85
- dtype=torch.float16,
86
- device=x.device,
87
- )
88
- shape = (padded_tokens, fhs * ne)
89
- row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
90
- return stk.Matrix(shape, data, row_indices, column_indices, offsets)
91
-
92
- def build_input_matrix(self, sl, hs, ne):
93
- x = torch.randn((sl, hs)).cuda().half()
94
-
95
- # Assign tokens to experts uniformly.
96
- top_expert = torch.arange(0, sl).cuda().int() % ne
97
-
98
- bin_ids, indices = ops.sort(top_expert)
99
- tokens_per_expert = ops.histogram(top_expert, ne)
100
- padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
101
- padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
102
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
103
- out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
104
- return out, padded_bins
105
-
106
- def build_weight_matrix(self, ne, hs, fhs):
107
- return torch.randn((hs, ne * fhs)).cuda().half()
108
-
109
- @parameterized.parameters(*_MATMUL_TESTS)
110
- def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
111
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
112
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
113
- topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
114
- w = transpose_view(w)
115
-
116
- def benchmark():
117
- return stk.ops.sdd(x, w, topo)
118
-
119
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
- arguments = {
121
- 'sequence_length': sl,
122
- 'hidden_size': hs,
123
- 'ffn_hidden_size': fhs,
124
- 'num_experts': ne,
125
- }
126
- log_benchmark(
127
- '0::Fwd::SDD::NT',
128
- arguments,
129
- mean_t,
130
- std_t,
131
- x.numel() * fhs * 2,
132
- )
133
-
134
- @parameterized.parameters(*_MATMUL_TESTS)
135
- def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
136
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
137
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
138
- topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
139
-
140
- def benchmark():
141
- return stk.ops.dsd(topo, w)
142
-
143
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
144
- arguments = {
145
- 'sequence_length': sl,
146
- 'hidden_size': hs,
147
- 'ffn_hidden_size': fhs,
148
- 'num_experts': ne,
149
- }
150
- log_benchmark(
151
- '0::GradX::DSD::NN',
152
- arguments,
153
- mean_t,
154
- std_t,
155
- x.numel() * fhs * 2,
156
- )
157
-
158
- @parameterized.parameters(*_MATMUL_TESTS)
159
- def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
160
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
161
- topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
162
- topo = topo.t()
163
-
164
- def benchmark():
165
- return stk.ops.dsd(topo, x)
166
-
167
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
168
- arguments = {
169
- 'sequence_length': sl,
170
- 'hidden_size': hs,
171
- 'ffn_hidden_size': fhs,
172
- 'num_experts': ne,
173
- }
174
- log_benchmark(
175
- '0::GradW::DSD::TN',
176
- arguments,
177
- mean_t,
178
- std_t,
179
- x.numel() * fhs * 2,
180
- )
181
-
182
- @parameterized.parameters(*_MATMUL_TESTS)
183
- def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
184
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
185
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
186
- x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
187
-
188
- def benchmark():
189
- return stk.ops.dsd(x, w)
190
-
191
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
192
- arguments = {
193
- 'sequence_length': sl,
194
- 'hidden_size': hs,
195
- 'ffn_hidden_size': fhs,
196
- 'num_experts': ne,
197
- }
198
- log_benchmark(
199
- '1::Fwd::DSD::NN',
200
- arguments,
201
- mean_t,
202
- std_t,
203
- x.nnz * hs * 2,
204
- )
205
-
206
- @parameterized.parameters(*_MATMUL_TESTS)
207
- def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
208
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
209
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
210
- x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
211
- out = stk.ops.dsd(x, w)
212
- w = transpose_view(w)
213
-
214
- def benchmark():
215
- return stk.ops.sdd(out, w, x)
216
-
217
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
218
- arguments = {
219
- 'sequence_length': sl,
220
- 'hidden_size': hs,
221
- 'ffn_hidden_size': fhs,
222
- 'num_experts': ne,
223
- }
224
- log_benchmark(
225
- '1::GradX::SDD::NT',
226
- arguments,
227
- mean_t,
228
- std_t,
229
- x.nnz * hs * 2,
230
- )
231
-
232
- @parameterized.parameters(*_MATMUL_TESTS)
233
- def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
234
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
235
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
236
- x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
237
- out = stk.ops.dsd(x, w)
238
- x = x.t()
239
-
240
- def benchmark():
241
- return stk.ops.dsd(x, out)
242
-
243
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
244
- arguments = {
245
- 'sequence_length': sl,
246
- 'hidden_size': hs,
247
- 'ffn_hidden_size': fhs,
248
- 'num_experts': ne,
249
- }
250
- log_benchmark(
251
- '1::GradW::DSD::TN',
252
- arguments,
253
- mean_t,
254
- std_t,
255
- x.nnz * hs * 2,
256
- )
257
-
258
- @parameterized.parameters(*_MATMUL_TESTS)
259
- def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
260
- assert (sl % ne) == 0
261
- x = torch.randn((ne, sl // ne, hs)).cuda().half()
262
- w = torch.randn((ne, hs, fhs)).cuda().half()
263
-
264
- w = w.transpose(1, 2).contiguous()
265
- w = w.transpose(1, 2)
266
-
267
- def benchmark():
268
- return torch.bmm(x, w)
269
-
270
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
271
- arguments = {
272
- 'sequence_length': sl,
273
- 'hidden_size': hs,
274
- 'ffn_hidden_size': fhs,
275
- 'num_experts': ne,
276
- }
277
- log_benchmark(
278
- '0::Fwd:DDD::NT',
279
- arguments,
280
- mean_t,
281
- std_t,
282
- x.numel() * fhs * 2,
283
- )
284
-
285
- @parameterized.parameters(*_MATMUL_TESTS)
286
- def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
287
- assert (sl % ne) == 0
288
- x = torch.randn((ne, sl // ne, hs)).cuda().half()
289
- w = torch.randn((ne, hs, fhs)).cuda().half()
290
- out = torch.bmm(x, w)
291
- w = w.transpose(1, 2).contiguous()
292
-
293
- def benchmark():
294
- return torch.bmm(out, w)
295
-
296
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
297
- arguments = {
298
- 'sequence_length': sl,
299
- 'hidden_size': hs,
300
- 'ffn_hidden_size': fhs,
301
- 'num_experts': ne,
302
- }
303
- log_benchmark(
304
- '0:GradX:DDD::NN',
305
- arguments,
306
- mean_t,
307
- std_t,
308
- x.numel() * fhs * 2,
309
- )
310
-
311
- @parameterized.parameters(*_MATMUL_TESTS)
312
- def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
313
- assert (sl % ne) == 0
314
- x = torch.randn((ne, sl // ne, hs)).cuda().half()
315
- w = torch.randn((ne, hs, fhs)).cuda().half()
316
- out = torch.bmm(x, w)
317
- out = out.transpose(1, 2)
318
-
319
- def benchmark():
320
- return torch.bmm(out, x)
321
-
322
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
323
- arguments = {
324
- 'sequence_length': sl,
325
- 'hidden_size': hs,
326
- 'ffn_hidden_size': fhs,
327
- 'num_experts': ne,
328
- }
329
- log_benchmark(
330
- '0:GradW:DDD::TN',
331
- arguments,
332
- mean_t,
333
- std_t,
334
- x.numel() * fhs * 2,
335
- )
336
-
337
- @parameterized.parameters(*_MATMUL_TESTS)
338
- def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
339
- assert (sl % ne) == 0
340
- x = torch.randn((ne, sl // ne, fhs)).cuda().half()
341
- w = torch.randn((ne, fhs, hs)).cuda().half()
342
-
343
- def benchmark():
344
- return torch.bmm(x, w)
345
-
346
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
347
- arguments = {
348
- 'sequence_length': sl,
349
- 'hidden_size': hs,
350
- 'ffn_hidden_size': fhs,
351
- 'num_experts': ne,
352
- }
353
- log_benchmark(
354
- '1::Fwd::DDD::NN',
355
- arguments,
356
- mean_t,
357
- std_t,
358
- x.numel() * hs * 2,
359
- )
360
-
361
- @parameterized.parameters(*_MATMUL_TESTS)
362
- def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
363
- assert (sl % ne) == 0
364
- x = torch.randn((ne, sl // ne, fhs)).cuda().half()
365
- w = torch.randn((ne, fhs, hs)).cuda().half()
366
- out = torch.bmm(x, w)
367
- w = torch.transpose(w, 1, 2)
368
-
369
- def benchmark():
370
- return torch.bmm(out, w)
371
-
372
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
373
- arguments = {
374
- 'sequence_length': sl,
375
- 'hidden_size': hs,
376
- 'ffn_hidden_size': fhs,
377
- 'num_experts': ne,
378
- }
379
- log_benchmark(
380
- '1::GradX::DDD::NT',
381
- arguments,
382
- mean_t,
383
- std_t,
384
- x.numel() * hs * 2,
385
- )
386
-
387
- @parameterized.parameters(*_MATMUL_TESTS)
388
- def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
389
- assert (sl % ne) == 0
390
- x = torch.randn((ne, sl // ne, fhs)).cuda().half()
391
- w = torch.randn((ne, fhs, hs)).cuda().half()
392
- out = torch.bmm(x, w)
393
- x = torch.transpose(x, 1, 2)
394
-
395
- def benchmark():
396
- return torch.bmm(x, out)
397
-
398
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
399
- arguments = {
400
- 'sequence_length': sl,
401
- 'hidden_size': hs,
402
- 'ffn_hidden_size': fhs,
403
- 'num_experts': ne,
404
- }
405
- log_benchmark(
406
- '1::GradW::DDD::TN',
407
- arguments,
408
- mean_t,
409
- std_t,
410
- x.numel() * hs * 2,
411
- )
412
 
413
 
414
  if __name__ == '__main__':
 
17
  from .. import stk
18
 
19
  import torch
20
+ # from absl.testing import parameterized
21
 
22
  from .. import benchmark_util, ops
23
 
 
48
  print('=' * 60)
49
 
50
 
51
+ # class MatmulBenchmark(parameterized.TestCase):
52
+ #
53
+ # def build_sparse_matrix(self, x, padded_bins, fhs, ne):
54
+ # blocking = 128
55
+ # padded_tokens, _ = x.size()
56
+ # assert padded_tokens % blocking == 0
57
+ # assert fhs % blocking == 0
58
+ #
59
+ # # Offsets for the sparse matrix. All rows have the
60
+ # # same number of nonzero blocks dictated by the
61
+ # # dimensionality of a single expert.
62
+ # block_rows = padded_tokens // blocking
63
+ # blocks_per_row = fhs // blocking
64
+ # offsets = torch.arange(
65
+ # 0,
66
+ # block_rows * blocks_per_row + 1,
67
+ # blocks_per_row,
68
+ # dtype=torch.int32,
69
+ # device=x.device,
70
+ # )
71
+ #
72
+ # # Indices for the sparse matrix. The indices for
73
+ # # the intermediate matrix are dynamic depending
74
+ # # on the mapping of tokens to experts.
75
+ # column_indices = ops.topology(
76
+ # padded_bins,
77
+ # blocking,
78
+ # block_rows,
79
+ # blocks_per_row,
80
+ # )
81
+ # data = torch.empty(
82
+ # column_indices.numel(),
83
+ # blocking,
84
+ # blocking,
85
+ # dtype=torch.float16,
86
+ # device=x.device,
87
+ # )
88
+ # shape = (padded_tokens, fhs * ne)
89
+ # row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
90
+ # return stk.Matrix(shape, data, row_indices, column_indices, offsets)
91
+ #
92
+ # def build_input_matrix(self, sl, hs, ne):
93
+ # x = torch.randn((sl, hs)).cuda().half()
94
+ #
95
+ # # Assign tokens to experts uniformly.
96
+ # top_expert = torch.arange(0, sl).cuda().int() % ne
97
+ #
98
+ # bin_ids, indices = ops.sort(top_expert)
99
+ # tokens_per_expert = ops.histogram(top_expert, ne)
100
+ # padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
101
+ # padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
102
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
103
+ # out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
104
+ # return out, padded_bins
105
+ #
106
+ # def build_weight_matrix(self, ne, hs, fhs):
107
+ # return torch.randn((hs, ne * fhs)).cuda().half()
108
+ #
109
+ # @parameterized.parameters(*_MATMUL_TESTS)
110
+ # def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
111
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
112
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
113
+ # topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
114
+ # w = transpose_view(w)
115
+ #
116
+ # def benchmark():
117
+ # return stk.ops.sdd(x, w, topo)
118
+ #
119
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
+ # arguments = {
121
+ # 'sequence_length': sl,
122
+ # 'hidden_size': hs,
123
+ # 'ffn_hidden_size': fhs,
124
+ # 'num_experts': ne,
125
+ # }
126
+ # log_benchmark(
127
+ # '0::Fwd::SDD::NT',
128
+ # arguments,
129
+ # mean_t,
130
+ # std_t,
131
+ # x.numel() * fhs * 2,
132
+ # )
133
+ #
134
+ # @parameterized.parameters(*_MATMUL_TESTS)
135
+ # def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
136
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
137
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
138
+ # topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
139
+ #
140
+ # def benchmark():
141
+ # return stk.ops.dsd(topo, w)
142
+ #
143
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
144
+ # arguments = {
145
+ # 'sequence_length': sl,
146
+ # 'hidden_size': hs,
147
+ # 'ffn_hidden_size': fhs,
148
+ # 'num_experts': ne,
149
+ # }
150
+ # log_benchmark(
151
+ # '0::GradX::DSD::NN',
152
+ # arguments,
153
+ # mean_t,
154
+ # std_t,
155
+ # x.numel() * fhs * 2,
156
+ # )
157
+ #
158
+ # @parameterized.parameters(*_MATMUL_TESTS)
159
+ # def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
160
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
161
+ # topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
162
+ # topo = topo.t()
163
+ #
164
+ # def benchmark():
165
+ # return stk.ops.dsd(topo, x)
166
+ #
167
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
168
+ # arguments = {
169
+ # 'sequence_length': sl,
170
+ # 'hidden_size': hs,
171
+ # 'ffn_hidden_size': fhs,
172
+ # 'num_experts': ne,
173
+ # }
174
+ # log_benchmark(
175
+ # '0::GradW::DSD::TN',
176
+ # arguments,
177
+ # mean_t,
178
+ # std_t,
179
+ # x.numel() * fhs * 2,
180
+ # )
181
+ #
182
+ # @parameterized.parameters(*_MATMUL_TESTS)
183
+ # def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
184
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
185
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
186
+ # x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
187
+ #
188
+ # def benchmark():
189
+ # return stk.ops.dsd(x, w)
190
+ #
191
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
192
+ # arguments = {
193
+ # 'sequence_length': sl,
194
+ # 'hidden_size': hs,
195
+ # 'ffn_hidden_size': fhs,
196
+ # 'num_experts': ne,
197
+ # }
198
+ # log_benchmark(
199
+ # '1::Fwd::DSD::NN',
200
+ # arguments,
201
+ # mean_t,
202
+ # std_t,
203
+ # x.nnz * hs * 2,
204
+ # )
205
+ #
206
+ # @parameterized.parameters(*_MATMUL_TESTS)
207
+ # def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
208
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
209
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
210
+ # x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
211
+ # out = stk.ops.dsd(x, w)
212
+ # w = transpose_view(w)
213
+ #
214
+ # def benchmark():
215
+ # return stk.ops.sdd(out, w, x)
216
+ #
217
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
218
+ # arguments = {
219
+ # 'sequence_length': sl,
220
+ # 'hidden_size': hs,
221
+ # 'ffn_hidden_size': fhs,
222
+ # 'num_experts': ne,
223
+ # }
224
+ # log_benchmark(
225
+ # '1::GradX::SDD::NT',
226
+ # arguments,
227
+ # mean_t,
228
+ # std_t,
229
+ # x.nnz * hs * 2,
230
+ # )
231
+ #
232
+ # @parameterized.parameters(*_MATMUL_TESTS)
233
+ # def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
234
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
235
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
236
+ # x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
237
+ # out = stk.ops.dsd(x, w)
238
+ # x = x.t()
239
+ #
240
+ # def benchmark():
241
+ # return stk.ops.dsd(x, out)
242
+ #
243
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
244
+ # arguments = {
245
+ # 'sequence_length': sl,
246
+ # 'hidden_size': hs,
247
+ # 'ffn_hidden_size': fhs,
248
+ # 'num_experts': ne,
249
+ # }
250
+ # log_benchmark(
251
+ # '1::GradW::DSD::TN',
252
+ # arguments,
253
+ # mean_t,
254
+ # std_t,
255
+ # x.nnz * hs * 2,
256
+ # )
257
+ #
258
+ # @parameterized.parameters(*_MATMUL_TESTS)
259
+ # def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
260
+ # assert (sl % ne) == 0
261
+ # x = torch.randn((ne, sl // ne, hs)).cuda().half()
262
+ # w = torch.randn((ne, hs, fhs)).cuda().half()
263
+ #
264
+ # w = w.transpose(1, 2).contiguous()
265
+ # w = w.transpose(1, 2)
266
+ #
267
+ # def benchmark():
268
+ # return torch.bmm(x, w)
269
+ #
270
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
271
+ # arguments = {
272
+ # 'sequence_length': sl,
273
+ # 'hidden_size': hs,
274
+ # 'ffn_hidden_size': fhs,
275
+ # 'num_experts': ne,
276
+ # }
277
+ # log_benchmark(
278
+ # '0::Fwd:DDD::NT',
279
+ # arguments,
280
+ # mean_t,
281
+ # std_t,
282
+ # x.numel() * fhs * 2,
283
+ # )
284
+ #
285
+ # @parameterized.parameters(*_MATMUL_TESTS)
286
+ # def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
287
+ # assert (sl % ne) == 0
288
+ # x = torch.randn((ne, sl // ne, hs)).cuda().half()
289
+ # w = torch.randn((ne, hs, fhs)).cuda().half()
290
+ # out = torch.bmm(x, w)
291
+ # w = w.transpose(1, 2).contiguous()
292
+ #
293
+ # def benchmark():
294
+ # return torch.bmm(out, w)
295
+ #
296
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
297
+ # arguments = {
298
+ # 'sequence_length': sl,
299
+ # 'hidden_size': hs,
300
+ # 'ffn_hidden_size': fhs,
301
+ # 'num_experts': ne,
302
+ # }
303
+ # log_benchmark(
304
+ # '0:GradX:DDD::NN',
305
+ # arguments,
306
+ # mean_t,
307
+ # std_t,
308
+ # x.numel() * fhs * 2,
309
+ # )
310
+ #
311
+ # @parameterized.parameters(*_MATMUL_TESTS)
312
+ # def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
313
+ # assert (sl % ne) == 0
314
+ # x = torch.randn((ne, sl // ne, hs)).cuda().half()
315
+ # w = torch.randn((ne, hs, fhs)).cuda().half()
316
+ # out = torch.bmm(x, w)
317
+ # out = out.transpose(1, 2)
318
+ #
319
+ # def benchmark():
320
+ # return torch.bmm(out, x)
321
+ #
322
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
323
+ # arguments = {
324
+ # 'sequence_length': sl,
325
+ # 'hidden_size': hs,
326
+ # 'ffn_hidden_size': fhs,
327
+ # 'num_experts': ne,
328
+ # }
329
+ # log_benchmark(
330
+ # '0:GradW:DDD::TN',
331
+ # arguments,
332
+ # mean_t,
333
+ # std_t,
334
+ # x.numel() * fhs * 2,
335
+ # )
336
+ #
337
+ # @parameterized.parameters(*_MATMUL_TESTS)
338
+ # def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
339
+ # assert (sl % ne) == 0
340
+ # x = torch.randn((ne, sl // ne, fhs)).cuda().half()
341
+ # w = torch.randn((ne, fhs, hs)).cuda().half()
342
+ #
343
+ # def benchmark():
344
+ # return torch.bmm(x, w)
345
+ #
346
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
347
+ # arguments = {
348
+ # 'sequence_length': sl,
349
+ # 'hidden_size': hs,
350
+ # 'ffn_hidden_size': fhs,
351
+ # 'num_experts': ne,
352
+ # }
353
+ # log_benchmark(
354
+ # '1::Fwd::DDD::NN',
355
+ # arguments,
356
+ # mean_t,
357
+ # std_t,
358
+ # x.numel() * hs * 2,
359
+ # )
360
+ #
361
+ # @parameterized.parameters(*_MATMUL_TESTS)
362
+ # def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
363
+ # assert (sl % ne) == 0
364
+ # x = torch.randn((ne, sl // ne, fhs)).cuda().half()
365
+ # w = torch.randn((ne, fhs, hs)).cuda().half()
366
+ # out = torch.bmm(x, w)
367
+ # w = torch.transpose(w, 1, 2)
368
+ #
369
+ # def benchmark():
370
+ # return torch.bmm(out, w)
371
+ #
372
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
373
+ # arguments = {
374
+ # 'sequence_length': sl,
375
+ # 'hidden_size': hs,
376
+ # 'ffn_hidden_size': fhs,
377
+ # 'num_experts': ne,
378
+ # }
379
+ # log_benchmark(
380
+ # '1::GradX::DDD::NT',
381
+ # arguments,
382
+ # mean_t,
383
+ # std_t,
384
+ # x.numel() * hs * 2,
385
+ # )
386
+ #
387
+ # @parameterized.parameters(*_MATMUL_TESTS)
388
+ # def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
389
+ # assert (sl % ne) == 0
390
+ # x = torch.randn((ne, sl // ne, fhs)).cuda().half()
391
+ # w = torch.randn((ne, fhs, hs)).cuda().half()
392
+ # out = torch.bmm(x, w)
393
+ # x = torch.transpose(x, 1, 2)
394
+ #
395
+ # def benchmark():
396
+ # return torch.bmm(x, out)
397
+ #
398
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
399
+ # arguments = {
400
+ # 'sequence_length': sl,
401
+ # 'hidden_size': hs,
402
+ # 'ffn_hidden_size': fhs,
403
+ # 'num_experts': ne,
404
+ # }
405
+ # log_benchmark(
406
+ # '1::GradW::DDD::TN',
407
+ # arguments,
408
+ # mean_t,
409
+ # std_t,
410
+ # x.numel() * hs * 2,
411
+ # )
412
 
413
 
414
  if __name__ == '__main__':
build/torch210-cxx11-cu126-x86_64-linux/ops/padded_scatter_benchmark.py CHANGED
@@ -4,7 +4,7 @@
4
  import unittest
5
 
6
  import torch
7
- from absl.testing import parameterized
8
 
9
  from .. import benchmark_util, ops
10
 
@@ -16,50 +16,50 @@ _PADDED_SCATTER_BENCHMARK = (
16
  )
17
 
18
 
19
- class PaddedScatterTest(parameterized.TestCase):
20
-
21
- @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
22
- def testPaddedScatter(self, sl, hs, ne, top_k):
23
- # Create the data and indices.
24
- x = torch.randn((sl, hs)).cuda().half()
25
-
26
- # Randomly assign tokens to experts.
27
- top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
28
- bin_ids, indices = ops.sort(top_expert)
29
- tokens_per_expert = ops.histogram(top_expert, ne)
30
- padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
31
- padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
32
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
33
-
34
- # Sample weights for the scatter reduce.
35
- weights = torch.rand((sl * top_k,)).cuda().half()
36
-
37
- # Gather the data to prepare for backwards.
38
- x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
39
-
40
- def benchmark():
41
- return ops.padded_scatter(
42
- x,
43
- indices,
44
- bin_ids,
45
- weights,
46
- bins,
47
- padded_bins,
48
- top_k,
49
- )
50
-
51
- time, std = benchmark_util.benchmark_function(benchmark)
52
- benchmark_util.log_benchmark(
53
- 'Padded Scatter',
54
- {
55
- 'sequence_length': sl,
56
- 'hidden_size': hs,
57
- 'num_experts': ne,
58
- 'top_k': top_k,
59
- },
60
- time,
61
- std,
62
- )
63
 
64
 
65
  if __name__ == '__main__':
 
4
  import unittest
5
 
6
  import torch
7
+ # from absl.testing import parameterized
8
 
9
  from .. import benchmark_util, ops
10
 
 
16
  )
17
 
18
 
19
+ # class PaddedScatterTest(parameterized.TestCase):
20
+ #
21
+ # @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
22
+ # def testPaddedScatter(self, sl, hs, ne, top_k):
23
+ # # Create the data and indices.
24
+ # x = torch.randn((sl, hs)).cuda().half()
25
+ #
26
+ # # Randomly assign tokens to experts.
27
+ # top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
28
+ # bin_ids, indices = ops.sort(top_expert)
29
+ # tokens_per_expert = ops.histogram(top_expert, ne)
30
+ # padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
31
+ # padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
32
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
33
+ #
34
+ # # Sample weights for the scatter reduce.
35
+ # weights = torch.rand((sl * top_k,)).cuda().half()
36
+ #
37
+ # # Gather the data to prepare for backwards.
38
+ # x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
39
+ #
40
+ # def benchmark():
41
+ # return ops.padded_scatter(
42
+ # x,
43
+ # indices,
44
+ # bin_ids,
45
+ # weights,
46
+ # bins,
47
+ # padded_bins,
48
+ # top_k,
49
+ # )
50
+ #
51
+ # time, std = benchmark_util.benchmark_function(benchmark)
52
+ # benchmark_util.log_benchmark(
53
+ # 'Padded Scatter',
54
+ # {
55
+ # 'sequence_length': sl,
56
+ # 'hidden_size': hs,
57
+ # 'num_experts': ne,
58
+ # 'top_k': top_k,
59
+ # },
60
+ # time,
61
+ # std,
62
+ # )
63
 
64
 
65
  if __name__ == '__main__':
build/torch210-cxx11-cu126-x86_64-linux/ops/permute_benchmark.py CHANGED
@@ -4,7 +4,7 @@
4
  import unittest
5
 
6
  import torch
7
- from absl.testing import parameterized
8
 
9
  from .. import benchmark_util, ops
10
 
@@ -26,123 +26,123 @@ _PERMUTE_TESTS = (
26
  )
27
 
28
 
29
- class PermuteBenchmark(parameterized.TestCase):
30
-
31
- @parameterized.parameters(*_PERMUTE_TESTS)
32
- def testBinnedGather(self, sl, hs, ne):
33
- # NOTE: Capacity factor == 1.
34
- ec = sl // ne
35
-
36
- # Create the data and indices.
37
- x = torch.randn((sl, hs)).cuda().half()
38
- top_expert = torch.randint(0, ne, (sl,)).cuda().int()
39
- bin_ids, indices = ops.sort(top_expert)
40
- tokens_per_expert = ops.histogram(indices, ne)
41
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
42
-
43
- def benchmark():
44
- return ops.binned_gather(x, indices, bins, ec)
45
-
46
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
47
- arguments = {
48
- 'sequence_length': sl,
49
- 'hidden_size': hs,
50
- 'num_experts': ne,
51
- }
52
- benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
53
-
54
- @parameterized.parameters(*_PERMUTE_TESTS)
55
- def testBinnedScatter(self, sl, hs, ne):
56
- # NOTE: Capacity factor == 1.
57
- ec = sl // ne
58
-
59
- # Create the data and indices.
60
- x = torch.randn((sl, hs)).cuda().half()
61
- top_expert = torch.randint(0, ne, (sl,)).cuda().int()
62
- bin_ids, indices = ops.sort(top_expert)
63
- tokens_per_expert = ops.histogram(indices, ne)
64
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
65
- x = ops.binned_gather(x, indices, bins, ec)
66
-
67
- def benchmark():
68
- return ops.binned_scatter(x, indices, bins)
69
-
70
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
71
- arguments = {
72
- 'sequence_length': sl,
73
- 'hidden_size': hs,
74
- 'num_experts': ne,
75
- }
76
- benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
77
-
78
- @parameterized.parameters(*_PERMUTE_TESTS)
79
- def testPaddedGather(self, sl, hs, ne):
80
- # Create the data and indices.
81
- x = torch.randn((sl, hs)).cuda().half()
82
-
83
- # Randomly assign tokens to experts.
84
- top_expert = torch.randint(0, ne, (sl,)).cuda().int()
85
- bin_ids, indices = ops.sort(top_expert)
86
- tokens_per_expert = ops.histogram(top_expert, ne)
87
- padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
88
- padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
89
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
90
-
91
- def benchmark():
92
- return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
93
-
94
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
95
- arguments = {
96
- 'sequence_length': sl,
97
- 'hidden_size': hs,
98
- 'num_experts': ne,
99
- }
100
- benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
101
-
102
- @parameterized.parameters(*_PERMUTE_TESTS)
103
- def testPaddedScatter(self, sl, hs, ne):
104
- # Create the data and indices.
105
- x = torch.randn((sl, hs)).cuda().half()
106
-
107
- # Randomly assign tokens to experts.
108
- top_expert = torch.randint(0, ne, (sl,)).cuda().int()
109
- bin_ids, indices = ops.sort(top_expert)
110
- tokens_per_expert = ops.histogram(top_expert, ne)
111
- padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
112
- padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
113
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
114
- x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
115
-
116
- def benchmark():
117
- return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
118
-
119
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
- arguments = {
121
- 'sequence_length': sl,
122
- 'hidden_size': hs,
123
- 'num_experts': ne,
124
- }
125
- benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
126
-
127
- @parameterized.parameters(*_PERMUTE_TESTS)
128
- def testCopy(self, sl, hs, ne):
129
- # NOTE: Capacity factor == 1.
130
- # ec = sl // ne
131
-
132
- # Create the data and indices.
133
- x = torch.randn((sl, hs)).cuda().half()
134
- y = x.clone()
135
-
136
- def benchmark():
137
- return y.copy_(x)
138
-
139
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
140
- arguments = {
141
- 'sequence_length': sl,
142
- 'hidden_size': hs,
143
- 'num_experts': ne,
144
- }
145
- benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
146
 
147
 
148
  if __name__ == '__main__':
 
4
  import unittest
5
 
6
  import torch
7
+ # from absl.testing import parameterized
8
 
9
  from .. import benchmark_util, ops
10
 
 
26
  )
27
 
28
 
29
+ # class PermuteBenchmark(parameterized.TestCase):
30
+ #
31
+ # @parameterized.parameters(*_PERMUTE_TESTS)
32
+ # def testBinnedGather(self, sl, hs, ne):
33
+ # # NOTE: Capacity factor == 1.
34
+ # ec = sl // ne
35
+ #
36
+ # # Create the data and indices.
37
+ # x = torch.randn((sl, hs)).cuda().half()
38
+ # top_expert = torch.randint(0, ne, (sl,)).cuda().int()
39
+ # bin_ids, indices = ops.sort(top_expert)
40
+ # tokens_per_expert = ops.histogram(indices, ne)
41
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
42
+ #
43
+ # def benchmark():
44
+ # return ops.binned_gather(x, indices, bins, ec)
45
+ #
46
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
47
+ # arguments = {
48
+ # 'sequence_length': sl,
49
+ # 'hidden_size': hs,
50
+ # 'num_experts': ne,
51
+ # }
52
+ # benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
53
+ #
54
+ # @parameterized.parameters(*_PERMUTE_TESTS)
55
+ # def testBinnedScatter(self, sl, hs, ne):
56
+ # # NOTE: Capacity factor == 1.
57
+ # ec = sl // ne
58
+ #
59
+ # # Create the data and indices.
60
+ # x = torch.randn((sl, hs)).cuda().half()
61
+ # top_expert = torch.randint(0, ne, (sl,)).cuda().int()
62
+ # bin_ids, indices = ops.sort(top_expert)
63
+ # tokens_per_expert = ops.histogram(indices, ne)
64
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
65
+ # x = ops.binned_gather(x, indices, bins, ec)
66
+ #
67
+ # def benchmark():
68
+ # return ops.binned_scatter(x, indices, bins)
69
+ #
70
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
71
+ # arguments = {
72
+ # 'sequence_length': sl,
73
+ # 'hidden_size': hs,
74
+ # 'num_experts': ne,
75
+ # }
76
+ # benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
77
+ #
78
+ # @parameterized.parameters(*_PERMUTE_TESTS)
79
+ # def testPaddedGather(self, sl, hs, ne):
80
+ # # Create the data and indices.
81
+ # x = torch.randn((sl, hs)).cuda().half()
82
+ #
83
+ # # Randomly assign tokens to experts.
84
+ # top_expert = torch.randint(0, ne, (sl,)).cuda().int()
85
+ # bin_ids, indices = ops.sort(top_expert)
86
+ # tokens_per_expert = ops.histogram(top_expert, ne)
87
+ # padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
88
+ # padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
89
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
90
+ #
91
+ # def benchmark():
92
+ # return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
93
+ #
94
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
95
+ # arguments = {
96
+ # 'sequence_length': sl,
97
+ # 'hidden_size': hs,
98
+ # 'num_experts': ne,
99
+ # }
100
+ # benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
101
+ #
102
+ # @parameterized.parameters(*_PERMUTE_TESTS)
103
+ # def testPaddedScatter(self, sl, hs, ne):
104
+ # # Create the data and indices.
105
+ # x = torch.randn((sl, hs)).cuda().half()
106
+ #
107
+ # # Randomly assign tokens to experts.
108
+ # top_expert = torch.randint(0, ne, (sl,)).cuda().int()
109
+ # bin_ids, indices = ops.sort(top_expert)
110
+ # tokens_per_expert = ops.histogram(top_expert, ne)
111
+ # padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
112
+ # padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
113
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
114
+ # x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
115
+ #
116
+ # def benchmark():
117
+ # return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
118
+ #
119
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
+ # arguments = {
121
+ # 'sequence_length': sl,
122
+ # 'hidden_size': hs,
123
+ # 'num_experts': ne,
124
+ # }
125
+ # benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
126
+ #
127
+ # @parameterized.parameters(*_PERMUTE_TESTS)
128
+ # def testCopy(self, sl, hs, ne):
129
+ # # NOTE: Capacity factor == 1.
130
+ # # ec = sl // ne
131
+ #
132
+ # # Create the data and indices.
133
+ # x = torch.randn((sl, hs)).cuda().half()
134
+ # y = x.clone()
135
+ #
136
+ # def benchmark():
137
+ # return y.copy_(x)
138
+ #
139
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
140
+ # arguments = {
141
+ # 'sequence_length': sl,
142
+ # 'hidden_size': hs,
143
+ # 'num_experts': ne,
144
+ # }
145
+ # benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
146
 
147
 
148
  if __name__ == '__main__':
build/torch210-cxx11-cu126-x86_64-linux/ops/sort_benchmark.py CHANGED
@@ -5,7 +5,7 @@ import unittest
5
 
6
  import numpy as np
7
  import torch
8
- from absl.testing import parameterized
9
 
10
  from .. import ops
11
 
@@ -53,32 +53,32 @@ def log_benchmark(arguments, mean_t, std_t):
53
  print('=' * 60)
54
 
55
 
56
- class SortBenchmark(parameterized.TestCase):
57
-
58
- @parameterized.parameters(*_SORT_TESTS)
59
- def testSort(self, n, dtype, max_val):
60
- if max_val is None:
61
- max_val = np.iinfo(numpy_dtype(dtype)).max
62
- end_bit = int(np.ceil(np.log2(max_val)))
63
- x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
64
-
65
- mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
66
- arguments = {
67
- 'n': n,
68
- 'dtype': dtype,
69
- 'max_val': max_val,
70
- }
71
- log_benchmark(arguments, mean_t, std_t)
72
-
73
- @parameterized.parameters(*_BASELINE_SORT_TESTS)
74
- def testTorchSort(self, n):
75
- x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
76
-
77
- mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
78
- arguments = {
79
- 'n': n,
80
- }
81
- log_benchmark(arguments, mean_t, std_t)
82
 
83
 
84
  if __name__ == '__main__':
 
5
 
6
  import numpy as np
7
  import torch
8
+ # from absl.testing import parameterized
9
 
10
  from .. import ops
11
 
 
53
  print('=' * 60)
54
 
55
 
56
+ # class SortBenchmark(parameterized.TestCase):
57
+ #
58
+ # @parameterized.parameters(*_SORT_TESTS)
59
+ # def testSort(self, n, dtype, max_val):
60
+ # if max_val is None:
61
+ # max_val = np.iinfo(numpy_dtype(dtype)).max
62
+ # end_bit = int(np.ceil(np.log2(max_val)))
63
+ # x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
64
+ #
65
+ # mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
66
+ # arguments = {
67
+ # 'n': n,
68
+ # 'dtype': dtype,
69
+ # 'max_val': max_val,
70
+ # }
71
+ # log_benchmark(arguments, mean_t, std_t)
72
+ #
73
+ # @parameterized.parameters(*_BASELINE_SORT_TESTS)
74
+ # def testTorchSort(self, n):
75
+ # x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
76
+ #
77
+ # mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
78
+ # arguments = {
79
+ # 'n': n,
80
+ # }
81
+ # log_benchmark(arguments, mean_t, std_t)
82
 
83
 
84
  if __name__ == '__main__':
build/torch210-cxx11-cu126-x86_64-linux/stk/ops/eltwise_ops_test.py CHANGED
@@ -1,7 +1,7 @@
1
  import unittest
2
  import itertools
3
  import torch
4
- from absl.testing import parameterized
5
 
6
  import stk
7
  from stk.ops.linear_ops_test import allclose, _dense_and_sparse
@@ -47,40 +47,40 @@ def _dense_and_sparse_like(x, std=0.1):
47
  return (dense.requires_grad_(True),
48
  sparse.requires_grad_(True))
49
 
50
- @parameterized.parameters(_ELTWISE_OP_TESTS)
51
- class EltwiseOpsTest(parameterized.TestCase):
52
-
53
- def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
54
-
55
- a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
56
- b_dense, b = _dense_and_sparse_like(a)
57
-
58
- out = stk.ops.mul(a, b)
59
- expected_out = torch.mul(a_dense, b_dense)
60
-
61
- # Compute the gradients w.r.t. the inputs.
62
- expected_out.sum().backward()
63
- stk.ops.sum(out).backward()
64
-
65
- # Validate the results.
66
- out = stk.ops.to_dense(out)
67
- self.assertEqual(out.dim(), 2)
68
- self.assertEqual(expected_out.size(), out.size())
69
- self.assertTrue(allclose(out, expected_out))
70
-
71
- # LHS gradient.
72
- grad = stk.ops.to_dense(a.grad)
73
- expected_grad = a_dense.grad
74
- self.assertEqual(grad.dim(), 2)
75
- self.assertEqual(expected_grad.size(), grad.size())
76
- self.assertTrue(allclose(grad, expected_grad))
77
-
78
- # RHS gradient.
79
- grad = stk.ops.to_dense(b.grad)
80
- expected_grad = b_dense.grad
81
- self.assertEqual(grad.dim(), 2)
82
- self.assertEqual(expected_grad.size(), grad.size())
83
- self.assertTrue(allclose(grad, expected_grad))
84
 
85
  if __name__ == '__main__':
86
  unittest.main()
 
1
  import unittest
2
  import itertools
3
  import torch
4
+ # from absl.testing import parameterized
5
 
6
  import stk
7
  from stk.ops.linear_ops_test import allclose, _dense_and_sparse
 
47
  return (dense.requires_grad_(True),
48
  sparse.requires_grad_(True))
49
 
50
+ # @parameterized.parameters(_ELTWISE_OP_TESTS)
51
+ # class EltwiseOpsTest(parameterized.TestCase):
52
+ #
53
+ # def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
54
+ #
55
+ # a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
56
+ # b_dense, b = _dense_and_sparse_like(a)
57
+ #
58
+ # out = stk.ops.mul(a, b)
59
+ # expected_out = torch.mul(a_dense, b_dense)
60
+ #
61
+ # # Compute the gradients w.r.t. the inputs.
62
+ # expected_out.sum().backward()
63
+ # stk.ops.sum(out).backward()
64
+ #
65
+ # # Validate the results.
66
+ # out = stk.ops.to_dense(out)
67
+ # self.assertEqual(out.dim(), 2)
68
+ # self.assertEqual(expected_out.size(), out.size())
69
+ # self.assertTrue(allclose(out, expected_out))
70
+ #
71
+ # # LHS gradient.
72
+ # grad = stk.ops.to_dense(a.grad)
73
+ # expected_grad = a_dense.grad
74
+ # self.assertEqual(grad.dim(), 2)
75
+ # self.assertEqual(expected_grad.size(), grad.size())
76
+ # self.assertTrue(allclose(grad, expected_grad))
77
+ #
78
+ # # RHS gradient.
79
+ # grad = stk.ops.to_dense(b.grad)
80
+ # expected_grad = b_dense.grad
81
+ # self.assertEqual(grad.dim(), 2)
82
+ # self.assertEqual(expected_grad.size(), grad.size())
83
+ # self.assertTrue(allclose(grad, expected_grad))
84
 
85
  if __name__ == '__main__':
86
  unittest.main()
build/torch210-cxx11-cu126-x86_64-linux/stk/ops/linear_ops_test.py CHANGED
@@ -2,7 +2,7 @@ import unittest
2
  import itertools
3
  import numpy as np
4
  import torch
5
- from absl.testing import parameterized
6
 
7
  import stk
8
 
@@ -96,121 +96,121 @@ def _mask(x, mask):
96
  return x * mask
97
 
98
 
99
- @parameterized.parameters(*_LINEAR_OP_TESTS)
100
- class LinearOpsTest(parameterized.TestCase):
101
-
102
- def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
103
- # Construct the operands.
104
- a_shape = (k, m) if trans_a else (m, k)
105
- a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
106
- b_shape = (n, k) if trans_b else (k, n)
107
- b, bcp = _dense_2x(*b_shape, dtype)
108
-
109
- # Execute the matmul.
110
- out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
111
- expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
112
-
113
- # Compute the gradients w.r.t. the inputs.
114
- expected_out.sum().backward()
115
- out.sum().backward()
116
-
117
- # Validate the results.
118
- self.assertEqual(out.dim(), 2)
119
- self.assertEqual(expected_out.size()[0], out.size()[0])
120
- self.assertEqual(expected_out.size()[1], out.size()[1])
121
- self.assertTrue(allclose(out, expected_out))
122
-
123
- # LHS gradient.
124
- grad = stk.ops.to_dense(a.grad)
125
- expected_grad = _mask(a_dense.grad, a.grad)
126
- self.assertEqual(grad.dim(), 2)
127
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
128
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
129
- self.assertTrue(allclose(grad, expected_grad))
130
-
131
- # RHS gradient.
132
- grad = b.grad
133
- expected_grad = bcp.grad
134
- self.assertEqual(grad.dim(), 2)
135
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
136
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
137
- self.assertTrue(allclose(grad, expected_grad))
138
-
139
- def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
140
- # Construct the operands.
141
- a_shape = (k, m) if trans_a else (m, k)
142
- a, acp = _dense_2x(*a_shape, dtype)
143
- b_shape = (n, k) if trans_b else (k, n)
144
- b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
145
-
146
- # Execute the matmul.
147
- out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
148
- expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
149
-
150
- # Compute the gradients w.r.t. the inputs.
151
- expected_out.sum().backward()
152
- out.sum().backward()
153
-
154
- # Validate the results.
155
- self.assertEqual(out.dim(), 2)
156
- self.assertEqual(expected_out.size()[0], out.size()[0])
157
- self.assertEqual(expected_out.size()[1], out.size()[1])
158
- self.assertTrue(allclose(out, expected_out))
159
-
160
- # LHS gradient.
161
- grad = a.grad
162
- expected_grad = acp.grad
163
- self.assertEqual(grad.dim(), 2)
164
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
165
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
166
- self.assertTrue(allclose(grad, expected_grad))
167
-
168
- # RHS gradient.
169
- grad = stk.ops.to_dense(b.grad)
170
- expected_grad = _mask(b_dense.grad, b.grad)
171
- self.assertEqual(grad.dim(), 2)
172
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
173
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
174
- self.assertTrue(allclose(grad, expected_grad))
175
-
176
- def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
177
- # Construct the operands.
178
- a_shape = (k, m) if trans_a else (m, k)
179
- a, acp = _dense_2x(*a_shape, dtype)
180
- b_shape = (n, k) if trans_b else (k, n)
181
- b, bcp = _dense_2x(*b_shape, dtype)
182
- _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
183
-
184
- # Execute the matmul.
185
- out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
186
- expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
187
-
188
- # Compute the gradients w.r.t. the inputs.
189
- expected_out.sum().backward()
190
- stk.ops.sum(out).backward()
191
-
192
- # Validate the results.
193
- out = stk.ops.to_dense(out)
194
- self.assertEqual(out.dim(), 2)
195
- self.assertEqual(expected_out.size()[0], out.size()[0])
196
- self.assertEqual(expected_out.size()[1], out.size()[1])
197
- self.assertTrue(allclose(out, expected_out))
198
-
199
- # LHS gradient.
200
- grad = a.grad
201
- expected_grad = acp.grad
202
- self.assertEqual(grad.dim(), 2)
203
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
204
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
205
- self.assertTrue(allclose(grad, expected_grad))
206
-
207
- # RHS gradient.
208
- grad = b.grad
209
- expected_grad = bcp.grad
210
- self.assertEqual(grad.dim(), 2)
211
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
212
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
213
- self.assertTrue(allclose(grad, expected_grad))
214
 
215
  if __name__ == '__main__':
216
  unittest.main()
 
2
  import itertools
3
  import numpy as np
4
  import torch
5
+ # from absl.testing import parameterized
6
 
7
  import stk
8
 
 
96
  return x * mask
97
 
98
 
99
+ # @parameterized.parameters(*_LINEAR_OP_TESTS)
100
+ # class LinearOpsTest(parameterized.TestCase):
101
+ #
102
+ # def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
103
+ # # Construct the operands.
104
+ # a_shape = (k, m) if trans_a else (m, k)
105
+ # a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
106
+ # b_shape = (n, k) if trans_b else (k, n)
107
+ # b, bcp = _dense_2x(*b_shape, dtype)
108
+ #
109
+ # # Execute the matmul.
110
+ # out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
111
+ # expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
112
+ #
113
+ # # Compute the gradients w.r.t. the inputs.
114
+ # expected_out.sum().backward()
115
+ # out.sum().backward()
116
+ #
117
+ # # Validate the results.
118
+ # self.assertEqual(out.dim(), 2)
119
+ # self.assertEqual(expected_out.size()[0], out.size()[0])
120
+ # self.assertEqual(expected_out.size()[1], out.size()[1])
121
+ # self.assertTrue(allclose(out, expected_out))
122
+ #
123
+ # # LHS gradient.
124
+ # grad = stk.ops.to_dense(a.grad)
125
+ # expected_grad = _mask(a_dense.grad, a.grad)
126
+ # self.assertEqual(grad.dim(), 2)
127
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
128
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
129
+ # self.assertTrue(allclose(grad, expected_grad))
130
+ #
131
+ # # RHS gradient.
132
+ # grad = b.grad
133
+ # expected_grad = bcp.grad
134
+ # self.assertEqual(grad.dim(), 2)
135
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
136
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
137
+ # self.assertTrue(allclose(grad, expected_grad))
138
+ #
139
+ # def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
140
+ # # Construct the operands.
141
+ # a_shape = (k, m) if trans_a else (m, k)
142
+ # a, acp = _dense_2x(*a_shape, dtype)
143
+ # b_shape = (n, k) if trans_b else (k, n)
144
+ # b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
145
+ #
146
+ # # Execute the matmul.
147
+ # out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
148
+ # expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
149
+ #
150
+ # # Compute the gradients w.r.t. the inputs.
151
+ # expected_out.sum().backward()
152
+ # out.sum().backward()
153
+ #
154
+ # # Validate the results.
155
+ # self.assertEqual(out.dim(), 2)
156
+ # self.assertEqual(expected_out.size()[0], out.size()[0])
157
+ # self.assertEqual(expected_out.size()[1], out.size()[1])
158
+ # self.assertTrue(allclose(out, expected_out))
159
+ #
160
+ # # LHS gradient.
161
+ # grad = a.grad
162
+ # expected_grad = acp.grad
163
+ # self.assertEqual(grad.dim(), 2)
164
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
165
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
166
+ # self.assertTrue(allclose(grad, expected_grad))
167
+ #
168
+ # # RHS gradient.
169
+ # grad = stk.ops.to_dense(b.grad)
170
+ # expected_grad = _mask(b_dense.grad, b.grad)
171
+ # self.assertEqual(grad.dim(), 2)
172
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
173
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
174
+ # self.assertTrue(allclose(grad, expected_grad))
175
+ #
176
+ # def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
177
+ # # Construct the operands.
178
+ # a_shape = (k, m) if trans_a else (m, k)
179
+ # a, acp = _dense_2x(*a_shape, dtype)
180
+ # b_shape = (n, k) if trans_b else (k, n)
181
+ # b, bcp = _dense_2x(*b_shape, dtype)
182
+ # _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
183
+ #
184
+ # # Execute the matmul.
185
+ # out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
186
+ # expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
187
+ #
188
+ # # Compute the gradients w.r.t. the inputs.
189
+ # expected_out.sum().backward()
190
+ # stk.ops.sum(out).backward()
191
+ #
192
+ # # Validate the results.
193
+ # out = stk.ops.to_dense(out)
194
+ # self.assertEqual(out.dim(), 2)
195
+ # self.assertEqual(expected_out.size()[0], out.size()[0])
196
+ # self.assertEqual(expected_out.size()[1], out.size()[1])
197
+ # self.assertTrue(allclose(out, expected_out))
198
+ #
199
+ # # LHS gradient.
200
+ # grad = a.grad
201
+ # expected_grad = acp.grad
202
+ # self.assertEqual(grad.dim(), 2)
203
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
204
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
205
+ # self.assertTrue(allclose(grad, expected_grad))
206
+ #
207
+ # # RHS gradient.
208
+ # grad = b.grad
209
+ # expected_grad = bcp.grad
210
+ # self.assertEqual(grad.dim(), 2)
211
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
212
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
213
+ # self.assertTrue(allclose(grad, expected_grad))
214
 
215
  if __name__ == '__main__':
216
  unittest.main()
build/torch210-cxx11-cu126-x86_64-linux/stk/ops/matrix_ops_test.py CHANGED
@@ -1,61 +1,61 @@
1
  import unittest
2
 
3
- from absl.testing import parameterized
4
  import stk
5
  import torch
6
 
7
 
8
- @parameterized.parameters(
9
- (8, 16, 0.0, 1),
10
- (8, 16, 0.5, 1),
11
- (8, 16, .95, 1),
12
- (16, 8, 0.0, 1),
13
- (16, 8, 0.5, 1),
14
- (16, 8, .95, 1),
15
- (8, 16, 0.0, 8),
16
- (8, 16, 0.5, 8),
17
- (8, 16, 1.0, 8),
18
- (16, 8, 0.0, 8),
19
- (16, 8, 0.5, 8),
20
- (16, 8, 1.0, 8),
21
- (128, 256, 0.5, 16),
22
- (256, 128, 0.75, 32),
23
- (512, 512, .875, 128))
24
- class MatrixOpsTest(parameterized.TestCase):
25
-
26
- def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
27
- mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
28
- x = (torch.randn(rows, cols) * mask).type(torch.float16)
29
-
30
- # Convert the matrix to sparse format.
31
- sparse_x = stk.ops.to_sparse(x, blocking)
32
-
33
- # Validate the matrix.
34
- sparse_x.validate()
35
-
36
- # Validate the shape.
37
- self.assertEqual(sparse_x.dim(), 2)
38
- self.assertEqual(sparse_x.size()[0], rows)
39
- self.assertEqual(sparse_x.size()[1], cols)
40
-
41
- # Validate the sparsity.
42
- numblocks = rows // blocking * cols // blocking
43
- nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
44
- self.assertEqual(sparse_x.nnz, nnz)
45
-
46
- # Convert back to dense format.
47
- dense_x = stk.ops.to_dense(sparse_x)
48
-
49
- # Validate the shape.
50
- self.assertEqual(dense_x.dim(), 2)
51
- self.assertEqual(dense_x.size()[0], rows)
52
- self.assertEqual(dense_x.size()[1], cols)
53
-
54
- # Validate the sparsity
55
- self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
56
-
57
- # Validate the output.
58
- self.assertTrue(torch.all(torch.eq(x, dense_x)))
59
 
60
 
61
  if __name__ == '__main__':
 
1
  import unittest
2
 
3
+ # from absl.testing import parameterized
4
  import stk
5
  import torch
6
 
7
 
8
+ # @parameterized.parameters(
9
+ # (8, 16, 0.0, 1),
10
+ # (8, 16, 0.5, 1),
11
+ # (8, 16, .95, 1),
12
+ # (16, 8, 0.0, 1),
13
+ # (16, 8, 0.5, 1),
14
+ # (16, 8, .95, 1),
15
+ # (8, 16, 0.0, 8),
16
+ # (8, 16, 0.5, 8),
17
+ # (8, 16, 1.0, 8),
18
+ # (16, 8, 0.0, 8),
19
+ # (16, 8, 0.5, 8),
20
+ # (16, 8, 1.0, 8),
21
+ # (128, 256, 0.5, 16),
22
+ # (256, 128, 0.75, 32),
23
+ # (512, 512, .875, 128))
24
+ # class MatrixOpsTest(parameterized.TestCase):
25
+ #
26
+ # def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
27
+ # mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
28
+ # x = (torch.randn(rows, cols) * mask).type(torch.float16)
29
+ #
30
+ # # Convert the matrix to sparse format.
31
+ # sparse_x = stk.ops.to_sparse(x, blocking)
32
+ #
33
+ # # Validate the matrix.
34
+ # sparse_x.validate()
35
+ #
36
+ # # Validate the shape.
37
+ # self.assertEqual(sparse_x.dim(), 2)
38
+ # self.assertEqual(sparse_x.size()[0], rows)
39
+ # self.assertEqual(sparse_x.size()[1], cols)
40
+ #
41
+ # # Validate the sparsity.
42
+ # numblocks = rows // blocking * cols // blocking
43
+ # nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
44
+ # self.assertEqual(sparse_x.nnz, nnz)
45
+ #
46
+ # # Convert back to dense format.
47
+ # dense_x = stk.ops.to_dense(sparse_x)
48
+ #
49
+ # # Validate the shape.
50
+ # self.assertEqual(dense_x.dim(), 2)
51
+ # self.assertEqual(dense_x.size()[0], rows)
52
+ # self.assertEqual(dense_x.size()[1], cols)
53
+ #
54
+ # # Validate the sparsity
55
+ # self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
56
+ #
57
+ # # Validate the output.
58
+ # self.assertTrue(torch.all(torch.eq(x, dense_x)))
59
 
60
 
61
  if __name__ == '__main__':
build/torch210-cxx11-cu126-x86_64-linux/stk/random/random_ops_test.py CHANGED
@@ -1,72 +1,72 @@
1
  import unittest
2
 
3
- from absl.testing import parameterized
4
  from . import random
5
  import torch
6
 
7
 
8
- @parameterized.parameters(
9
- (8, 16, 0.0, 1),
10
- (8, 16, 0.5, 1),
11
- (8, 16, .95, 1),
12
- (16, 8, 0.0, 1),
13
- (16, 8, 0.5, 1),
14
- (16, 8, .95, 1),
15
- (8, 16, 0.0, 8),
16
- (8, 16, 0.5, 8),
17
- (8, 16, 1.0, 8),
18
- (16, 8, 0.0, 8),
19
- (16, 8, 0.5, 8),
20
- (16, 8, 1.0, 8),
21
- (128, 256, 0.5, 16),
22
- (256, 128, 0.75, 32),
23
- (512, 512, .875, 128))
24
- class RandomOpsTest(parameterized.TestCase):
25
-
26
- def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
27
- mask = random.dense_mask(
28
- rows, cols, sparsity, blocking)
29
-
30
- # Validate the shape.
31
- self.assertEqual(mask.dim(), 2)
32
- self.assertEqual(mask.size()[0], rows)
33
- self.assertEqual(mask.size()[1], cols)
34
-
35
- # Validate the sparsity
36
- numblocks = rows // blocking * cols // blocking
37
- nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
38
- self.assertEqual(
39
- torch.count_nonzero(mask).item(),
40
- nnz)
41
-
42
- # Check values are zero or one.
43
- self.assertTrue(
44
- torch.all(torch.logical_or(
45
- torch.eq(mask, 0),
46
- torch.eq(mask, 1))))
47
-
48
- def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
49
- mask = random.mask(
50
- rows, cols, sparsity, blocking)
51
-
52
- # Validate the matrix.
53
- mask.validate()
54
-
55
- # Validate the shape.
56
- self.assertEqual(mask.dim(), 2)
57
- self.assertEqual(mask.size()[0], rows)
58
- self.assertEqual(mask.size()[1], cols)
59
-
60
- # Validate the sparsity.
61
- numblocks = rows // blocking * cols // blocking
62
- nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
63
- self.assertEqual(mask.nnz, nnz)
64
-
65
- # Check values are zero or one.
66
- self.assertTrue(
67
- torch.all(torch.logical_or(
68
- torch.eq(mask.data, 0),
69
- torch.eq(mask.data, 1))))
70
 
71
 
72
  if __name__ == '__main__':
 
1
  import unittest
2
 
3
+ # from absl.testing import parameterized
4
  from . import random
5
  import torch
6
 
7
 
8
+ # @parameterized.parameters(
9
+ # (8, 16, 0.0, 1),
10
+ # (8, 16, 0.5, 1),
11
+ # (8, 16, .95, 1),
12
+ # (16, 8, 0.0, 1),
13
+ # (16, 8, 0.5, 1),
14
+ # (16, 8, .95, 1),
15
+ # (8, 16, 0.0, 8),
16
+ # (8, 16, 0.5, 8),
17
+ # (8, 16, 1.0, 8),
18
+ # (16, 8, 0.0, 8),
19
+ # (16, 8, 0.5, 8),
20
+ # (16, 8, 1.0, 8),
21
+ # (128, 256, 0.5, 16),
22
+ # (256, 128, 0.75, 32),
23
+ # (512, 512, .875, 128))
24
+ # class RandomOpsTest(parameterized.TestCase):
25
+ #
26
+ # def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
27
+ # mask = random.dense_mask(
28
+ # rows, cols, sparsity, blocking)
29
+ #
30
+ # # Validate the shape.
31
+ # self.assertEqual(mask.dim(), 2)
32
+ # self.assertEqual(mask.size()[0], rows)
33
+ # self.assertEqual(mask.size()[1], cols)
34
+ #
35
+ # # Validate the sparsity
36
+ # numblocks = rows // blocking * cols // blocking
37
+ # nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
38
+ # self.assertEqual(
39
+ # torch.count_nonzero(mask).item(),
40
+ # nnz)
41
+ #
42
+ # # Check values are zero or one.
43
+ # self.assertTrue(
44
+ # torch.all(torch.logical_or(
45
+ # torch.eq(mask, 0),
46
+ # torch.eq(mask, 1))))
47
+ #
48
+ # def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
49
+ # mask = random.mask(
50
+ # rows, cols, sparsity, blocking)
51
+ #
52
+ # # Validate the matrix.
53
+ # mask.validate()
54
+ #
55
+ # # Validate the shape.
56
+ # self.assertEqual(mask.dim(), 2)
57
+ # self.assertEqual(mask.size()[0], rows)
58
+ # self.assertEqual(mask.size()[1], cols)
59
+ #
60
+ # # Validate the sparsity.
61
+ # numblocks = rows // blocking * cols // blocking
62
+ # nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
63
+ # self.assertEqual(mask.nnz, nnz)
64
+ #
65
+ # # Check values are zero or one.
66
+ # self.assertTrue(
67
+ # torch.all(torch.logical_or(
68
+ # torch.eq(mask.data, 0),
69
+ # torch.eq(mask.data, 1))))
70
 
71
 
72
  if __name__ == '__main__':
build/torch210-cxx11-cu128-x86_64-linux/{_megablocks_cuda_6e04dec.abi3.so → _megablocks_cuda_a45325d.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e66fd44576448dc82e7392db0c935cd8654bfcb51db51ddc044e1c33bc82c60
3
  size 21009984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8b110fed233d0db0bef3df539cab1487191a578b89bae5b3fba3f39262f827f
3
  size 21009984
build/torch210-cxx11-cu128-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_cuda_6e04dec
3
- ops = torch.ops._megablocks_cuda_6e04dec
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_cuda_6e04dec::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_cuda_a45325d
3
+ ops = torch.ops._megablocks_cuda_a45325d
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_cuda_a45325d::{op_name}"
build/torch210-cxx11-cu128-x86_64-linux/megablocks/__init__.py CHANGED
@@ -1,10 +1,10 @@
1
  import ctypes
 
2
  import sys
3
-
4
- import importlib
5
  from pathlib import Path
6
  from types import ModuleType
7
 
 
8
  def _import_from_path(file_path: Path) -> ModuleType:
9
  # We cannot use the module name as-is, after adding it to `sys.modules`,
10
  # it would also be used for other imports. So, we make a module name that
 
1
  import ctypes
2
+ import importlib.util
3
  import sys
 
 
4
  from pathlib import Path
5
  from types import ModuleType
6
 
7
+
8
  def _import_from_path(file_path: Path) -> ModuleType:
9
  # We cannot use the module name as-is, after adding it to `sys.modules`,
10
  # it would also be used for other imports. So, we make a module name that
build/torch210-cxx11-cu128-x86_64-linux/ops/histogram_benchmark.py CHANGED
@@ -5,7 +5,7 @@ import unittest
5
 
6
  import numpy as np
7
  import torch
8
- from absl.testing import parameterized
9
 
10
  from .. import ops
11
 
@@ -47,31 +47,31 @@ def log_benchmark(arguments, mean_t, std_t):
47
  print('=' * 60)
48
 
49
 
50
- class HistogramBenchmark(parameterized.TestCase):
51
-
52
- @parameterized.parameters(*_HISTOGRAM_TESTS)
53
- def testHistogram(self, n, dtype, max_val):
54
- x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
55
-
56
- mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
57
- arguments = {
58
- 'n': n,
59
- 'dtype': dtype,
60
- 'max_val': max_val,
61
- }
62
- log_benchmark(arguments, mean_t, std_t)
63
-
64
- @parameterized.parameters(*_HISTOGRAM_TESTS)
65
- def testTorchHistogram(self, n, dtype, max_val):
66
- x = torch.randint(0, 128, (n,)).cuda().to(dtype)
67
-
68
- mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
69
- arguments = {
70
- 'n': n,
71
- 'dtype': dtype,
72
- 'max_val': max_val,
73
- }
74
- log_benchmark(arguments, mean_t, std_t)
75
 
76
 
77
  if __name__ == '__main__':
 
5
 
6
  import numpy as np
7
  import torch
8
+ # from absl.testing import parameterized
9
 
10
  from .. import ops
11
 
 
47
  print('=' * 60)
48
 
49
 
50
+ # class HistogramBenchmark(parameterized.TestCase):
51
+ #
52
+ # @parameterized.parameters(*_HISTOGRAM_TESTS)
53
+ # def testHistogram(self, n, dtype, max_val):
54
+ # x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
55
+ #
56
+ # mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
57
+ # arguments = {
58
+ # 'n': n,
59
+ # 'dtype': dtype,
60
+ # 'max_val': max_val,
61
+ # }
62
+ # log_benchmark(arguments, mean_t, std_t)
63
+ #
64
+ # @parameterized.parameters(*_HISTOGRAM_TESTS)
65
+ # def testTorchHistogram(self, n, dtype, max_val):
66
+ # x = torch.randint(0, 128, (n,)).cuda().to(dtype)
67
+ #
68
+ # mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
69
+ # arguments = {
70
+ # 'n': n,
71
+ # 'dtype': dtype,
72
+ # 'max_val': max_val,
73
+ # }
74
+ # log_benchmark(arguments, mean_t, std_t)
75
 
76
 
77
  if __name__ == '__main__':
build/torch210-cxx11-cu128-x86_64-linux/ops/matmul_benchmark.py CHANGED
@@ -17,7 +17,7 @@ import unittest
17
  from .. import stk
18
 
19
  import torch
20
- from absl.testing import parameterized
21
 
22
  from .. import benchmark_util, ops
23
 
@@ -48,367 +48,367 @@ def log_benchmark(name, arguments, time, std, flops):
48
  print('=' * 60)
49
 
50
 
51
- class MatmulBenchmark(parameterized.TestCase):
52
-
53
- def build_sparse_matrix(self, x, padded_bins, fhs, ne):
54
- blocking = 128
55
- padded_tokens, _ = x.size()
56
- assert padded_tokens % blocking == 0
57
- assert fhs % blocking == 0
58
-
59
- # Offsets for the sparse matrix. All rows have the
60
- # same number of nonzero blocks dictated by the
61
- # dimensionality of a single expert.
62
- block_rows = padded_tokens // blocking
63
- blocks_per_row = fhs // blocking
64
- offsets = torch.arange(
65
- 0,
66
- block_rows * blocks_per_row + 1,
67
- blocks_per_row,
68
- dtype=torch.int32,
69
- device=x.device,
70
- )
71
-
72
- # Indices for the sparse matrix. The indices for
73
- # the intermediate matrix are dynamic depending
74
- # on the mapping of tokens to experts.
75
- column_indices = ops.topology(
76
- padded_bins,
77
- blocking,
78
- block_rows,
79
- blocks_per_row,
80
- )
81
- data = torch.empty(
82
- column_indices.numel(),
83
- blocking,
84
- blocking,
85
- dtype=torch.float16,
86
- device=x.device,
87
- )
88
- shape = (padded_tokens, fhs * ne)
89
- row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
90
- return stk.Matrix(shape, data, row_indices, column_indices, offsets)
91
-
92
- def build_input_matrix(self, sl, hs, ne):
93
- x = torch.randn((sl, hs)).cuda().half()
94
-
95
- # Assign tokens to experts uniformly.
96
- top_expert = torch.arange(0, sl).cuda().int() % ne
97
-
98
- bin_ids, indices = ops.sort(top_expert)
99
- tokens_per_expert = ops.histogram(top_expert, ne)
100
- padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
101
- padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
102
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
103
- out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
104
- return out, padded_bins
105
-
106
- def build_weight_matrix(self, ne, hs, fhs):
107
- return torch.randn((hs, ne * fhs)).cuda().half()
108
-
109
- @parameterized.parameters(*_MATMUL_TESTS)
110
- def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
111
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
112
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
113
- topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
114
- w = transpose_view(w)
115
-
116
- def benchmark():
117
- return stk.ops.sdd(x, w, topo)
118
-
119
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
- arguments = {
121
- 'sequence_length': sl,
122
- 'hidden_size': hs,
123
- 'ffn_hidden_size': fhs,
124
- 'num_experts': ne,
125
- }
126
- log_benchmark(
127
- '0::Fwd::SDD::NT',
128
- arguments,
129
- mean_t,
130
- std_t,
131
- x.numel() * fhs * 2,
132
- )
133
-
134
- @parameterized.parameters(*_MATMUL_TESTS)
135
- def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
136
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
137
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
138
- topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
139
-
140
- def benchmark():
141
- return stk.ops.dsd(topo, w)
142
-
143
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
144
- arguments = {
145
- 'sequence_length': sl,
146
- 'hidden_size': hs,
147
- 'ffn_hidden_size': fhs,
148
- 'num_experts': ne,
149
- }
150
- log_benchmark(
151
- '0::GradX::DSD::NN',
152
- arguments,
153
- mean_t,
154
- std_t,
155
- x.numel() * fhs * 2,
156
- )
157
-
158
- @parameterized.parameters(*_MATMUL_TESTS)
159
- def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
160
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
161
- topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
162
- topo = topo.t()
163
-
164
- def benchmark():
165
- return stk.ops.dsd(topo, x)
166
-
167
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
168
- arguments = {
169
- 'sequence_length': sl,
170
- 'hidden_size': hs,
171
- 'ffn_hidden_size': fhs,
172
- 'num_experts': ne,
173
- }
174
- log_benchmark(
175
- '0::GradW::DSD::TN',
176
- arguments,
177
- mean_t,
178
- std_t,
179
- x.numel() * fhs * 2,
180
- )
181
-
182
- @parameterized.parameters(*_MATMUL_TESTS)
183
- def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
184
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
185
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
186
- x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
187
-
188
- def benchmark():
189
- return stk.ops.dsd(x, w)
190
-
191
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
192
- arguments = {
193
- 'sequence_length': sl,
194
- 'hidden_size': hs,
195
- 'ffn_hidden_size': fhs,
196
- 'num_experts': ne,
197
- }
198
- log_benchmark(
199
- '1::Fwd::DSD::NN',
200
- arguments,
201
- mean_t,
202
- std_t,
203
- x.nnz * hs * 2,
204
- )
205
-
206
- @parameterized.parameters(*_MATMUL_TESTS)
207
- def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
208
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
209
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
210
- x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
211
- out = stk.ops.dsd(x, w)
212
- w = transpose_view(w)
213
-
214
- def benchmark():
215
- return stk.ops.sdd(out, w, x)
216
-
217
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
218
- arguments = {
219
- 'sequence_length': sl,
220
- 'hidden_size': hs,
221
- 'ffn_hidden_size': fhs,
222
- 'num_experts': ne,
223
- }
224
- log_benchmark(
225
- '1::GradX::SDD::NT',
226
- arguments,
227
- mean_t,
228
- std_t,
229
- x.nnz * hs * 2,
230
- )
231
-
232
- @parameterized.parameters(*_MATMUL_TESTS)
233
- def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
234
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
235
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
236
- x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
237
- out = stk.ops.dsd(x, w)
238
- x = x.t()
239
-
240
- def benchmark():
241
- return stk.ops.dsd(x, out)
242
-
243
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
244
- arguments = {
245
- 'sequence_length': sl,
246
- 'hidden_size': hs,
247
- 'ffn_hidden_size': fhs,
248
- 'num_experts': ne,
249
- }
250
- log_benchmark(
251
- '1::GradW::DSD::TN',
252
- arguments,
253
- mean_t,
254
- std_t,
255
- x.nnz * hs * 2,
256
- )
257
-
258
- @parameterized.parameters(*_MATMUL_TESTS)
259
- def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
260
- assert (sl % ne) == 0
261
- x = torch.randn((ne, sl // ne, hs)).cuda().half()
262
- w = torch.randn((ne, hs, fhs)).cuda().half()
263
-
264
- w = w.transpose(1, 2).contiguous()
265
- w = w.transpose(1, 2)
266
-
267
- def benchmark():
268
- return torch.bmm(x, w)
269
-
270
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
271
- arguments = {
272
- 'sequence_length': sl,
273
- 'hidden_size': hs,
274
- 'ffn_hidden_size': fhs,
275
- 'num_experts': ne,
276
- }
277
- log_benchmark(
278
- '0::Fwd:DDD::NT',
279
- arguments,
280
- mean_t,
281
- std_t,
282
- x.numel() * fhs * 2,
283
- )
284
-
285
- @parameterized.parameters(*_MATMUL_TESTS)
286
- def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
287
- assert (sl % ne) == 0
288
- x = torch.randn((ne, sl // ne, hs)).cuda().half()
289
- w = torch.randn((ne, hs, fhs)).cuda().half()
290
- out = torch.bmm(x, w)
291
- w = w.transpose(1, 2).contiguous()
292
-
293
- def benchmark():
294
- return torch.bmm(out, w)
295
-
296
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
297
- arguments = {
298
- 'sequence_length': sl,
299
- 'hidden_size': hs,
300
- 'ffn_hidden_size': fhs,
301
- 'num_experts': ne,
302
- }
303
- log_benchmark(
304
- '0:GradX:DDD::NN',
305
- arguments,
306
- mean_t,
307
- std_t,
308
- x.numel() * fhs * 2,
309
- )
310
-
311
- @parameterized.parameters(*_MATMUL_TESTS)
312
- def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
313
- assert (sl % ne) == 0
314
- x = torch.randn((ne, sl // ne, hs)).cuda().half()
315
- w = torch.randn((ne, hs, fhs)).cuda().half()
316
- out = torch.bmm(x, w)
317
- out = out.transpose(1, 2)
318
-
319
- def benchmark():
320
- return torch.bmm(out, x)
321
-
322
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
323
- arguments = {
324
- 'sequence_length': sl,
325
- 'hidden_size': hs,
326
- 'ffn_hidden_size': fhs,
327
- 'num_experts': ne,
328
- }
329
- log_benchmark(
330
- '0:GradW:DDD::TN',
331
- arguments,
332
- mean_t,
333
- std_t,
334
- x.numel() * fhs * 2,
335
- )
336
-
337
- @parameterized.parameters(*_MATMUL_TESTS)
338
- def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
339
- assert (sl % ne) == 0
340
- x = torch.randn((ne, sl // ne, fhs)).cuda().half()
341
- w = torch.randn((ne, fhs, hs)).cuda().half()
342
-
343
- def benchmark():
344
- return torch.bmm(x, w)
345
-
346
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
347
- arguments = {
348
- 'sequence_length': sl,
349
- 'hidden_size': hs,
350
- 'ffn_hidden_size': fhs,
351
- 'num_experts': ne,
352
- }
353
- log_benchmark(
354
- '1::Fwd::DDD::NN',
355
- arguments,
356
- mean_t,
357
- std_t,
358
- x.numel() * hs * 2,
359
- )
360
-
361
- @parameterized.parameters(*_MATMUL_TESTS)
362
- def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
363
- assert (sl % ne) == 0
364
- x = torch.randn((ne, sl // ne, fhs)).cuda().half()
365
- w = torch.randn((ne, fhs, hs)).cuda().half()
366
- out = torch.bmm(x, w)
367
- w = torch.transpose(w, 1, 2)
368
-
369
- def benchmark():
370
- return torch.bmm(out, w)
371
-
372
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
373
- arguments = {
374
- 'sequence_length': sl,
375
- 'hidden_size': hs,
376
- 'ffn_hidden_size': fhs,
377
- 'num_experts': ne,
378
- }
379
- log_benchmark(
380
- '1::GradX::DDD::NT',
381
- arguments,
382
- mean_t,
383
- std_t,
384
- x.numel() * hs * 2,
385
- )
386
-
387
- @parameterized.parameters(*_MATMUL_TESTS)
388
- def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
389
- assert (sl % ne) == 0
390
- x = torch.randn((ne, sl // ne, fhs)).cuda().half()
391
- w = torch.randn((ne, fhs, hs)).cuda().half()
392
- out = torch.bmm(x, w)
393
- x = torch.transpose(x, 1, 2)
394
-
395
- def benchmark():
396
- return torch.bmm(x, out)
397
-
398
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
399
- arguments = {
400
- 'sequence_length': sl,
401
- 'hidden_size': hs,
402
- 'ffn_hidden_size': fhs,
403
- 'num_experts': ne,
404
- }
405
- log_benchmark(
406
- '1::GradW::DDD::TN',
407
- arguments,
408
- mean_t,
409
- std_t,
410
- x.numel() * hs * 2,
411
- )
412
 
413
 
414
  if __name__ == '__main__':
 
17
  from .. import stk
18
 
19
  import torch
20
+ # from absl.testing import parameterized
21
 
22
  from .. import benchmark_util, ops
23
 
 
48
  print('=' * 60)
49
 
50
 
51
+ # class MatmulBenchmark(parameterized.TestCase):
52
+ #
53
+ # def build_sparse_matrix(self, x, padded_bins, fhs, ne):
54
+ # blocking = 128
55
+ # padded_tokens, _ = x.size()
56
+ # assert padded_tokens % blocking == 0
57
+ # assert fhs % blocking == 0
58
+ #
59
+ # # Offsets for the sparse matrix. All rows have the
60
+ # # same number of nonzero blocks dictated by the
61
+ # # dimensionality of a single expert.
62
+ # block_rows = padded_tokens // blocking
63
+ # blocks_per_row = fhs // blocking
64
+ # offsets = torch.arange(
65
+ # 0,
66
+ # block_rows * blocks_per_row + 1,
67
+ # blocks_per_row,
68
+ # dtype=torch.int32,
69
+ # device=x.device,
70
+ # )
71
+ #
72
+ # # Indices for the sparse matrix. The indices for
73
+ # # the intermediate matrix are dynamic depending
74
+ # # on the mapping of tokens to experts.
75
+ # column_indices = ops.topology(
76
+ # padded_bins,
77
+ # blocking,
78
+ # block_rows,
79
+ # blocks_per_row,
80
+ # )
81
+ # data = torch.empty(
82
+ # column_indices.numel(),
83
+ # blocking,
84
+ # blocking,
85
+ # dtype=torch.float16,
86
+ # device=x.device,
87
+ # )
88
+ # shape = (padded_tokens, fhs * ne)
89
+ # row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
90
+ # return stk.Matrix(shape, data, row_indices, column_indices, offsets)
91
+ #
92
+ # def build_input_matrix(self, sl, hs, ne):
93
+ # x = torch.randn((sl, hs)).cuda().half()
94
+ #
95
+ # # Assign tokens to experts uniformly.
96
+ # top_expert = torch.arange(0, sl).cuda().int() % ne
97
+ #
98
+ # bin_ids, indices = ops.sort(top_expert)
99
+ # tokens_per_expert = ops.histogram(top_expert, ne)
100
+ # padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
101
+ # padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
102
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
103
+ # out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
104
+ # return out, padded_bins
105
+ #
106
+ # def build_weight_matrix(self, ne, hs, fhs):
107
+ # return torch.randn((hs, ne * fhs)).cuda().half()
108
+ #
109
+ # @parameterized.parameters(*_MATMUL_TESTS)
110
+ # def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
111
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
112
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
113
+ # topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
114
+ # w = transpose_view(w)
115
+ #
116
+ # def benchmark():
117
+ # return stk.ops.sdd(x, w, topo)
118
+ #
119
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
+ # arguments = {
121
+ # 'sequence_length': sl,
122
+ # 'hidden_size': hs,
123
+ # 'ffn_hidden_size': fhs,
124
+ # 'num_experts': ne,
125
+ # }
126
+ # log_benchmark(
127
+ # '0::Fwd::SDD::NT',
128
+ # arguments,
129
+ # mean_t,
130
+ # std_t,
131
+ # x.numel() * fhs * 2,
132
+ # )
133
+ #
134
+ # @parameterized.parameters(*_MATMUL_TESTS)
135
+ # def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
136
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
137
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
138
+ # topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
139
+ #
140
+ # def benchmark():
141
+ # return stk.ops.dsd(topo, w)
142
+ #
143
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
144
+ # arguments = {
145
+ # 'sequence_length': sl,
146
+ # 'hidden_size': hs,
147
+ # 'ffn_hidden_size': fhs,
148
+ # 'num_experts': ne,
149
+ # }
150
+ # log_benchmark(
151
+ # '0::GradX::DSD::NN',
152
+ # arguments,
153
+ # mean_t,
154
+ # std_t,
155
+ # x.numel() * fhs * 2,
156
+ # )
157
+ #
158
+ # @parameterized.parameters(*_MATMUL_TESTS)
159
+ # def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
160
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
161
+ # topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
162
+ # topo = topo.t()
163
+ #
164
+ # def benchmark():
165
+ # return stk.ops.dsd(topo, x)
166
+ #
167
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
168
+ # arguments = {
169
+ # 'sequence_length': sl,
170
+ # 'hidden_size': hs,
171
+ # 'ffn_hidden_size': fhs,
172
+ # 'num_experts': ne,
173
+ # }
174
+ # log_benchmark(
175
+ # '0::GradW::DSD::TN',
176
+ # arguments,
177
+ # mean_t,
178
+ # std_t,
179
+ # x.numel() * fhs * 2,
180
+ # )
181
+ #
182
+ # @parameterized.parameters(*_MATMUL_TESTS)
183
+ # def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
184
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
185
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
186
+ # x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
187
+ #
188
+ # def benchmark():
189
+ # return stk.ops.dsd(x, w)
190
+ #
191
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
192
+ # arguments = {
193
+ # 'sequence_length': sl,
194
+ # 'hidden_size': hs,
195
+ # 'ffn_hidden_size': fhs,
196
+ # 'num_experts': ne,
197
+ # }
198
+ # log_benchmark(
199
+ # '1::Fwd::DSD::NN',
200
+ # arguments,
201
+ # mean_t,
202
+ # std_t,
203
+ # x.nnz * hs * 2,
204
+ # )
205
+ #
206
+ # @parameterized.parameters(*_MATMUL_TESTS)
207
+ # def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
208
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
209
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
210
+ # x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
211
+ # out = stk.ops.dsd(x, w)
212
+ # w = transpose_view(w)
213
+ #
214
+ # def benchmark():
215
+ # return stk.ops.sdd(out, w, x)
216
+ #
217
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
218
+ # arguments = {
219
+ # 'sequence_length': sl,
220
+ # 'hidden_size': hs,
221
+ # 'ffn_hidden_size': fhs,
222
+ # 'num_experts': ne,
223
+ # }
224
+ # log_benchmark(
225
+ # '1::GradX::SDD::NT',
226
+ # arguments,
227
+ # mean_t,
228
+ # std_t,
229
+ # x.nnz * hs * 2,
230
+ # )
231
+ #
232
+ # @parameterized.parameters(*_MATMUL_TESTS)
233
+ # def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
234
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
235
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
236
+ # x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
237
+ # out = stk.ops.dsd(x, w)
238
+ # x = x.t()
239
+ #
240
+ # def benchmark():
241
+ # return stk.ops.dsd(x, out)
242
+ #
243
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
244
+ # arguments = {
245
+ # 'sequence_length': sl,
246
+ # 'hidden_size': hs,
247
+ # 'ffn_hidden_size': fhs,
248
+ # 'num_experts': ne,
249
+ # }
250
+ # log_benchmark(
251
+ # '1::GradW::DSD::TN',
252
+ # arguments,
253
+ # mean_t,
254
+ # std_t,
255
+ # x.nnz * hs * 2,
256
+ # )
257
+ #
258
+ # @parameterized.parameters(*_MATMUL_TESTS)
259
+ # def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
260
+ # assert (sl % ne) == 0
261
+ # x = torch.randn((ne, sl // ne, hs)).cuda().half()
262
+ # w = torch.randn((ne, hs, fhs)).cuda().half()
263
+ #
264
+ # w = w.transpose(1, 2).contiguous()
265
+ # w = w.transpose(1, 2)
266
+ #
267
+ # def benchmark():
268
+ # return torch.bmm(x, w)
269
+ #
270
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
271
+ # arguments = {
272
+ # 'sequence_length': sl,
273
+ # 'hidden_size': hs,
274
+ # 'ffn_hidden_size': fhs,
275
+ # 'num_experts': ne,
276
+ # }
277
+ # log_benchmark(
278
+ # '0::Fwd:DDD::NT',
279
+ # arguments,
280
+ # mean_t,
281
+ # std_t,
282
+ # x.numel() * fhs * 2,
283
+ # )
284
+ #
285
+ # @parameterized.parameters(*_MATMUL_TESTS)
286
+ # def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
287
+ # assert (sl % ne) == 0
288
+ # x = torch.randn((ne, sl // ne, hs)).cuda().half()
289
+ # w = torch.randn((ne, hs, fhs)).cuda().half()
290
+ # out = torch.bmm(x, w)
291
+ # w = w.transpose(1, 2).contiguous()
292
+ #
293
+ # def benchmark():
294
+ # return torch.bmm(out, w)
295
+ #
296
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
297
+ # arguments = {
298
+ # 'sequence_length': sl,
299
+ # 'hidden_size': hs,
300
+ # 'ffn_hidden_size': fhs,
301
+ # 'num_experts': ne,
302
+ # }
303
+ # log_benchmark(
304
+ # '0:GradX:DDD::NN',
305
+ # arguments,
306
+ # mean_t,
307
+ # std_t,
308
+ # x.numel() * fhs * 2,
309
+ # )
310
+ #
311
+ # @parameterized.parameters(*_MATMUL_TESTS)
312
+ # def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
313
+ # assert (sl % ne) == 0
314
+ # x = torch.randn((ne, sl // ne, hs)).cuda().half()
315
+ # w = torch.randn((ne, hs, fhs)).cuda().half()
316
+ # out = torch.bmm(x, w)
317
+ # out = out.transpose(1, 2)
318
+ #
319
+ # def benchmark():
320
+ # return torch.bmm(out, x)
321
+ #
322
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
323
+ # arguments = {
324
+ # 'sequence_length': sl,
325
+ # 'hidden_size': hs,
326
+ # 'ffn_hidden_size': fhs,
327
+ # 'num_experts': ne,
328
+ # }
329
+ # log_benchmark(
330
+ # '0:GradW:DDD::TN',
331
+ # arguments,
332
+ # mean_t,
333
+ # std_t,
334
+ # x.numel() * fhs * 2,
335
+ # )
336
+ #
337
+ # @parameterized.parameters(*_MATMUL_TESTS)
338
+ # def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
339
+ # assert (sl % ne) == 0
340
+ # x = torch.randn((ne, sl // ne, fhs)).cuda().half()
341
+ # w = torch.randn((ne, fhs, hs)).cuda().half()
342
+ #
343
+ # def benchmark():
344
+ # return torch.bmm(x, w)
345
+ #
346
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
347
+ # arguments = {
348
+ # 'sequence_length': sl,
349
+ # 'hidden_size': hs,
350
+ # 'ffn_hidden_size': fhs,
351
+ # 'num_experts': ne,
352
+ # }
353
+ # log_benchmark(
354
+ # '1::Fwd::DDD::NN',
355
+ # arguments,
356
+ # mean_t,
357
+ # std_t,
358
+ # x.numel() * hs * 2,
359
+ # )
360
+ #
361
+ # @parameterized.parameters(*_MATMUL_TESTS)
362
+ # def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
363
+ # assert (sl % ne) == 0
364
+ # x = torch.randn((ne, sl // ne, fhs)).cuda().half()
365
+ # w = torch.randn((ne, fhs, hs)).cuda().half()
366
+ # out = torch.bmm(x, w)
367
+ # w = torch.transpose(w, 1, 2)
368
+ #
369
+ # def benchmark():
370
+ # return torch.bmm(out, w)
371
+ #
372
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
373
+ # arguments = {
374
+ # 'sequence_length': sl,
375
+ # 'hidden_size': hs,
376
+ # 'ffn_hidden_size': fhs,
377
+ # 'num_experts': ne,
378
+ # }
379
+ # log_benchmark(
380
+ # '1::GradX::DDD::NT',
381
+ # arguments,
382
+ # mean_t,
383
+ # std_t,
384
+ # x.numel() * hs * 2,
385
+ # )
386
+ #
387
+ # @parameterized.parameters(*_MATMUL_TESTS)
388
+ # def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
389
+ # assert (sl % ne) == 0
390
+ # x = torch.randn((ne, sl // ne, fhs)).cuda().half()
391
+ # w = torch.randn((ne, fhs, hs)).cuda().half()
392
+ # out = torch.bmm(x, w)
393
+ # x = torch.transpose(x, 1, 2)
394
+ #
395
+ # def benchmark():
396
+ # return torch.bmm(x, out)
397
+ #
398
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
399
+ # arguments = {
400
+ # 'sequence_length': sl,
401
+ # 'hidden_size': hs,
402
+ # 'ffn_hidden_size': fhs,
403
+ # 'num_experts': ne,
404
+ # }
405
+ # log_benchmark(
406
+ # '1::GradW::DDD::TN',
407
+ # arguments,
408
+ # mean_t,
409
+ # std_t,
410
+ # x.numel() * hs * 2,
411
+ # )
412
 
413
 
414
  if __name__ == '__main__':
build/torch210-cxx11-cu128-x86_64-linux/ops/padded_scatter_benchmark.py CHANGED
@@ -4,7 +4,7 @@
4
  import unittest
5
 
6
  import torch
7
- from absl.testing import parameterized
8
 
9
  from .. import benchmark_util, ops
10
 
@@ -16,50 +16,50 @@ _PADDED_SCATTER_BENCHMARK = (
16
  )
17
 
18
 
19
- class PaddedScatterTest(parameterized.TestCase):
20
-
21
- @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
22
- def testPaddedScatter(self, sl, hs, ne, top_k):
23
- # Create the data and indices.
24
- x = torch.randn((sl, hs)).cuda().half()
25
-
26
- # Randomly assign tokens to experts.
27
- top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
28
- bin_ids, indices = ops.sort(top_expert)
29
- tokens_per_expert = ops.histogram(top_expert, ne)
30
- padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
31
- padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
32
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
33
-
34
- # Sample weights for the scatter reduce.
35
- weights = torch.rand((sl * top_k,)).cuda().half()
36
-
37
- # Gather the data to prepare for backwards.
38
- x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
39
-
40
- def benchmark():
41
- return ops.padded_scatter(
42
- x,
43
- indices,
44
- bin_ids,
45
- weights,
46
- bins,
47
- padded_bins,
48
- top_k,
49
- )
50
-
51
- time, std = benchmark_util.benchmark_function(benchmark)
52
- benchmark_util.log_benchmark(
53
- 'Padded Scatter',
54
- {
55
- 'sequence_length': sl,
56
- 'hidden_size': hs,
57
- 'num_experts': ne,
58
- 'top_k': top_k,
59
- },
60
- time,
61
- std,
62
- )
63
 
64
 
65
  if __name__ == '__main__':
 
4
  import unittest
5
 
6
  import torch
7
+ # from absl.testing import parameterized
8
 
9
  from .. import benchmark_util, ops
10
 
 
16
  )
17
 
18
 
19
+ # class PaddedScatterTest(parameterized.TestCase):
20
+ #
21
+ # @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
22
+ # def testPaddedScatter(self, sl, hs, ne, top_k):
23
+ # # Create the data and indices.
24
+ # x = torch.randn((sl, hs)).cuda().half()
25
+ #
26
+ # # Randomly assign tokens to experts.
27
+ # top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
28
+ # bin_ids, indices = ops.sort(top_expert)
29
+ # tokens_per_expert = ops.histogram(top_expert, ne)
30
+ # padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
31
+ # padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
32
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
33
+ #
34
+ # # Sample weights for the scatter reduce.
35
+ # weights = torch.rand((sl * top_k,)).cuda().half()
36
+ #
37
+ # # Gather the data to prepare for backwards.
38
+ # x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
39
+ #
40
+ # def benchmark():
41
+ # return ops.padded_scatter(
42
+ # x,
43
+ # indices,
44
+ # bin_ids,
45
+ # weights,
46
+ # bins,
47
+ # padded_bins,
48
+ # top_k,
49
+ # )
50
+ #
51
+ # time, std = benchmark_util.benchmark_function(benchmark)
52
+ # benchmark_util.log_benchmark(
53
+ # 'Padded Scatter',
54
+ # {
55
+ # 'sequence_length': sl,
56
+ # 'hidden_size': hs,
57
+ # 'num_experts': ne,
58
+ # 'top_k': top_k,
59
+ # },
60
+ # time,
61
+ # std,
62
+ # )
63
 
64
 
65
  if __name__ == '__main__':
build/torch210-cxx11-cu128-x86_64-linux/ops/permute_benchmark.py CHANGED
@@ -4,7 +4,7 @@
4
  import unittest
5
 
6
  import torch
7
- from absl.testing import parameterized
8
 
9
  from .. import benchmark_util, ops
10
 
@@ -26,123 +26,123 @@ _PERMUTE_TESTS = (
26
  )
27
 
28
 
29
- class PermuteBenchmark(parameterized.TestCase):
30
-
31
- @parameterized.parameters(*_PERMUTE_TESTS)
32
- def testBinnedGather(self, sl, hs, ne):
33
- # NOTE: Capacity factor == 1.
34
- ec = sl // ne
35
-
36
- # Create the data and indices.
37
- x = torch.randn((sl, hs)).cuda().half()
38
- top_expert = torch.randint(0, ne, (sl,)).cuda().int()
39
- bin_ids, indices = ops.sort(top_expert)
40
- tokens_per_expert = ops.histogram(indices, ne)
41
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
42
-
43
- def benchmark():
44
- return ops.binned_gather(x, indices, bins, ec)
45
-
46
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
47
- arguments = {
48
- 'sequence_length': sl,
49
- 'hidden_size': hs,
50
- 'num_experts': ne,
51
- }
52
- benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
53
-
54
- @parameterized.parameters(*_PERMUTE_TESTS)
55
- def testBinnedScatter(self, sl, hs, ne):
56
- # NOTE: Capacity factor == 1.
57
- ec = sl // ne
58
-
59
- # Create the data and indices.
60
- x = torch.randn((sl, hs)).cuda().half()
61
- top_expert = torch.randint(0, ne, (sl,)).cuda().int()
62
- bin_ids, indices = ops.sort(top_expert)
63
- tokens_per_expert = ops.histogram(indices, ne)
64
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
65
- x = ops.binned_gather(x, indices, bins, ec)
66
-
67
- def benchmark():
68
- return ops.binned_scatter(x, indices, bins)
69
-
70
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
71
- arguments = {
72
- 'sequence_length': sl,
73
- 'hidden_size': hs,
74
- 'num_experts': ne,
75
- }
76
- benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
77
-
78
- @parameterized.parameters(*_PERMUTE_TESTS)
79
- def testPaddedGather(self, sl, hs, ne):
80
- # Create the data and indices.
81
- x = torch.randn((sl, hs)).cuda().half()
82
-
83
- # Randomly assign tokens to experts.
84
- top_expert = torch.randint(0, ne, (sl,)).cuda().int()
85
- bin_ids, indices = ops.sort(top_expert)
86
- tokens_per_expert = ops.histogram(top_expert, ne)
87
- padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
88
- padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
89
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
90
-
91
- def benchmark():
92
- return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
93
-
94
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
95
- arguments = {
96
- 'sequence_length': sl,
97
- 'hidden_size': hs,
98
- 'num_experts': ne,
99
- }
100
- benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
101
-
102
- @parameterized.parameters(*_PERMUTE_TESTS)
103
- def testPaddedScatter(self, sl, hs, ne):
104
- # Create the data and indices.
105
- x = torch.randn((sl, hs)).cuda().half()
106
-
107
- # Randomly assign tokens to experts.
108
- top_expert = torch.randint(0, ne, (sl,)).cuda().int()
109
- bin_ids, indices = ops.sort(top_expert)
110
- tokens_per_expert = ops.histogram(top_expert, ne)
111
- padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
112
- padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
113
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
114
- x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
115
-
116
- def benchmark():
117
- return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
118
-
119
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
- arguments = {
121
- 'sequence_length': sl,
122
- 'hidden_size': hs,
123
- 'num_experts': ne,
124
- }
125
- benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
126
-
127
- @parameterized.parameters(*_PERMUTE_TESTS)
128
- def testCopy(self, sl, hs, ne):
129
- # NOTE: Capacity factor == 1.
130
- # ec = sl // ne
131
-
132
- # Create the data and indices.
133
- x = torch.randn((sl, hs)).cuda().half()
134
- y = x.clone()
135
-
136
- def benchmark():
137
- return y.copy_(x)
138
-
139
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
140
- arguments = {
141
- 'sequence_length': sl,
142
- 'hidden_size': hs,
143
- 'num_experts': ne,
144
- }
145
- benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
146
 
147
 
148
  if __name__ == '__main__':
 
4
  import unittest
5
 
6
  import torch
7
+ # from absl.testing import parameterized
8
 
9
  from .. import benchmark_util, ops
10
 
 
26
  )
27
 
28
 
29
+ # class PermuteBenchmark(parameterized.TestCase):
30
+ #
31
+ # @parameterized.parameters(*_PERMUTE_TESTS)
32
+ # def testBinnedGather(self, sl, hs, ne):
33
+ # # NOTE: Capacity factor == 1.
34
+ # ec = sl // ne
35
+ #
36
+ # # Create the data and indices.
37
+ # x = torch.randn((sl, hs)).cuda().half()
38
+ # top_expert = torch.randint(0, ne, (sl,)).cuda().int()
39
+ # bin_ids, indices = ops.sort(top_expert)
40
+ # tokens_per_expert = ops.histogram(indices, ne)
41
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
42
+ #
43
+ # def benchmark():
44
+ # return ops.binned_gather(x, indices, bins, ec)
45
+ #
46
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
47
+ # arguments = {
48
+ # 'sequence_length': sl,
49
+ # 'hidden_size': hs,
50
+ # 'num_experts': ne,
51
+ # }
52
+ # benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
53
+ #
54
+ # @parameterized.parameters(*_PERMUTE_TESTS)
55
+ # def testBinnedScatter(self, sl, hs, ne):
56
+ # # NOTE: Capacity factor == 1.
57
+ # ec = sl // ne
58
+ #
59
+ # # Create the data and indices.
60
+ # x = torch.randn((sl, hs)).cuda().half()
61
+ # top_expert = torch.randint(0, ne, (sl,)).cuda().int()
62
+ # bin_ids, indices = ops.sort(top_expert)
63
+ # tokens_per_expert = ops.histogram(indices, ne)
64
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
65
+ # x = ops.binned_gather(x, indices, bins, ec)
66
+ #
67
+ # def benchmark():
68
+ # return ops.binned_scatter(x, indices, bins)
69
+ #
70
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
71
+ # arguments = {
72
+ # 'sequence_length': sl,
73
+ # 'hidden_size': hs,
74
+ # 'num_experts': ne,
75
+ # }
76
+ # benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
77
+ #
78
+ # @parameterized.parameters(*_PERMUTE_TESTS)
79
+ # def testPaddedGather(self, sl, hs, ne):
80
+ # # Create the data and indices.
81
+ # x = torch.randn((sl, hs)).cuda().half()
82
+ #
83
+ # # Randomly assign tokens to experts.
84
+ # top_expert = torch.randint(0, ne, (sl,)).cuda().int()
85
+ # bin_ids, indices = ops.sort(top_expert)
86
+ # tokens_per_expert = ops.histogram(top_expert, ne)
87
+ # padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
88
+ # padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
89
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
90
+ #
91
+ # def benchmark():
92
+ # return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
93
+ #
94
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
95
+ # arguments = {
96
+ # 'sequence_length': sl,
97
+ # 'hidden_size': hs,
98
+ # 'num_experts': ne,
99
+ # }
100
+ # benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
101
+ #
102
+ # @parameterized.parameters(*_PERMUTE_TESTS)
103
+ # def testPaddedScatter(self, sl, hs, ne):
104
+ # # Create the data and indices.
105
+ # x = torch.randn((sl, hs)).cuda().half()
106
+ #
107
+ # # Randomly assign tokens to experts.
108
+ # top_expert = torch.randint(0, ne, (sl,)).cuda().int()
109
+ # bin_ids, indices = ops.sort(top_expert)
110
+ # tokens_per_expert = ops.histogram(top_expert, ne)
111
+ # padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
112
+ # padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
113
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
114
+ # x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
115
+ #
116
+ # def benchmark():
117
+ # return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
118
+ #
119
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
+ # arguments = {
121
+ # 'sequence_length': sl,
122
+ # 'hidden_size': hs,
123
+ # 'num_experts': ne,
124
+ # }
125
+ # benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
126
+ #
127
+ # @parameterized.parameters(*_PERMUTE_TESTS)
128
+ # def testCopy(self, sl, hs, ne):
129
+ # # NOTE: Capacity factor == 1.
130
+ # # ec = sl // ne
131
+ #
132
+ # # Create the data and indices.
133
+ # x = torch.randn((sl, hs)).cuda().half()
134
+ # y = x.clone()
135
+ #
136
+ # def benchmark():
137
+ # return y.copy_(x)
138
+ #
139
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
140
+ # arguments = {
141
+ # 'sequence_length': sl,
142
+ # 'hidden_size': hs,
143
+ # 'num_experts': ne,
144
+ # }
145
+ # benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
146
 
147
 
148
  if __name__ == '__main__':
build/torch210-cxx11-cu128-x86_64-linux/ops/sort_benchmark.py CHANGED
@@ -5,7 +5,7 @@ import unittest
5
 
6
  import numpy as np
7
  import torch
8
- from absl.testing import parameterized
9
 
10
  from .. import ops
11
 
@@ -53,32 +53,32 @@ def log_benchmark(arguments, mean_t, std_t):
53
  print('=' * 60)
54
 
55
 
56
- class SortBenchmark(parameterized.TestCase):
57
-
58
- @parameterized.parameters(*_SORT_TESTS)
59
- def testSort(self, n, dtype, max_val):
60
- if max_val is None:
61
- max_val = np.iinfo(numpy_dtype(dtype)).max
62
- end_bit = int(np.ceil(np.log2(max_val)))
63
- x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
64
-
65
- mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
66
- arguments = {
67
- 'n': n,
68
- 'dtype': dtype,
69
- 'max_val': max_val,
70
- }
71
- log_benchmark(arguments, mean_t, std_t)
72
-
73
- @parameterized.parameters(*_BASELINE_SORT_TESTS)
74
- def testTorchSort(self, n):
75
- x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
76
-
77
- mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
78
- arguments = {
79
- 'n': n,
80
- }
81
- log_benchmark(arguments, mean_t, std_t)
82
 
83
 
84
  if __name__ == '__main__':
 
5
 
6
  import numpy as np
7
  import torch
8
+ # from absl.testing import parameterized
9
 
10
  from .. import ops
11
 
 
53
  print('=' * 60)
54
 
55
 
56
+ # class SortBenchmark(parameterized.TestCase):
57
+ #
58
+ # @parameterized.parameters(*_SORT_TESTS)
59
+ # def testSort(self, n, dtype, max_val):
60
+ # if max_val is None:
61
+ # max_val = np.iinfo(numpy_dtype(dtype)).max
62
+ # end_bit = int(np.ceil(np.log2(max_val)))
63
+ # x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
64
+ #
65
+ # mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
66
+ # arguments = {
67
+ # 'n': n,
68
+ # 'dtype': dtype,
69
+ # 'max_val': max_val,
70
+ # }
71
+ # log_benchmark(arguments, mean_t, std_t)
72
+ #
73
+ # @parameterized.parameters(*_BASELINE_SORT_TESTS)
74
+ # def testTorchSort(self, n):
75
+ # x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
76
+ #
77
+ # mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
78
+ # arguments = {
79
+ # 'n': n,
80
+ # }
81
+ # log_benchmark(arguments, mean_t, std_t)
82
 
83
 
84
  if __name__ == '__main__':
build/torch210-cxx11-cu128-x86_64-linux/stk/ops/eltwise_ops_test.py CHANGED
@@ -1,7 +1,7 @@
1
  import unittest
2
  import itertools
3
  import torch
4
- from absl.testing import parameterized
5
 
6
  import stk
7
  from stk.ops.linear_ops_test import allclose, _dense_and_sparse
@@ -47,40 +47,40 @@ def _dense_and_sparse_like(x, std=0.1):
47
  return (dense.requires_grad_(True),
48
  sparse.requires_grad_(True))
49
 
50
- @parameterized.parameters(_ELTWISE_OP_TESTS)
51
- class EltwiseOpsTest(parameterized.TestCase):
52
-
53
- def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
54
-
55
- a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
56
- b_dense, b = _dense_and_sparse_like(a)
57
-
58
- out = stk.ops.mul(a, b)
59
- expected_out = torch.mul(a_dense, b_dense)
60
-
61
- # Compute the gradients w.r.t. the inputs.
62
- expected_out.sum().backward()
63
- stk.ops.sum(out).backward()
64
-
65
- # Validate the results.
66
- out = stk.ops.to_dense(out)
67
- self.assertEqual(out.dim(), 2)
68
- self.assertEqual(expected_out.size(), out.size())
69
- self.assertTrue(allclose(out, expected_out))
70
-
71
- # LHS gradient.
72
- grad = stk.ops.to_dense(a.grad)
73
- expected_grad = a_dense.grad
74
- self.assertEqual(grad.dim(), 2)
75
- self.assertEqual(expected_grad.size(), grad.size())
76
- self.assertTrue(allclose(grad, expected_grad))
77
-
78
- # RHS gradient.
79
- grad = stk.ops.to_dense(b.grad)
80
- expected_grad = b_dense.grad
81
- self.assertEqual(grad.dim(), 2)
82
- self.assertEqual(expected_grad.size(), grad.size())
83
- self.assertTrue(allclose(grad, expected_grad))
84
 
85
  if __name__ == '__main__':
86
  unittest.main()
 
1
  import unittest
2
  import itertools
3
  import torch
4
+ # from absl.testing import parameterized
5
 
6
  import stk
7
  from stk.ops.linear_ops_test import allclose, _dense_and_sparse
 
47
  return (dense.requires_grad_(True),
48
  sparse.requires_grad_(True))
49
 
50
+ # @parameterized.parameters(_ELTWISE_OP_TESTS)
51
+ # class EltwiseOpsTest(parameterized.TestCase):
52
+ #
53
+ # def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
54
+ #
55
+ # a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
56
+ # b_dense, b = _dense_and_sparse_like(a)
57
+ #
58
+ # out = stk.ops.mul(a, b)
59
+ # expected_out = torch.mul(a_dense, b_dense)
60
+ #
61
+ # # Compute the gradients w.r.t. the inputs.
62
+ # expected_out.sum().backward()
63
+ # stk.ops.sum(out).backward()
64
+ #
65
+ # # Validate the results.
66
+ # out = stk.ops.to_dense(out)
67
+ # self.assertEqual(out.dim(), 2)
68
+ # self.assertEqual(expected_out.size(), out.size())
69
+ # self.assertTrue(allclose(out, expected_out))
70
+ #
71
+ # # LHS gradient.
72
+ # grad = stk.ops.to_dense(a.grad)
73
+ # expected_grad = a_dense.grad
74
+ # self.assertEqual(grad.dim(), 2)
75
+ # self.assertEqual(expected_grad.size(), grad.size())
76
+ # self.assertTrue(allclose(grad, expected_grad))
77
+ #
78
+ # # RHS gradient.
79
+ # grad = stk.ops.to_dense(b.grad)
80
+ # expected_grad = b_dense.grad
81
+ # self.assertEqual(grad.dim(), 2)
82
+ # self.assertEqual(expected_grad.size(), grad.size())
83
+ # self.assertTrue(allclose(grad, expected_grad))
84
 
85
  if __name__ == '__main__':
86
  unittest.main()
build/torch210-cxx11-cu128-x86_64-linux/stk/ops/linear_ops_test.py CHANGED
@@ -2,7 +2,7 @@ import unittest
2
  import itertools
3
  import numpy as np
4
  import torch
5
- from absl.testing import parameterized
6
 
7
  import stk
8
 
@@ -96,121 +96,121 @@ def _mask(x, mask):
96
  return x * mask
97
 
98
 
99
- @parameterized.parameters(*_LINEAR_OP_TESTS)
100
- class LinearOpsTest(parameterized.TestCase):
101
-
102
- def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
103
- # Construct the operands.
104
- a_shape = (k, m) if trans_a else (m, k)
105
- a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
106
- b_shape = (n, k) if trans_b else (k, n)
107
- b, bcp = _dense_2x(*b_shape, dtype)
108
-
109
- # Execute the matmul.
110
- out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
111
- expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
112
-
113
- # Compute the gradients w.r.t. the inputs.
114
- expected_out.sum().backward()
115
- out.sum().backward()
116
-
117
- # Validate the results.
118
- self.assertEqual(out.dim(), 2)
119
- self.assertEqual(expected_out.size()[0], out.size()[0])
120
- self.assertEqual(expected_out.size()[1], out.size()[1])
121
- self.assertTrue(allclose(out, expected_out))
122
-
123
- # LHS gradient.
124
- grad = stk.ops.to_dense(a.grad)
125
- expected_grad = _mask(a_dense.grad, a.grad)
126
- self.assertEqual(grad.dim(), 2)
127
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
128
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
129
- self.assertTrue(allclose(grad, expected_grad))
130
-
131
- # RHS gradient.
132
- grad = b.grad
133
- expected_grad = bcp.grad
134
- self.assertEqual(grad.dim(), 2)
135
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
136
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
137
- self.assertTrue(allclose(grad, expected_grad))
138
-
139
- def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
140
- # Construct the operands.
141
- a_shape = (k, m) if trans_a else (m, k)
142
- a, acp = _dense_2x(*a_shape, dtype)
143
- b_shape = (n, k) if trans_b else (k, n)
144
- b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
145
-
146
- # Execute the matmul.
147
- out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
148
- expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
149
-
150
- # Compute the gradients w.r.t. the inputs.
151
- expected_out.sum().backward()
152
- out.sum().backward()
153
-
154
- # Validate the results.
155
- self.assertEqual(out.dim(), 2)
156
- self.assertEqual(expected_out.size()[0], out.size()[0])
157
- self.assertEqual(expected_out.size()[1], out.size()[1])
158
- self.assertTrue(allclose(out, expected_out))
159
-
160
- # LHS gradient.
161
- grad = a.grad
162
- expected_grad = acp.grad
163
- self.assertEqual(grad.dim(), 2)
164
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
165
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
166
- self.assertTrue(allclose(grad, expected_grad))
167
-
168
- # RHS gradient.
169
- grad = stk.ops.to_dense(b.grad)
170
- expected_grad = _mask(b_dense.grad, b.grad)
171
- self.assertEqual(grad.dim(), 2)
172
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
173
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
174
- self.assertTrue(allclose(grad, expected_grad))
175
-
176
- def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
177
- # Construct the operands.
178
- a_shape = (k, m) if trans_a else (m, k)
179
- a, acp = _dense_2x(*a_shape, dtype)
180
- b_shape = (n, k) if trans_b else (k, n)
181
- b, bcp = _dense_2x(*b_shape, dtype)
182
- _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
183
-
184
- # Execute the matmul.
185
- out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
186
- expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
187
-
188
- # Compute the gradients w.r.t. the inputs.
189
- expected_out.sum().backward()
190
- stk.ops.sum(out).backward()
191
-
192
- # Validate the results.
193
- out = stk.ops.to_dense(out)
194
- self.assertEqual(out.dim(), 2)
195
- self.assertEqual(expected_out.size()[0], out.size()[0])
196
- self.assertEqual(expected_out.size()[1], out.size()[1])
197
- self.assertTrue(allclose(out, expected_out))
198
-
199
- # LHS gradient.
200
- grad = a.grad
201
- expected_grad = acp.grad
202
- self.assertEqual(grad.dim(), 2)
203
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
204
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
205
- self.assertTrue(allclose(grad, expected_grad))
206
-
207
- # RHS gradient.
208
- grad = b.grad
209
- expected_grad = bcp.grad
210
- self.assertEqual(grad.dim(), 2)
211
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
212
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
213
- self.assertTrue(allclose(grad, expected_grad))
214
 
215
  if __name__ == '__main__':
216
  unittest.main()
 
2
  import itertools
3
  import numpy as np
4
  import torch
5
+ # from absl.testing import parameterized
6
 
7
  import stk
8
 
 
96
  return x * mask
97
 
98
 
99
+ # @parameterized.parameters(*_LINEAR_OP_TESTS)
100
+ # class LinearOpsTest(parameterized.TestCase):
101
+ #
102
+ # def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
103
+ # # Construct the operands.
104
+ # a_shape = (k, m) if trans_a else (m, k)
105
+ # a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
106
+ # b_shape = (n, k) if trans_b else (k, n)
107
+ # b, bcp = _dense_2x(*b_shape, dtype)
108
+ #
109
+ # # Execute the matmul.
110
+ # out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
111
+ # expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
112
+ #
113
+ # # Compute the gradients w.r.t. the inputs.
114
+ # expected_out.sum().backward()
115
+ # out.sum().backward()
116
+ #
117
+ # # Validate the results.
118
+ # self.assertEqual(out.dim(), 2)
119
+ # self.assertEqual(expected_out.size()[0], out.size()[0])
120
+ # self.assertEqual(expected_out.size()[1], out.size()[1])
121
+ # self.assertTrue(allclose(out, expected_out))
122
+ #
123
+ # # LHS gradient.
124
+ # grad = stk.ops.to_dense(a.grad)
125
+ # expected_grad = _mask(a_dense.grad, a.grad)
126
+ # self.assertEqual(grad.dim(), 2)
127
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
128
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
129
+ # self.assertTrue(allclose(grad, expected_grad))
130
+ #
131
+ # # RHS gradient.
132
+ # grad = b.grad
133
+ # expected_grad = bcp.grad
134
+ # self.assertEqual(grad.dim(), 2)
135
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
136
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
137
+ # self.assertTrue(allclose(grad, expected_grad))
138
+ #
139
+ # def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
140
+ # # Construct the operands.
141
+ # a_shape = (k, m) if trans_a else (m, k)
142
+ # a, acp = _dense_2x(*a_shape, dtype)
143
+ # b_shape = (n, k) if trans_b else (k, n)
144
+ # b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
145
+ #
146
+ # # Execute the matmul.
147
+ # out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
148
+ # expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
149
+ #
150
+ # # Compute the gradients w.r.t. the inputs.
151
+ # expected_out.sum().backward()
152
+ # out.sum().backward()
153
+ #
154
+ # # Validate the results.
155
+ # self.assertEqual(out.dim(), 2)
156
+ # self.assertEqual(expected_out.size()[0], out.size()[0])
157
+ # self.assertEqual(expected_out.size()[1], out.size()[1])
158
+ # self.assertTrue(allclose(out, expected_out))
159
+ #
160
+ # # LHS gradient.
161
+ # grad = a.grad
162
+ # expected_grad = acp.grad
163
+ # self.assertEqual(grad.dim(), 2)
164
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
165
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
166
+ # self.assertTrue(allclose(grad, expected_grad))
167
+ #
168
+ # # RHS gradient.
169
+ # grad = stk.ops.to_dense(b.grad)
170
+ # expected_grad = _mask(b_dense.grad, b.grad)
171
+ # self.assertEqual(grad.dim(), 2)
172
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
173
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
174
+ # self.assertTrue(allclose(grad, expected_grad))
175
+ #
176
+ # def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
177
+ # # Construct the operands.
178
+ # a_shape = (k, m) if trans_a else (m, k)
179
+ # a, acp = _dense_2x(*a_shape, dtype)
180
+ # b_shape = (n, k) if trans_b else (k, n)
181
+ # b, bcp = _dense_2x(*b_shape, dtype)
182
+ # _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
183
+ #
184
+ # # Execute the matmul.
185
+ # out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
186
+ # expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
187
+ #
188
+ # # Compute the gradients w.r.t. the inputs.
189
+ # expected_out.sum().backward()
190
+ # stk.ops.sum(out).backward()
191
+ #
192
+ # # Validate the results.
193
+ # out = stk.ops.to_dense(out)
194
+ # self.assertEqual(out.dim(), 2)
195
+ # self.assertEqual(expected_out.size()[0], out.size()[0])
196
+ # self.assertEqual(expected_out.size()[1], out.size()[1])
197
+ # self.assertTrue(allclose(out, expected_out))
198
+ #
199
+ # # LHS gradient.
200
+ # grad = a.grad
201
+ # expected_grad = acp.grad
202
+ # self.assertEqual(grad.dim(), 2)
203
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
204
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
205
+ # self.assertTrue(allclose(grad, expected_grad))
206
+ #
207
+ # # RHS gradient.
208
+ # grad = b.grad
209
+ # expected_grad = bcp.grad
210
+ # self.assertEqual(grad.dim(), 2)
211
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
212
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
213
+ # self.assertTrue(allclose(grad, expected_grad))
214
 
215
  if __name__ == '__main__':
216
  unittest.main()
build/torch210-cxx11-cu128-x86_64-linux/stk/ops/matrix_ops_test.py CHANGED
@@ -1,61 +1,61 @@
1
  import unittest
2
 
3
- from absl.testing import parameterized
4
  import stk
5
  import torch
6
 
7
 
8
- @parameterized.parameters(
9
- (8, 16, 0.0, 1),
10
- (8, 16, 0.5, 1),
11
- (8, 16, .95, 1),
12
- (16, 8, 0.0, 1),
13
- (16, 8, 0.5, 1),
14
- (16, 8, .95, 1),
15
- (8, 16, 0.0, 8),
16
- (8, 16, 0.5, 8),
17
- (8, 16, 1.0, 8),
18
- (16, 8, 0.0, 8),
19
- (16, 8, 0.5, 8),
20
- (16, 8, 1.0, 8),
21
- (128, 256, 0.5, 16),
22
- (256, 128, 0.75, 32),
23
- (512, 512, .875, 128))
24
- class MatrixOpsTest(parameterized.TestCase):
25
-
26
- def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
27
- mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
28
- x = (torch.randn(rows, cols) * mask).type(torch.float16)
29
-
30
- # Convert the matrix to sparse format.
31
- sparse_x = stk.ops.to_sparse(x, blocking)
32
-
33
- # Validate the matrix.
34
- sparse_x.validate()
35
-
36
- # Validate the shape.
37
- self.assertEqual(sparse_x.dim(), 2)
38
- self.assertEqual(sparse_x.size()[0], rows)
39
- self.assertEqual(sparse_x.size()[1], cols)
40
-
41
- # Validate the sparsity.
42
- numblocks = rows // blocking * cols // blocking
43
- nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
44
- self.assertEqual(sparse_x.nnz, nnz)
45
-
46
- # Convert back to dense format.
47
- dense_x = stk.ops.to_dense(sparse_x)
48
-
49
- # Validate the shape.
50
- self.assertEqual(dense_x.dim(), 2)
51
- self.assertEqual(dense_x.size()[0], rows)
52
- self.assertEqual(dense_x.size()[1], cols)
53
-
54
- # Validate the sparsity
55
- self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
56
-
57
- # Validate the output.
58
- self.assertTrue(torch.all(torch.eq(x, dense_x)))
59
 
60
 
61
  if __name__ == '__main__':
 
1
  import unittest
2
 
3
+ # from absl.testing import parameterized
4
  import stk
5
  import torch
6
 
7
 
8
+ # @parameterized.parameters(
9
+ # (8, 16, 0.0, 1),
10
+ # (8, 16, 0.5, 1),
11
+ # (8, 16, .95, 1),
12
+ # (16, 8, 0.0, 1),
13
+ # (16, 8, 0.5, 1),
14
+ # (16, 8, .95, 1),
15
+ # (8, 16, 0.0, 8),
16
+ # (8, 16, 0.5, 8),
17
+ # (8, 16, 1.0, 8),
18
+ # (16, 8, 0.0, 8),
19
+ # (16, 8, 0.5, 8),
20
+ # (16, 8, 1.0, 8),
21
+ # (128, 256, 0.5, 16),
22
+ # (256, 128, 0.75, 32),
23
+ # (512, 512, .875, 128))
24
+ # class MatrixOpsTest(parameterized.TestCase):
25
+ #
26
+ # def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
27
+ # mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
28
+ # x = (torch.randn(rows, cols) * mask).type(torch.float16)
29
+ #
30
+ # # Convert the matrix to sparse format.
31
+ # sparse_x = stk.ops.to_sparse(x, blocking)
32
+ #
33
+ # # Validate the matrix.
34
+ # sparse_x.validate()
35
+ #
36
+ # # Validate the shape.
37
+ # self.assertEqual(sparse_x.dim(), 2)
38
+ # self.assertEqual(sparse_x.size()[0], rows)
39
+ # self.assertEqual(sparse_x.size()[1], cols)
40
+ #
41
+ # # Validate the sparsity.
42
+ # numblocks = rows // blocking * cols // blocking
43
+ # nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
44
+ # self.assertEqual(sparse_x.nnz, nnz)
45
+ #
46
+ # # Convert back to dense format.
47
+ # dense_x = stk.ops.to_dense(sparse_x)
48
+ #
49
+ # # Validate the shape.
50
+ # self.assertEqual(dense_x.dim(), 2)
51
+ # self.assertEqual(dense_x.size()[0], rows)
52
+ # self.assertEqual(dense_x.size()[1], cols)
53
+ #
54
+ # # Validate the sparsity
55
+ # self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
56
+ #
57
+ # # Validate the output.
58
+ # self.assertTrue(torch.all(torch.eq(x, dense_x)))
59
 
60
 
61
  if __name__ == '__main__':
build/torch210-cxx11-cu128-x86_64-linux/stk/random/random_ops_test.py CHANGED
@@ -1,72 +1,72 @@
1
  import unittest
2
 
3
- from absl.testing import parameterized
4
  from . import random
5
  import torch
6
 
7
 
8
- @parameterized.parameters(
9
- (8, 16, 0.0, 1),
10
- (8, 16, 0.5, 1),
11
- (8, 16, .95, 1),
12
- (16, 8, 0.0, 1),
13
- (16, 8, 0.5, 1),
14
- (16, 8, .95, 1),
15
- (8, 16, 0.0, 8),
16
- (8, 16, 0.5, 8),
17
- (8, 16, 1.0, 8),
18
- (16, 8, 0.0, 8),
19
- (16, 8, 0.5, 8),
20
- (16, 8, 1.0, 8),
21
- (128, 256, 0.5, 16),
22
- (256, 128, 0.75, 32),
23
- (512, 512, .875, 128))
24
- class RandomOpsTest(parameterized.TestCase):
25
-
26
- def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
27
- mask = random.dense_mask(
28
- rows, cols, sparsity, blocking)
29
-
30
- # Validate the shape.
31
- self.assertEqual(mask.dim(), 2)
32
- self.assertEqual(mask.size()[0], rows)
33
- self.assertEqual(mask.size()[1], cols)
34
-
35
- # Validate the sparsity
36
- numblocks = rows // blocking * cols // blocking
37
- nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
38
- self.assertEqual(
39
- torch.count_nonzero(mask).item(),
40
- nnz)
41
-
42
- # Check values are zero or one.
43
- self.assertTrue(
44
- torch.all(torch.logical_or(
45
- torch.eq(mask, 0),
46
- torch.eq(mask, 1))))
47
-
48
- def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
49
- mask = random.mask(
50
- rows, cols, sparsity, blocking)
51
-
52
- # Validate the matrix.
53
- mask.validate()
54
-
55
- # Validate the shape.
56
- self.assertEqual(mask.dim(), 2)
57
- self.assertEqual(mask.size()[0], rows)
58
- self.assertEqual(mask.size()[1], cols)
59
-
60
- # Validate the sparsity.
61
- numblocks = rows // blocking * cols // blocking
62
- nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
63
- self.assertEqual(mask.nnz, nnz)
64
-
65
- # Check values are zero or one.
66
- self.assertTrue(
67
- torch.all(torch.logical_or(
68
- torch.eq(mask.data, 0),
69
- torch.eq(mask.data, 1))))
70
 
71
 
72
  if __name__ == '__main__':
 
1
  import unittest
2
 
3
+ # from absl.testing import parameterized
4
  from . import random
5
  import torch
6
 
7
 
8
+ # @parameterized.parameters(
9
+ # (8, 16, 0.0, 1),
10
+ # (8, 16, 0.5, 1),
11
+ # (8, 16, .95, 1),
12
+ # (16, 8, 0.0, 1),
13
+ # (16, 8, 0.5, 1),
14
+ # (16, 8, .95, 1),
15
+ # (8, 16, 0.0, 8),
16
+ # (8, 16, 0.5, 8),
17
+ # (8, 16, 1.0, 8),
18
+ # (16, 8, 0.0, 8),
19
+ # (16, 8, 0.5, 8),
20
+ # (16, 8, 1.0, 8),
21
+ # (128, 256, 0.5, 16),
22
+ # (256, 128, 0.75, 32),
23
+ # (512, 512, .875, 128))
24
+ # class RandomOpsTest(parameterized.TestCase):
25
+ #
26
+ # def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
27
+ # mask = random.dense_mask(
28
+ # rows, cols, sparsity, blocking)
29
+ #
30
+ # # Validate the shape.
31
+ # self.assertEqual(mask.dim(), 2)
32
+ # self.assertEqual(mask.size()[0], rows)
33
+ # self.assertEqual(mask.size()[1], cols)
34
+ #
35
+ # # Validate the sparsity
36
+ # numblocks = rows // blocking * cols // blocking
37
+ # nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
38
+ # self.assertEqual(
39
+ # torch.count_nonzero(mask).item(),
40
+ # nnz)
41
+ #
42
+ # # Check values are zero or one.
43
+ # self.assertTrue(
44
+ # torch.all(torch.logical_or(
45
+ # torch.eq(mask, 0),
46
+ # torch.eq(mask, 1))))
47
+ #
48
+ # def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
49
+ # mask = random.mask(
50
+ # rows, cols, sparsity, blocking)
51
+ #
52
+ # # Validate the matrix.
53
+ # mask.validate()
54
+ #
55
+ # # Validate the shape.
56
+ # self.assertEqual(mask.dim(), 2)
57
+ # self.assertEqual(mask.size()[0], rows)
58
+ # self.assertEqual(mask.size()[1], cols)
59
+ #
60
+ # # Validate the sparsity.
61
+ # numblocks = rows // blocking * cols // blocking
62
+ # nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
63
+ # self.assertEqual(mask.nnz, nnz)
64
+ #
65
+ # # Check values are zero or one.
66
+ # self.assertTrue(
67
+ # torch.all(torch.logical_or(
68
+ # torch.eq(mask.data, 0),
69
+ # torch.eq(mask.data, 1))))
70
 
71
 
72
  if __name__ == '__main__':
build/torch210-cxx11-cu130-x86_64-linux/{_megablocks_cuda_6e04dec.abi3.so → _megablocks_cuda_a45325d.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ed503a781293a9d6150e0362edbe9360ef6e58590b511ee23596649ee9a437d
3
  size 12041592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:391ee51a42c7bf87472426a9291154d2c9cf2f32be7826a24e09a0e7fd192e4c
3
  size 12041592
build/torch210-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_cuda_6e04dec
3
- ops = torch.ops._megablocks_cuda_6e04dec
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_cuda_6e04dec::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_cuda_a45325d
3
+ ops = torch.ops._megablocks_cuda_a45325d
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_cuda_a45325d::{op_name}"
build/torch210-cxx11-cu130-x86_64-linux/megablocks/__init__.py CHANGED
@@ -1,10 +1,10 @@
1
  import ctypes
 
2
  import sys
3
-
4
- import importlib
5
  from pathlib import Path
6
  from types import ModuleType
7
 
 
8
  def _import_from_path(file_path: Path) -> ModuleType:
9
  # We cannot use the module name as-is, after adding it to `sys.modules`,
10
  # it would also be used for other imports. So, we make a module name that
 
1
  import ctypes
2
+ import importlib.util
3
  import sys
 
 
4
  from pathlib import Path
5
  from types import ModuleType
6
 
7
+
8
  def _import_from_path(file_path: Path) -> ModuleType:
9
  # We cannot use the module name as-is, after adding it to `sys.modules`,
10
  # it would also be used for other imports. So, we make a module name that
build/torch210-cxx11-cu130-x86_64-linux/ops/histogram_benchmark.py CHANGED
@@ -5,7 +5,7 @@ import unittest
5
 
6
  import numpy as np
7
  import torch
8
- from absl.testing import parameterized
9
 
10
  from .. import ops
11
 
@@ -47,31 +47,31 @@ def log_benchmark(arguments, mean_t, std_t):
47
  print('=' * 60)
48
 
49
 
50
- class HistogramBenchmark(parameterized.TestCase):
51
-
52
- @parameterized.parameters(*_HISTOGRAM_TESTS)
53
- def testHistogram(self, n, dtype, max_val):
54
- x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
55
-
56
- mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
57
- arguments = {
58
- 'n': n,
59
- 'dtype': dtype,
60
- 'max_val': max_val,
61
- }
62
- log_benchmark(arguments, mean_t, std_t)
63
-
64
- @parameterized.parameters(*_HISTOGRAM_TESTS)
65
- def testTorchHistogram(self, n, dtype, max_val):
66
- x = torch.randint(0, 128, (n,)).cuda().to(dtype)
67
-
68
- mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
69
- arguments = {
70
- 'n': n,
71
- 'dtype': dtype,
72
- 'max_val': max_val,
73
- }
74
- log_benchmark(arguments, mean_t, std_t)
75
 
76
 
77
  if __name__ == '__main__':
 
5
 
6
  import numpy as np
7
  import torch
8
+ # from absl.testing import parameterized
9
 
10
  from .. import ops
11
 
 
47
  print('=' * 60)
48
 
49
 
50
+ # class HistogramBenchmark(parameterized.TestCase):
51
+ #
52
+ # @parameterized.parameters(*_HISTOGRAM_TESTS)
53
+ # def testHistogram(self, n, dtype, max_val):
54
+ # x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
55
+ #
56
+ # mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.histogram(x, max_val),)
57
+ # arguments = {
58
+ # 'n': n,
59
+ # 'dtype': dtype,
60
+ # 'max_val': max_val,
61
+ # }
62
+ # log_benchmark(arguments, mean_t, std_t)
63
+ #
64
+ # @parameterized.parameters(*_HISTOGRAM_TESTS)
65
+ # def testTorchHistogram(self, n, dtype, max_val):
66
+ # x = torch.randint(0, 128, (n,)).cuda().to(dtype)
67
+ #
68
+ # mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.histc(x, max_val, 0, max_val - 1),)
69
+ # arguments = {
70
+ # 'n': n,
71
+ # 'dtype': dtype,
72
+ # 'max_val': max_val,
73
+ # }
74
+ # log_benchmark(arguments, mean_t, std_t)
75
 
76
 
77
  if __name__ == '__main__':
build/torch210-cxx11-cu130-x86_64-linux/ops/matmul_benchmark.py CHANGED
@@ -17,7 +17,7 @@ import unittest
17
  from .. import stk
18
 
19
  import torch
20
- from absl.testing import parameterized
21
 
22
  from .. import benchmark_util, ops
23
 
@@ -48,367 +48,367 @@ def log_benchmark(name, arguments, time, std, flops):
48
  print('=' * 60)
49
 
50
 
51
- class MatmulBenchmark(parameterized.TestCase):
52
-
53
- def build_sparse_matrix(self, x, padded_bins, fhs, ne):
54
- blocking = 128
55
- padded_tokens, _ = x.size()
56
- assert padded_tokens % blocking == 0
57
- assert fhs % blocking == 0
58
-
59
- # Offsets for the sparse matrix. All rows have the
60
- # same number of nonzero blocks dictated by the
61
- # dimensionality of a single expert.
62
- block_rows = padded_tokens // blocking
63
- blocks_per_row = fhs // blocking
64
- offsets = torch.arange(
65
- 0,
66
- block_rows * blocks_per_row + 1,
67
- blocks_per_row,
68
- dtype=torch.int32,
69
- device=x.device,
70
- )
71
-
72
- # Indices for the sparse matrix. The indices for
73
- # the intermediate matrix are dynamic depending
74
- # on the mapping of tokens to experts.
75
- column_indices = ops.topology(
76
- padded_bins,
77
- blocking,
78
- block_rows,
79
- blocks_per_row,
80
- )
81
- data = torch.empty(
82
- column_indices.numel(),
83
- blocking,
84
- blocking,
85
- dtype=torch.float16,
86
- device=x.device,
87
- )
88
- shape = (padded_tokens, fhs * ne)
89
- row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
90
- return stk.Matrix(shape, data, row_indices, column_indices, offsets)
91
-
92
- def build_input_matrix(self, sl, hs, ne):
93
- x = torch.randn((sl, hs)).cuda().half()
94
-
95
- # Assign tokens to experts uniformly.
96
- top_expert = torch.arange(0, sl).cuda().int() % ne
97
-
98
- bin_ids, indices = ops.sort(top_expert)
99
- tokens_per_expert = ops.histogram(top_expert, ne)
100
- padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
101
- padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
102
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
103
- out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
104
- return out, padded_bins
105
-
106
- def build_weight_matrix(self, ne, hs, fhs):
107
- return torch.randn((hs, ne * fhs)).cuda().half()
108
-
109
- @parameterized.parameters(*_MATMUL_TESTS)
110
- def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
111
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
112
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
113
- topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
114
- w = transpose_view(w)
115
-
116
- def benchmark():
117
- return stk.ops.sdd(x, w, topo)
118
-
119
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
- arguments = {
121
- 'sequence_length': sl,
122
- 'hidden_size': hs,
123
- 'ffn_hidden_size': fhs,
124
- 'num_experts': ne,
125
- }
126
- log_benchmark(
127
- '0::Fwd::SDD::NT',
128
- arguments,
129
- mean_t,
130
- std_t,
131
- x.numel() * fhs * 2,
132
- )
133
-
134
- @parameterized.parameters(*_MATMUL_TESTS)
135
- def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
136
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
137
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
138
- topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
139
-
140
- def benchmark():
141
- return stk.ops.dsd(topo, w)
142
-
143
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
144
- arguments = {
145
- 'sequence_length': sl,
146
- 'hidden_size': hs,
147
- 'ffn_hidden_size': fhs,
148
- 'num_experts': ne,
149
- }
150
- log_benchmark(
151
- '0::GradX::DSD::NN',
152
- arguments,
153
- mean_t,
154
- std_t,
155
- x.numel() * fhs * 2,
156
- )
157
-
158
- @parameterized.parameters(*_MATMUL_TESTS)
159
- def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
160
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
161
- topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
162
- topo = topo.t()
163
-
164
- def benchmark():
165
- return stk.ops.dsd(topo, x)
166
-
167
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
168
- arguments = {
169
- 'sequence_length': sl,
170
- 'hidden_size': hs,
171
- 'ffn_hidden_size': fhs,
172
- 'num_experts': ne,
173
- }
174
- log_benchmark(
175
- '0::GradW::DSD::TN',
176
- arguments,
177
- mean_t,
178
- std_t,
179
- x.numel() * fhs * 2,
180
- )
181
-
182
- @parameterized.parameters(*_MATMUL_TESTS)
183
- def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
184
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
185
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
186
- x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
187
-
188
- def benchmark():
189
- return stk.ops.dsd(x, w)
190
-
191
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
192
- arguments = {
193
- 'sequence_length': sl,
194
- 'hidden_size': hs,
195
- 'ffn_hidden_size': fhs,
196
- 'num_experts': ne,
197
- }
198
- log_benchmark(
199
- '1::Fwd::DSD::NN',
200
- arguments,
201
- mean_t,
202
- std_t,
203
- x.nnz * hs * 2,
204
- )
205
-
206
- @parameterized.parameters(*_MATMUL_TESTS)
207
- def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
208
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
209
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
210
- x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
211
- out = stk.ops.dsd(x, w)
212
- w = transpose_view(w)
213
-
214
- def benchmark():
215
- return stk.ops.sdd(out, w, x)
216
-
217
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
218
- arguments = {
219
- 'sequence_length': sl,
220
- 'hidden_size': hs,
221
- 'ffn_hidden_size': fhs,
222
- 'num_experts': ne,
223
- }
224
- log_benchmark(
225
- '1::GradX::SDD::NT',
226
- arguments,
227
- mean_t,
228
- std_t,
229
- x.nnz * hs * 2,
230
- )
231
-
232
- @parameterized.parameters(*_MATMUL_TESTS)
233
- def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
234
- x, padded_bins = self.build_input_matrix(sl, hs, ne)
235
- w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
236
- x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
237
- out = stk.ops.dsd(x, w)
238
- x = x.t()
239
-
240
- def benchmark():
241
- return stk.ops.dsd(x, out)
242
-
243
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
244
- arguments = {
245
- 'sequence_length': sl,
246
- 'hidden_size': hs,
247
- 'ffn_hidden_size': fhs,
248
- 'num_experts': ne,
249
- }
250
- log_benchmark(
251
- '1::GradW::DSD::TN',
252
- arguments,
253
- mean_t,
254
- std_t,
255
- x.nnz * hs * 2,
256
- )
257
-
258
- @parameterized.parameters(*_MATMUL_TESTS)
259
- def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
260
- assert (sl % ne) == 0
261
- x = torch.randn((ne, sl // ne, hs)).cuda().half()
262
- w = torch.randn((ne, hs, fhs)).cuda().half()
263
-
264
- w = w.transpose(1, 2).contiguous()
265
- w = w.transpose(1, 2)
266
-
267
- def benchmark():
268
- return torch.bmm(x, w)
269
-
270
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
271
- arguments = {
272
- 'sequence_length': sl,
273
- 'hidden_size': hs,
274
- 'ffn_hidden_size': fhs,
275
- 'num_experts': ne,
276
- }
277
- log_benchmark(
278
- '0::Fwd:DDD::NT',
279
- arguments,
280
- mean_t,
281
- std_t,
282
- x.numel() * fhs * 2,
283
- )
284
-
285
- @parameterized.parameters(*_MATMUL_TESTS)
286
- def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
287
- assert (sl % ne) == 0
288
- x = torch.randn((ne, sl // ne, hs)).cuda().half()
289
- w = torch.randn((ne, hs, fhs)).cuda().half()
290
- out = torch.bmm(x, w)
291
- w = w.transpose(1, 2).contiguous()
292
-
293
- def benchmark():
294
- return torch.bmm(out, w)
295
-
296
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
297
- arguments = {
298
- 'sequence_length': sl,
299
- 'hidden_size': hs,
300
- 'ffn_hidden_size': fhs,
301
- 'num_experts': ne,
302
- }
303
- log_benchmark(
304
- '0:GradX:DDD::NN',
305
- arguments,
306
- mean_t,
307
- std_t,
308
- x.numel() * fhs * 2,
309
- )
310
-
311
- @parameterized.parameters(*_MATMUL_TESTS)
312
- def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
313
- assert (sl % ne) == 0
314
- x = torch.randn((ne, sl // ne, hs)).cuda().half()
315
- w = torch.randn((ne, hs, fhs)).cuda().half()
316
- out = torch.bmm(x, w)
317
- out = out.transpose(1, 2)
318
-
319
- def benchmark():
320
- return torch.bmm(out, x)
321
-
322
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
323
- arguments = {
324
- 'sequence_length': sl,
325
- 'hidden_size': hs,
326
- 'ffn_hidden_size': fhs,
327
- 'num_experts': ne,
328
- }
329
- log_benchmark(
330
- '0:GradW:DDD::TN',
331
- arguments,
332
- mean_t,
333
- std_t,
334
- x.numel() * fhs * 2,
335
- )
336
-
337
- @parameterized.parameters(*_MATMUL_TESTS)
338
- def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
339
- assert (sl % ne) == 0
340
- x = torch.randn((ne, sl // ne, fhs)).cuda().half()
341
- w = torch.randn((ne, fhs, hs)).cuda().half()
342
-
343
- def benchmark():
344
- return torch.bmm(x, w)
345
-
346
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
347
- arguments = {
348
- 'sequence_length': sl,
349
- 'hidden_size': hs,
350
- 'ffn_hidden_size': fhs,
351
- 'num_experts': ne,
352
- }
353
- log_benchmark(
354
- '1::Fwd::DDD::NN',
355
- arguments,
356
- mean_t,
357
- std_t,
358
- x.numel() * hs * 2,
359
- )
360
-
361
- @parameterized.parameters(*_MATMUL_TESTS)
362
- def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
363
- assert (sl % ne) == 0
364
- x = torch.randn((ne, sl // ne, fhs)).cuda().half()
365
- w = torch.randn((ne, fhs, hs)).cuda().half()
366
- out = torch.bmm(x, w)
367
- w = torch.transpose(w, 1, 2)
368
-
369
- def benchmark():
370
- return torch.bmm(out, w)
371
-
372
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
373
- arguments = {
374
- 'sequence_length': sl,
375
- 'hidden_size': hs,
376
- 'ffn_hidden_size': fhs,
377
- 'num_experts': ne,
378
- }
379
- log_benchmark(
380
- '1::GradX::DDD::NT',
381
- arguments,
382
- mean_t,
383
- std_t,
384
- x.numel() * hs * 2,
385
- )
386
-
387
- @parameterized.parameters(*_MATMUL_TESTS)
388
- def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
389
- assert (sl % ne) == 0
390
- x = torch.randn((ne, sl // ne, fhs)).cuda().half()
391
- w = torch.randn((ne, fhs, hs)).cuda().half()
392
- out = torch.bmm(x, w)
393
- x = torch.transpose(x, 1, 2)
394
-
395
- def benchmark():
396
- return torch.bmm(x, out)
397
-
398
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
399
- arguments = {
400
- 'sequence_length': sl,
401
- 'hidden_size': hs,
402
- 'ffn_hidden_size': fhs,
403
- 'num_experts': ne,
404
- }
405
- log_benchmark(
406
- '1::GradW::DDD::TN',
407
- arguments,
408
- mean_t,
409
- std_t,
410
- x.numel() * hs * 2,
411
- )
412
 
413
 
414
  if __name__ == '__main__':
 
17
  from .. import stk
18
 
19
  import torch
20
+ # from absl.testing import parameterized
21
 
22
  from .. import benchmark_util, ops
23
 
 
48
  print('=' * 60)
49
 
50
 
51
+ # class MatmulBenchmark(parameterized.TestCase):
52
+ #
53
+ # def build_sparse_matrix(self, x, padded_bins, fhs, ne):
54
+ # blocking = 128
55
+ # padded_tokens, _ = x.size()
56
+ # assert padded_tokens % blocking == 0
57
+ # assert fhs % blocking == 0
58
+ #
59
+ # # Offsets for the sparse matrix. All rows have the
60
+ # # same number of nonzero blocks dictated by the
61
+ # # dimensionality of a single expert.
62
+ # block_rows = padded_tokens // blocking
63
+ # blocks_per_row = fhs // blocking
64
+ # offsets = torch.arange(
65
+ # 0,
66
+ # block_rows * blocks_per_row + 1,
67
+ # blocks_per_row,
68
+ # dtype=torch.int32,
69
+ # device=x.device,
70
+ # )
71
+ #
72
+ # # Indices for the sparse matrix. The indices for
73
+ # # the intermediate matrix are dynamic depending
74
+ # # on the mapping of tokens to experts.
75
+ # column_indices = ops.topology(
76
+ # padded_bins,
77
+ # blocking,
78
+ # block_rows,
79
+ # blocks_per_row,
80
+ # )
81
+ # data = torch.empty(
82
+ # column_indices.numel(),
83
+ # blocking,
84
+ # blocking,
85
+ # dtype=torch.float16,
86
+ # device=x.device,
87
+ # )
88
+ # shape = (padded_tokens, fhs * ne)
89
+ # row_indices = stk.ops.row_indices(shape, data, offsets, column_indices)
90
+ # return stk.Matrix(shape, data, row_indices, column_indices, offsets)
91
+ #
92
+ # def build_input_matrix(self, sl, hs, ne):
93
+ # x = torch.randn((sl, hs)).cuda().half()
94
+ #
95
+ # # Assign tokens to experts uniformly.
96
+ # top_expert = torch.arange(0, sl).cuda().int() % ne
97
+ #
98
+ # bin_ids, indices = ops.sort(top_expert)
99
+ # tokens_per_expert = ops.histogram(top_expert, ne)
100
+ # padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
101
+ # padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
102
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
103
+ # out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
104
+ # return out, padded_bins
105
+ #
106
+ # def build_weight_matrix(self, ne, hs, fhs):
107
+ # return torch.randn((hs, ne * fhs)).cuda().half()
108
+ #
109
+ # @parameterized.parameters(*_MATMUL_TESTS)
110
+ # def testFFN_Linear0_Fwd_SDD_NT(self, sl, hs, fhs, ne):
111
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
112
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
113
+ # topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
114
+ # w = transpose_view(w)
115
+ #
116
+ # def benchmark():
117
+ # return stk.ops.sdd(x, w, topo)
118
+ #
119
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
+ # arguments = {
121
+ # 'sequence_length': sl,
122
+ # 'hidden_size': hs,
123
+ # 'ffn_hidden_size': fhs,
124
+ # 'num_experts': ne,
125
+ # }
126
+ # log_benchmark(
127
+ # '0::Fwd::SDD::NT',
128
+ # arguments,
129
+ # mean_t,
130
+ # std_t,
131
+ # x.numel() * fhs * 2,
132
+ # )
133
+ #
134
+ # @parameterized.parameters(*_MATMUL_TESTS)
135
+ # def testFFN_Linear0_GradX_DSD_NN(self, sl, hs, fhs, ne):
136
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
137
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
138
+ # topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
139
+ #
140
+ # def benchmark():
141
+ # return stk.ops.dsd(topo, w)
142
+ #
143
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
144
+ # arguments = {
145
+ # 'sequence_length': sl,
146
+ # 'hidden_size': hs,
147
+ # 'ffn_hidden_size': fhs,
148
+ # 'num_experts': ne,
149
+ # }
150
+ # log_benchmark(
151
+ # '0::GradX::DSD::NN',
152
+ # arguments,
153
+ # mean_t,
154
+ # std_t,
155
+ # x.numel() * fhs * 2,
156
+ # )
157
+ #
158
+ # @parameterized.parameters(*_MATMUL_TESTS)
159
+ # def testFFN_Linear0_GradW_DSD_TN(self, sl, hs, fhs, ne):
160
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
161
+ # topo = self.build_sparse_matrix(x, padded_bins, fhs, ne)
162
+ # topo = topo.t()
163
+ #
164
+ # def benchmark():
165
+ # return stk.ops.dsd(topo, x)
166
+ #
167
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
168
+ # arguments = {
169
+ # 'sequence_length': sl,
170
+ # 'hidden_size': hs,
171
+ # 'ffn_hidden_size': fhs,
172
+ # 'num_experts': ne,
173
+ # }
174
+ # log_benchmark(
175
+ # '0::GradW::DSD::TN',
176
+ # arguments,
177
+ # mean_t,
178
+ # std_t,
179
+ # x.numel() * fhs * 2,
180
+ # )
181
+ #
182
+ # @parameterized.parameters(*_MATMUL_TESTS)
183
+ # def testFFN_Linear1_Fwd_DSD_NN(self, sl, hs, fhs, ne):
184
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
185
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
186
+ # x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
187
+ #
188
+ # def benchmark():
189
+ # return stk.ops.dsd(x, w)
190
+ #
191
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
192
+ # arguments = {
193
+ # 'sequence_length': sl,
194
+ # 'hidden_size': hs,
195
+ # 'ffn_hidden_size': fhs,
196
+ # 'num_experts': ne,
197
+ # }
198
+ # log_benchmark(
199
+ # '1::Fwd::DSD::NN',
200
+ # arguments,
201
+ # mean_t,
202
+ # std_t,
203
+ # x.nnz * hs * 2,
204
+ # )
205
+ #
206
+ # @parameterized.parameters(*_MATMUL_TESTS)
207
+ # def testFFN_Linear1_GradX_SDD_NT(self, sl, hs, fhs, ne):
208
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
209
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
210
+ # x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
211
+ # out = stk.ops.dsd(x, w)
212
+ # w = transpose_view(w)
213
+ #
214
+ # def benchmark():
215
+ # return stk.ops.sdd(out, w, x)
216
+ #
217
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
218
+ # arguments = {
219
+ # 'sequence_length': sl,
220
+ # 'hidden_size': hs,
221
+ # 'ffn_hidden_size': fhs,
222
+ # 'num_experts': ne,
223
+ # }
224
+ # log_benchmark(
225
+ # '1::GradX::SDD::NT',
226
+ # arguments,
227
+ # mean_t,
228
+ # std_t,
229
+ # x.nnz * hs * 2,
230
+ # )
231
+ #
232
+ # @parameterized.parameters(*_MATMUL_TESTS)
233
+ # def testFFN_Linear1_GradW_DSD_TN(self, sl, hs, fhs, ne):
234
+ # x, padded_bins = self.build_input_matrix(sl, hs, ne)
235
+ # w = self.build_weight_matrix(ne, hs, fhs).t().contiguous()
236
+ # x = self.build_sparse_matrix(x, padded_bins, fhs, ne)
237
+ # out = stk.ops.dsd(x, w)
238
+ # x = x.t()
239
+ #
240
+ # def benchmark():
241
+ # return stk.ops.dsd(x, out)
242
+ #
243
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
244
+ # arguments = {
245
+ # 'sequence_length': sl,
246
+ # 'hidden_size': hs,
247
+ # 'ffn_hidden_size': fhs,
248
+ # 'num_experts': ne,
249
+ # }
250
+ # log_benchmark(
251
+ # '1::GradW::DSD::TN',
252
+ # arguments,
253
+ # mean_t,
254
+ # std_t,
255
+ # x.nnz * hs * 2,
256
+ # )
257
+ #
258
+ # @parameterized.parameters(*_MATMUL_TESTS)
259
+ # def testFFN_Linear0_Fwd_DDD_NT(self, sl, hs, fhs, ne):
260
+ # assert (sl % ne) == 0
261
+ # x = torch.randn((ne, sl // ne, hs)).cuda().half()
262
+ # w = torch.randn((ne, hs, fhs)).cuda().half()
263
+ #
264
+ # w = w.transpose(1, 2).contiguous()
265
+ # w = w.transpose(1, 2)
266
+ #
267
+ # def benchmark():
268
+ # return torch.bmm(x, w)
269
+ #
270
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
271
+ # arguments = {
272
+ # 'sequence_length': sl,
273
+ # 'hidden_size': hs,
274
+ # 'ffn_hidden_size': fhs,
275
+ # 'num_experts': ne,
276
+ # }
277
+ # log_benchmark(
278
+ # '0::Fwd:DDD::NT',
279
+ # arguments,
280
+ # mean_t,
281
+ # std_t,
282
+ # x.numel() * fhs * 2,
283
+ # )
284
+ #
285
+ # @parameterized.parameters(*_MATMUL_TESTS)
286
+ # def testFFN_Linear0_GradX_DDD_NN(self, sl, hs, fhs, ne):
287
+ # assert (sl % ne) == 0
288
+ # x = torch.randn((ne, sl // ne, hs)).cuda().half()
289
+ # w = torch.randn((ne, hs, fhs)).cuda().half()
290
+ # out = torch.bmm(x, w)
291
+ # w = w.transpose(1, 2).contiguous()
292
+ #
293
+ # def benchmark():
294
+ # return torch.bmm(out, w)
295
+ #
296
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
297
+ # arguments = {
298
+ # 'sequence_length': sl,
299
+ # 'hidden_size': hs,
300
+ # 'ffn_hidden_size': fhs,
301
+ # 'num_experts': ne,
302
+ # }
303
+ # log_benchmark(
304
+ # '0:GradX:DDD::NN',
305
+ # arguments,
306
+ # mean_t,
307
+ # std_t,
308
+ # x.numel() * fhs * 2,
309
+ # )
310
+ #
311
+ # @parameterized.parameters(*_MATMUL_TESTS)
312
+ # def testFFN_Linear0_GradW_DDD_TN(self, sl, hs, fhs, ne):
313
+ # assert (sl % ne) == 0
314
+ # x = torch.randn((ne, sl // ne, hs)).cuda().half()
315
+ # w = torch.randn((ne, hs, fhs)).cuda().half()
316
+ # out = torch.bmm(x, w)
317
+ # out = out.transpose(1, 2)
318
+ #
319
+ # def benchmark():
320
+ # return torch.bmm(out, x)
321
+ #
322
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
323
+ # arguments = {
324
+ # 'sequence_length': sl,
325
+ # 'hidden_size': hs,
326
+ # 'ffn_hidden_size': fhs,
327
+ # 'num_experts': ne,
328
+ # }
329
+ # log_benchmark(
330
+ # '0:GradW:DDD::TN',
331
+ # arguments,
332
+ # mean_t,
333
+ # std_t,
334
+ # x.numel() * fhs * 2,
335
+ # )
336
+ #
337
+ # @parameterized.parameters(*_MATMUL_TESTS)
338
+ # def testFFN_Linear1_Fwd_DDD_NN(self, sl, hs, fhs, ne):
339
+ # assert (sl % ne) == 0
340
+ # x = torch.randn((ne, sl // ne, fhs)).cuda().half()
341
+ # w = torch.randn((ne, fhs, hs)).cuda().half()
342
+ #
343
+ # def benchmark():
344
+ # return torch.bmm(x, w)
345
+ #
346
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
347
+ # arguments = {
348
+ # 'sequence_length': sl,
349
+ # 'hidden_size': hs,
350
+ # 'ffn_hidden_size': fhs,
351
+ # 'num_experts': ne,
352
+ # }
353
+ # log_benchmark(
354
+ # '1::Fwd::DDD::NN',
355
+ # arguments,
356
+ # mean_t,
357
+ # std_t,
358
+ # x.numel() * hs * 2,
359
+ # )
360
+ #
361
+ # @parameterized.parameters(*_MATMUL_TESTS)
362
+ # def testFFN_Linear1_GradX_DDD_NT(self, sl, hs, fhs, ne):
363
+ # assert (sl % ne) == 0
364
+ # x = torch.randn((ne, sl // ne, fhs)).cuda().half()
365
+ # w = torch.randn((ne, fhs, hs)).cuda().half()
366
+ # out = torch.bmm(x, w)
367
+ # w = torch.transpose(w, 1, 2)
368
+ #
369
+ # def benchmark():
370
+ # return torch.bmm(out, w)
371
+ #
372
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
373
+ # arguments = {
374
+ # 'sequence_length': sl,
375
+ # 'hidden_size': hs,
376
+ # 'ffn_hidden_size': fhs,
377
+ # 'num_experts': ne,
378
+ # }
379
+ # log_benchmark(
380
+ # '1::GradX::DDD::NT',
381
+ # arguments,
382
+ # mean_t,
383
+ # std_t,
384
+ # x.numel() * hs * 2,
385
+ # )
386
+ #
387
+ # @parameterized.parameters(*_MATMUL_TESTS)
388
+ # def testFFN_Linear1_GradW_DDD_TN(self, sl, hs, fhs, ne):
389
+ # assert (sl % ne) == 0
390
+ # x = torch.randn((ne, sl // ne, fhs)).cuda().half()
391
+ # w = torch.randn((ne, fhs, hs)).cuda().half()
392
+ # out = torch.bmm(x, w)
393
+ # x = torch.transpose(x, 1, 2)
394
+ #
395
+ # def benchmark():
396
+ # return torch.bmm(x, out)
397
+ #
398
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
399
+ # arguments = {
400
+ # 'sequence_length': sl,
401
+ # 'hidden_size': hs,
402
+ # 'ffn_hidden_size': fhs,
403
+ # 'num_experts': ne,
404
+ # }
405
+ # log_benchmark(
406
+ # '1::GradW::DDD::TN',
407
+ # arguments,
408
+ # mean_t,
409
+ # std_t,
410
+ # x.numel() * hs * 2,
411
+ # )
412
 
413
 
414
  if __name__ == '__main__':
build/torch210-cxx11-cu130-x86_64-linux/ops/padded_scatter_benchmark.py CHANGED
@@ -4,7 +4,7 @@
4
  import unittest
5
 
6
  import torch
7
- from absl.testing import parameterized
8
 
9
  from .. import benchmark_util, ops
10
 
@@ -16,50 +16,50 @@ _PADDED_SCATTER_BENCHMARK = (
16
  )
17
 
18
 
19
- class PaddedScatterTest(parameterized.TestCase):
20
-
21
- @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
22
- def testPaddedScatter(self, sl, hs, ne, top_k):
23
- # Create the data and indices.
24
- x = torch.randn((sl, hs)).cuda().half()
25
-
26
- # Randomly assign tokens to experts.
27
- top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
28
- bin_ids, indices = ops.sort(top_expert)
29
- tokens_per_expert = ops.histogram(top_expert, ne)
30
- padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
31
- padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
32
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
33
-
34
- # Sample weights for the scatter reduce.
35
- weights = torch.rand((sl * top_k,)).cuda().half()
36
-
37
- # Gather the data to prepare for backwards.
38
- x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
39
-
40
- def benchmark():
41
- return ops.padded_scatter(
42
- x,
43
- indices,
44
- bin_ids,
45
- weights,
46
- bins,
47
- padded_bins,
48
- top_k,
49
- )
50
-
51
- time, std = benchmark_util.benchmark_function(benchmark)
52
- benchmark_util.log_benchmark(
53
- 'Padded Scatter',
54
- {
55
- 'sequence_length': sl,
56
- 'hidden_size': hs,
57
- 'num_experts': ne,
58
- 'top_k': top_k,
59
- },
60
- time,
61
- std,
62
- )
63
 
64
 
65
  if __name__ == '__main__':
 
4
  import unittest
5
 
6
  import torch
7
+ # from absl.testing import parameterized
8
 
9
  from .. import benchmark_util, ops
10
 
 
16
  )
17
 
18
 
19
+ # class PaddedScatterTest(parameterized.TestCase):
20
+ #
21
+ # @parameterized.parameters(*_PADDED_SCATTER_BENCHMARK)
22
+ # def testPaddedScatter(self, sl, hs, ne, top_k):
23
+ # # Create the data and indices.
24
+ # x = torch.randn((sl, hs)).cuda().half()
25
+ #
26
+ # # Randomly assign tokens to experts.
27
+ # top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
28
+ # bin_ids, indices = ops.sort(top_expert)
29
+ # tokens_per_expert = ops.histogram(top_expert, ne)
30
+ # padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
31
+ # padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
32
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
33
+ #
34
+ # # Sample weights for the scatter reduce.
35
+ # weights = torch.rand((sl * top_k,)).cuda().half()
36
+ #
37
+ # # Gather the data to prepare for backwards.
38
+ # x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
39
+ #
40
+ # def benchmark():
41
+ # return ops.padded_scatter(
42
+ # x,
43
+ # indices,
44
+ # bin_ids,
45
+ # weights,
46
+ # bins,
47
+ # padded_bins,
48
+ # top_k,
49
+ # )
50
+ #
51
+ # time, std = benchmark_util.benchmark_function(benchmark)
52
+ # benchmark_util.log_benchmark(
53
+ # 'Padded Scatter',
54
+ # {
55
+ # 'sequence_length': sl,
56
+ # 'hidden_size': hs,
57
+ # 'num_experts': ne,
58
+ # 'top_k': top_k,
59
+ # },
60
+ # time,
61
+ # std,
62
+ # )
63
 
64
 
65
  if __name__ == '__main__':
build/torch210-cxx11-cu130-x86_64-linux/ops/permute_benchmark.py CHANGED
@@ -4,7 +4,7 @@
4
  import unittest
5
 
6
  import torch
7
- from absl.testing import parameterized
8
 
9
  from .. import benchmark_util, ops
10
 
@@ -26,123 +26,123 @@ _PERMUTE_TESTS = (
26
  )
27
 
28
 
29
- class PermuteBenchmark(parameterized.TestCase):
30
-
31
- @parameterized.parameters(*_PERMUTE_TESTS)
32
- def testBinnedGather(self, sl, hs, ne):
33
- # NOTE: Capacity factor == 1.
34
- ec = sl // ne
35
-
36
- # Create the data and indices.
37
- x = torch.randn((sl, hs)).cuda().half()
38
- top_expert = torch.randint(0, ne, (sl,)).cuda().int()
39
- bin_ids, indices = ops.sort(top_expert)
40
- tokens_per_expert = ops.histogram(indices, ne)
41
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
42
-
43
- def benchmark():
44
- return ops.binned_gather(x, indices, bins, ec)
45
-
46
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
47
- arguments = {
48
- 'sequence_length': sl,
49
- 'hidden_size': hs,
50
- 'num_experts': ne,
51
- }
52
- benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
53
-
54
- @parameterized.parameters(*_PERMUTE_TESTS)
55
- def testBinnedScatter(self, sl, hs, ne):
56
- # NOTE: Capacity factor == 1.
57
- ec = sl // ne
58
-
59
- # Create the data and indices.
60
- x = torch.randn((sl, hs)).cuda().half()
61
- top_expert = torch.randint(0, ne, (sl,)).cuda().int()
62
- bin_ids, indices = ops.sort(top_expert)
63
- tokens_per_expert = ops.histogram(indices, ne)
64
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
65
- x = ops.binned_gather(x, indices, bins, ec)
66
-
67
- def benchmark():
68
- return ops.binned_scatter(x, indices, bins)
69
-
70
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
71
- arguments = {
72
- 'sequence_length': sl,
73
- 'hidden_size': hs,
74
- 'num_experts': ne,
75
- }
76
- benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
77
-
78
- @parameterized.parameters(*_PERMUTE_TESTS)
79
- def testPaddedGather(self, sl, hs, ne):
80
- # Create the data and indices.
81
- x = torch.randn((sl, hs)).cuda().half()
82
-
83
- # Randomly assign tokens to experts.
84
- top_expert = torch.randint(0, ne, (sl,)).cuda().int()
85
- bin_ids, indices = ops.sort(top_expert)
86
- tokens_per_expert = ops.histogram(top_expert, ne)
87
- padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
88
- padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
89
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
90
-
91
- def benchmark():
92
- return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
93
-
94
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
95
- arguments = {
96
- 'sequence_length': sl,
97
- 'hidden_size': hs,
98
- 'num_experts': ne,
99
- }
100
- benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
101
-
102
- @parameterized.parameters(*_PERMUTE_TESTS)
103
- def testPaddedScatter(self, sl, hs, ne):
104
- # Create the data and indices.
105
- x = torch.randn((sl, hs)).cuda().half()
106
-
107
- # Randomly assign tokens to experts.
108
- top_expert = torch.randint(0, ne, (sl,)).cuda().int()
109
- bin_ids, indices = ops.sort(top_expert)
110
- tokens_per_expert = ops.histogram(top_expert, ne)
111
- padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
112
- padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
113
- bins = ops.inclusive_cumsum(tokens_per_expert, 0)
114
- x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
115
-
116
- def benchmark():
117
- return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
118
-
119
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
- arguments = {
121
- 'sequence_length': sl,
122
- 'hidden_size': hs,
123
- 'num_experts': ne,
124
- }
125
- benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
126
-
127
- @parameterized.parameters(*_PERMUTE_TESTS)
128
- def testCopy(self, sl, hs, ne):
129
- # NOTE: Capacity factor == 1.
130
- # ec = sl // ne
131
-
132
- # Create the data and indices.
133
- x = torch.randn((sl, hs)).cuda().half()
134
- y = x.clone()
135
-
136
- def benchmark():
137
- return y.copy_(x)
138
-
139
- mean_t, std_t = benchmark_util.benchmark_function(benchmark)
140
- arguments = {
141
- 'sequence_length': sl,
142
- 'hidden_size': hs,
143
- 'num_experts': ne,
144
- }
145
- benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
146
 
147
 
148
  if __name__ == '__main__':
 
4
  import unittest
5
 
6
  import torch
7
+ # from absl.testing import parameterized
8
 
9
  from .. import benchmark_util, ops
10
 
 
26
  )
27
 
28
 
29
+ # class PermuteBenchmark(parameterized.TestCase):
30
+ #
31
+ # @parameterized.parameters(*_PERMUTE_TESTS)
32
+ # def testBinnedGather(self, sl, hs, ne):
33
+ # # NOTE: Capacity factor == 1.
34
+ # ec = sl // ne
35
+ #
36
+ # # Create the data and indices.
37
+ # x = torch.randn((sl, hs)).cuda().half()
38
+ # top_expert = torch.randint(0, ne, (sl,)).cuda().int()
39
+ # bin_ids, indices = ops.sort(top_expert)
40
+ # tokens_per_expert = ops.histogram(indices, ne)
41
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
42
+ #
43
+ # def benchmark():
44
+ # return ops.binned_gather(x, indices, bins, ec)
45
+ #
46
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
47
+ # arguments = {
48
+ # 'sequence_length': sl,
49
+ # 'hidden_size': hs,
50
+ # 'num_experts': ne,
51
+ # }
52
+ # benchmark_util.log_benchmark('BinnedGather', arguments, mean_t, std_t)
53
+ #
54
+ # @parameterized.parameters(*_PERMUTE_TESTS)
55
+ # def testBinnedScatter(self, sl, hs, ne):
56
+ # # NOTE: Capacity factor == 1.
57
+ # ec = sl // ne
58
+ #
59
+ # # Create the data and indices.
60
+ # x = torch.randn((sl, hs)).cuda().half()
61
+ # top_expert = torch.randint(0, ne, (sl,)).cuda().int()
62
+ # bin_ids, indices = ops.sort(top_expert)
63
+ # tokens_per_expert = ops.histogram(indices, ne)
64
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
65
+ # x = ops.binned_gather(x, indices, bins, ec)
66
+ #
67
+ # def benchmark():
68
+ # return ops.binned_scatter(x, indices, bins)
69
+ #
70
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
71
+ # arguments = {
72
+ # 'sequence_length': sl,
73
+ # 'hidden_size': hs,
74
+ # 'num_experts': ne,
75
+ # }
76
+ # benchmark_util.log_benchmark('BinnedScatter', arguments, mean_t, std_t)
77
+ #
78
+ # @parameterized.parameters(*_PERMUTE_TESTS)
79
+ # def testPaddedGather(self, sl, hs, ne):
80
+ # # Create the data and indices.
81
+ # x = torch.randn((sl, hs)).cuda().half()
82
+ #
83
+ # # Randomly assign tokens to experts.
84
+ # top_expert = torch.randint(0, ne, (sl,)).cuda().int()
85
+ # bin_ids, indices = ops.sort(top_expert)
86
+ # tokens_per_expert = ops.histogram(top_expert, ne)
87
+ # padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
88
+ # padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
89
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
90
+ #
91
+ # def benchmark():
92
+ # return ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
93
+ #
94
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
95
+ # arguments = {
96
+ # 'sequence_length': sl,
97
+ # 'hidden_size': hs,
98
+ # 'num_experts': ne,
99
+ # }
100
+ # benchmark_util.log_benchmark('PaddedGather', arguments, mean_t, std_t)
101
+ #
102
+ # @parameterized.parameters(*_PERMUTE_TESTS)
103
+ # def testPaddedScatter(self, sl, hs, ne):
104
+ # # Create the data and indices.
105
+ # x = torch.randn((sl, hs)).cuda().half()
106
+ #
107
+ # # Randomly assign tokens to experts.
108
+ # top_expert = torch.randint(0, ne, (sl,)).cuda().int()
109
+ # bin_ids, indices = ops.sort(top_expert)
110
+ # tokens_per_expert = ops.histogram(top_expert, ne)
111
+ # padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
112
+ # padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
113
+ # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
114
+ # x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins)
115
+ #
116
+ # def benchmark():
117
+ # return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
118
+ #
119
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
120
+ # arguments = {
121
+ # 'sequence_length': sl,
122
+ # 'hidden_size': hs,
123
+ # 'num_experts': ne,
124
+ # }
125
+ # benchmark_util.log_benchmark('PaddedScatter', arguments, mean_t, std_t)
126
+ #
127
+ # @parameterized.parameters(*_PERMUTE_TESTS)
128
+ # def testCopy(self, sl, hs, ne):
129
+ # # NOTE: Capacity factor == 1.
130
+ # # ec = sl // ne
131
+ #
132
+ # # Create the data and indices.
133
+ # x = torch.randn((sl, hs)).cuda().half()
134
+ # y = x.clone()
135
+ #
136
+ # def benchmark():
137
+ # return y.copy_(x)
138
+ #
139
+ # mean_t, std_t = benchmark_util.benchmark_function(benchmark)
140
+ # arguments = {
141
+ # 'sequence_length': sl,
142
+ # 'hidden_size': hs,
143
+ # 'num_experts': ne,
144
+ # }
145
+ # benchmark_util.log_benchmark('Copy', arguments, mean_t, std_t)
146
 
147
 
148
  if __name__ == '__main__':
build/torch210-cxx11-cu130-x86_64-linux/ops/sort_benchmark.py CHANGED
@@ -5,7 +5,7 @@ import unittest
5
 
6
  import numpy as np
7
  import torch
8
- from absl.testing import parameterized
9
 
10
  from .. import ops
11
 
@@ -53,32 +53,32 @@ def log_benchmark(arguments, mean_t, std_t):
53
  print('=' * 60)
54
 
55
 
56
- class SortBenchmark(parameterized.TestCase):
57
-
58
- @parameterized.parameters(*_SORT_TESTS)
59
- def testSort(self, n, dtype, max_val):
60
- if max_val is None:
61
- max_val = np.iinfo(numpy_dtype(dtype)).max
62
- end_bit = int(np.ceil(np.log2(max_val)))
63
- x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
64
-
65
- mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
66
- arguments = {
67
- 'n': n,
68
- 'dtype': dtype,
69
- 'max_val': max_val,
70
- }
71
- log_benchmark(arguments, mean_t, std_t)
72
-
73
- @parameterized.parameters(*_BASELINE_SORT_TESTS)
74
- def testTorchSort(self, n):
75
- x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
76
-
77
- mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
78
- arguments = {
79
- 'n': n,
80
- }
81
- log_benchmark(arguments, mean_t, std_t)
82
 
83
 
84
  if __name__ == '__main__':
 
5
 
6
  import numpy as np
7
  import torch
8
+ # from absl.testing import parameterized
9
 
10
  from .. import ops
11
 
 
53
  print('=' * 60)
54
 
55
 
56
+ # class SortBenchmark(parameterized.TestCase):
57
+ #
58
+ # @parameterized.parameters(*_SORT_TESTS)
59
+ # def testSort(self, n, dtype, max_val):
60
+ # if max_val is None:
61
+ # max_val = np.iinfo(numpy_dtype(dtype)).max
62
+ # end_bit = int(np.ceil(np.log2(max_val)))
63
+ # x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
64
+ #
65
+ # mean_t, std_t, max_t, min_t = benchmark_function(lambda: ops.sort(x, end_bit),)
66
+ # arguments = {
67
+ # 'n': n,
68
+ # 'dtype': dtype,
69
+ # 'max_val': max_val,
70
+ # }
71
+ # log_benchmark(arguments, mean_t, std_t)
72
+ #
73
+ # @parameterized.parameters(*_BASELINE_SORT_TESTS)
74
+ # def testTorchSort(self, n):
75
+ # x = torch.randint(0, 128, (n,)).cuda().to(torch.int32)
76
+ #
77
+ # mean_t, std_t, max_t, min_t = benchmark_function(lambda: torch.sort(x))
78
+ # arguments = {
79
+ # 'n': n,
80
+ # }
81
+ # log_benchmark(arguments, mean_t, std_t)
82
 
83
 
84
  if __name__ == '__main__':
build/torch210-cxx11-cu130-x86_64-linux/stk/ops/eltwise_ops_test.py CHANGED
@@ -1,7 +1,7 @@
1
  import unittest
2
  import itertools
3
  import torch
4
- from absl.testing import parameterized
5
 
6
  import stk
7
  from stk.ops.linear_ops_test import allclose, _dense_and_sparse
@@ -47,40 +47,40 @@ def _dense_and_sparse_like(x, std=0.1):
47
  return (dense.requires_grad_(True),
48
  sparse.requires_grad_(True))
49
 
50
- @parameterized.parameters(_ELTWISE_OP_TESTS)
51
- class EltwiseOpsTest(parameterized.TestCase):
52
-
53
- def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
54
-
55
- a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
56
- b_dense, b = _dense_and_sparse_like(a)
57
-
58
- out = stk.ops.mul(a, b)
59
- expected_out = torch.mul(a_dense, b_dense)
60
-
61
- # Compute the gradients w.r.t. the inputs.
62
- expected_out.sum().backward()
63
- stk.ops.sum(out).backward()
64
-
65
- # Validate the results.
66
- out = stk.ops.to_dense(out)
67
- self.assertEqual(out.dim(), 2)
68
- self.assertEqual(expected_out.size(), out.size())
69
- self.assertTrue(allclose(out, expected_out))
70
-
71
- # LHS gradient.
72
- grad = stk.ops.to_dense(a.grad)
73
- expected_grad = a_dense.grad
74
- self.assertEqual(grad.dim(), 2)
75
- self.assertEqual(expected_grad.size(), grad.size())
76
- self.assertTrue(allclose(grad, expected_grad))
77
-
78
- # RHS gradient.
79
- grad = stk.ops.to_dense(b.grad)
80
- expected_grad = b_dense.grad
81
- self.assertEqual(grad.dim(), 2)
82
- self.assertEqual(expected_grad.size(), grad.size())
83
- self.assertTrue(allclose(grad, expected_grad))
84
 
85
  if __name__ == '__main__':
86
  unittest.main()
 
1
  import unittest
2
  import itertools
3
  import torch
4
+ # from absl.testing import parameterized
5
 
6
  import stk
7
  from stk.ops.linear_ops_test import allclose, _dense_and_sparse
 
47
  return (dense.requires_grad_(True),
48
  sparse.requires_grad_(True))
49
 
50
+ # @parameterized.parameters(_ELTWISE_OP_TESTS)
51
+ # class EltwiseOpsTest(parameterized.TestCase):
52
+ #
53
+ # def testEltwiseMul(self, m, n, sparsity, blocking, dtype):
54
+ #
55
+ # a_dense, a = _dense_and_sparse(m, n, sparsity, blocking, dtype)
56
+ # b_dense, b = _dense_and_sparse_like(a)
57
+ #
58
+ # out = stk.ops.mul(a, b)
59
+ # expected_out = torch.mul(a_dense, b_dense)
60
+ #
61
+ # # Compute the gradients w.r.t. the inputs.
62
+ # expected_out.sum().backward()
63
+ # stk.ops.sum(out).backward()
64
+ #
65
+ # # Validate the results.
66
+ # out = stk.ops.to_dense(out)
67
+ # self.assertEqual(out.dim(), 2)
68
+ # self.assertEqual(expected_out.size(), out.size())
69
+ # self.assertTrue(allclose(out, expected_out))
70
+ #
71
+ # # LHS gradient.
72
+ # grad = stk.ops.to_dense(a.grad)
73
+ # expected_grad = a_dense.grad
74
+ # self.assertEqual(grad.dim(), 2)
75
+ # self.assertEqual(expected_grad.size(), grad.size())
76
+ # self.assertTrue(allclose(grad, expected_grad))
77
+ #
78
+ # # RHS gradient.
79
+ # grad = stk.ops.to_dense(b.grad)
80
+ # expected_grad = b_dense.grad
81
+ # self.assertEqual(grad.dim(), 2)
82
+ # self.assertEqual(expected_grad.size(), grad.size())
83
+ # self.assertTrue(allclose(grad, expected_grad))
84
 
85
  if __name__ == '__main__':
86
  unittest.main()
build/torch210-cxx11-cu130-x86_64-linux/stk/ops/linear_ops_test.py CHANGED
@@ -2,7 +2,7 @@ import unittest
2
  import itertools
3
  import numpy as np
4
  import torch
5
- from absl.testing import parameterized
6
 
7
  import stk
8
 
@@ -96,121 +96,121 @@ def _mask(x, mask):
96
  return x * mask
97
 
98
 
99
- @parameterized.parameters(*_LINEAR_OP_TESTS)
100
- class LinearOpsTest(parameterized.TestCase):
101
-
102
- def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
103
- # Construct the operands.
104
- a_shape = (k, m) if trans_a else (m, k)
105
- a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
106
- b_shape = (n, k) if trans_b else (k, n)
107
- b, bcp = _dense_2x(*b_shape, dtype)
108
-
109
- # Execute the matmul.
110
- out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
111
- expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
112
-
113
- # Compute the gradients w.r.t. the inputs.
114
- expected_out.sum().backward()
115
- out.sum().backward()
116
-
117
- # Validate the results.
118
- self.assertEqual(out.dim(), 2)
119
- self.assertEqual(expected_out.size()[0], out.size()[0])
120
- self.assertEqual(expected_out.size()[1], out.size()[1])
121
- self.assertTrue(allclose(out, expected_out))
122
-
123
- # LHS gradient.
124
- grad = stk.ops.to_dense(a.grad)
125
- expected_grad = _mask(a_dense.grad, a.grad)
126
- self.assertEqual(grad.dim(), 2)
127
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
128
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
129
- self.assertTrue(allclose(grad, expected_grad))
130
-
131
- # RHS gradient.
132
- grad = b.grad
133
- expected_grad = bcp.grad
134
- self.assertEqual(grad.dim(), 2)
135
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
136
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
137
- self.assertTrue(allclose(grad, expected_grad))
138
-
139
- def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
140
- # Construct the operands.
141
- a_shape = (k, m) if trans_a else (m, k)
142
- a, acp = _dense_2x(*a_shape, dtype)
143
- b_shape = (n, k) if trans_b else (k, n)
144
- b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
145
-
146
- # Execute the matmul.
147
- out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
148
- expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
149
-
150
- # Compute the gradients w.r.t. the inputs.
151
- expected_out.sum().backward()
152
- out.sum().backward()
153
-
154
- # Validate the results.
155
- self.assertEqual(out.dim(), 2)
156
- self.assertEqual(expected_out.size()[0], out.size()[0])
157
- self.assertEqual(expected_out.size()[1], out.size()[1])
158
- self.assertTrue(allclose(out, expected_out))
159
-
160
- # LHS gradient.
161
- grad = a.grad
162
- expected_grad = acp.grad
163
- self.assertEqual(grad.dim(), 2)
164
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
165
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
166
- self.assertTrue(allclose(grad, expected_grad))
167
-
168
- # RHS gradient.
169
- grad = stk.ops.to_dense(b.grad)
170
- expected_grad = _mask(b_dense.grad, b.grad)
171
- self.assertEqual(grad.dim(), 2)
172
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
173
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
174
- self.assertTrue(allclose(grad, expected_grad))
175
-
176
- def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
177
- # Construct the operands.
178
- a_shape = (k, m) if trans_a else (m, k)
179
- a, acp = _dense_2x(*a_shape, dtype)
180
- b_shape = (n, k) if trans_b else (k, n)
181
- b, bcp = _dense_2x(*b_shape, dtype)
182
- _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
183
-
184
- # Execute the matmul.
185
- out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
186
- expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
187
-
188
- # Compute the gradients w.r.t. the inputs.
189
- expected_out.sum().backward()
190
- stk.ops.sum(out).backward()
191
-
192
- # Validate the results.
193
- out = stk.ops.to_dense(out)
194
- self.assertEqual(out.dim(), 2)
195
- self.assertEqual(expected_out.size()[0], out.size()[0])
196
- self.assertEqual(expected_out.size()[1], out.size()[1])
197
- self.assertTrue(allclose(out, expected_out))
198
-
199
- # LHS gradient.
200
- grad = a.grad
201
- expected_grad = acp.grad
202
- self.assertEqual(grad.dim(), 2)
203
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
204
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
205
- self.assertTrue(allclose(grad, expected_grad))
206
-
207
- # RHS gradient.
208
- grad = b.grad
209
- expected_grad = bcp.grad
210
- self.assertEqual(grad.dim(), 2)
211
- self.assertEqual(expected_grad.size()[0], grad.size()[0])
212
- self.assertEqual(expected_grad.size()[1], grad.size()[1])
213
- self.assertTrue(allclose(grad, expected_grad))
214
 
215
  if __name__ == '__main__':
216
  unittest.main()
 
2
  import itertools
3
  import numpy as np
4
  import torch
5
+ # from absl.testing import parameterized
6
 
7
  import stk
8
 
 
96
  return x * mask
97
 
98
 
99
+ # @parameterized.parameters(*_LINEAR_OP_TESTS)
100
+ # class LinearOpsTest(parameterized.TestCase):
101
+ #
102
+ # def testLinearOps_Dsd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
103
+ # # Construct the operands.
104
+ # a_shape = (k, m) if trans_a else (m, k)
105
+ # a_dense, a = _dense_and_sparse(*a_shape, sparsity, blocking, dtype)
106
+ # b_shape = (n, k) if trans_b else (k, n)
107
+ # b, bcp = _dense_2x(*b_shape, dtype)
108
+ #
109
+ # # Execute the matmul.
110
+ # out = _with_transpose(stk.ops.dsd, a, b, trans_a, trans_b)
111
+ # expected_out = _with_transpose(torch.mm, a_dense, bcp, trans_a, trans_b)
112
+ #
113
+ # # Compute the gradients w.r.t. the inputs.
114
+ # expected_out.sum().backward()
115
+ # out.sum().backward()
116
+ #
117
+ # # Validate the results.
118
+ # self.assertEqual(out.dim(), 2)
119
+ # self.assertEqual(expected_out.size()[0], out.size()[0])
120
+ # self.assertEqual(expected_out.size()[1], out.size()[1])
121
+ # self.assertTrue(allclose(out, expected_out))
122
+ #
123
+ # # LHS gradient.
124
+ # grad = stk.ops.to_dense(a.grad)
125
+ # expected_grad = _mask(a_dense.grad, a.grad)
126
+ # self.assertEqual(grad.dim(), 2)
127
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
128
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
129
+ # self.assertTrue(allclose(grad, expected_grad))
130
+ #
131
+ # # RHS gradient.
132
+ # grad = b.grad
133
+ # expected_grad = bcp.grad
134
+ # self.assertEqual(grad.dim(), 2)
135
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
136
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
137
+ # self.assertTrue(allclose(grad, expected_grad))
138
+ #
139
+ # def testLinearOps_Dds(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
140
+ # # Construct the operands.
141
+ # a_shape = (k, m) if trans_a else (m, k)
142
+ # a, acp = _dense_2x(*a_shape, dtype)
143
+ # b_shape = (n, k) if trans_b else (k, n)
144
+ # b_dense, b = _dense_and_sparse(*b_shape, sparsity, blocking, dtype)
145
+ #
146
+ # # Execute the matmul.
147
+ # out = _with_transpose(stk.ops.dds, a, b, trans_a, trans_b)
148
+ # expected_out = _with_transpose(torch.mm, acp, b_dense, trans_a, trans_b)
149
+ #
150
+ # # Compute the gradients w.r.t. the inputs.
151
+ # expected_out.sum().backward()
152
+ # out.sum().backward()
153
+ #
154
+ # # Validate the results.
155
+ # self.assertEqual(out.dim(), 2)
156
+ # self.assertEqual(expected_out.size()[0], out.size()[0])
157
+ # self.assertEqual(expected_out.size()[1], out.size()[1])
158
+ # self.assertTrue(allclose(out, expected_out))
159
+ #
160
+ # # LHS gradient.
161
+ # grad = a.grad
162
+ # expected_grad = acp.grad
163
+ # self.assertEqual(grad.dim(), 2)
164
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
165
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
166
+ # self.assertTrue(allclose(grad, expected_grad))
167
+ #
168
+ # # RHS gradient.
169
+ # grad = stk.ops.to_dense(b.grad)
170
+ # expected_grad = _mask(b_dense.grad, b.grad)
171
+ # self.assertEqual(grad.dim(), 2)
172
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
173
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
174
+ # self.assertTrue(allclose(grad, expected_grad))
175
+ #
176
+ # def testLinearOps_Sdd(self, m, k, n, sparsity, trans_a, trans_b, blocking, dtype):
177
+ # # Construct the operands.
178
+ # a_shape = (k, m) if trans_a else (m, k)
179
+ # a, acp = _dense_2x(*a_shape, dtype)
180
+ # b_shape = (n, k) if trans_b else (k, n)
181
+ # b, bcp = _dense_2x(*b_shape, dtype)
182
+ # _, topo = _dense_and_sparse(m, n, sparsity, blocking, dtype)
183
+ #
184
+ # # Execute the matmul.
185
+ # out = _sparse_out_with_transpose(stk.ops.sdd, a, b, topo, trans_a, trans_b)
186
+ # expected_out = _sparse_out_with_transpose(_mmm, acp, bcp, topo, trans_a, trans_b)
187
+ #
188
+ # # Compute the gradients w.r.t. the inputs.
189
+ # expected_out.sum().backward()
190
+ # stk.ops.sum(out).backward()
191
+ #
192
+ # # Validate the results.
193
+ # out = stk.ops.to_dense(out)
194
+ # self.assertEqual(out.dim(), 2)
195
+ # self.assertEqual(expected_out.size()[0], out.size()[0])
196
+ # self.assertEqual(expected_out.size()[1], out.size()[1])
197
+ # self.assertTrue(allclose(out, expected_out))
198
+ #
199
+ # # LHS gradient.
200
+ # grad = a.grad
201
+ # expected_grad = acp.grad
202
+ # self.assertEqual(grad.dim(), 2)
203
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
204
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
205
+ # self.assertTrue(allclose(grad, expected_grad))
206
+ #
207
+ # # RHS gradient.
208
+ # grad = b.grad
209
+ # expected_grad = bcp.grad
210
+ # self.assertEqual(grad.dim(), 2)
211
+ # self.assertEqual(expected_grad.size()[0], grad.size()[0])
212
+ # self.assertEqual(expected_grad.size()[1], grad.size()[1])
213
+ # self.assertTrue(allclose(grad, expected_grad))
214
 
215
  if __name__ == '__main__':
216
  unittest.main()
build/torch210-cxx11-cu130-x86_64-linux/stk/ops/matrix_ops_test.py CHANGED
@@ -1,61 +1,61 @@
1
  import unittest
2
 
3
- from absl.testing import parameterized
4
  import stk
5
  import torch
6
 
7
 
8
- @parameterized.parameters(
9
- (8, 16, 0.0, 1),
10
- (8, 16, 0.5, 1),
11
- (8, 16, .95, 1),
12
- (16, 8, 0.0, 1),
13
- (16, 8, 0.5, 1),
14
- (16, 8, .95, 1),
15
- (8, 16, 0.0, 8),
16
- (8, 16, 0.5, 8),
17
- (8, 16, 1.0, 8),
18
- (16, 8, 0.0, 8),
19
- (16, 8, 0.5, 8),
20
- (16, 8, 1.0, 8),
21
- (128, 256, 0.5, 16),
22
- (256, 128, 0.75, 32),
23
- (512, 512, .875, 128))
24
- class MatrixOpsTest(parameterized.TestCase):
25
-
26
- def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
27
- mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
28
- x = (torch.randn(rows, cols) * mask).type(torch.float16)
29
-
30
- # Convert the matrix to sparse format.
31
- sparse_x = stk.ops.to_sparse(x, blocking)
32
-
33
- # Validate the matrix.
34
- sparse_x.validate()
35
-
36
- # Validate the shape.
37
- self.assertEqual(sparse_x.dim(), 2)
38
- self.assertEqual(sparse_x.size()[0], rows)
39
- self.assertEqual(sparse_x.size()[1], cols)
40
-
41
- # Validate the sparsity.
42
- numblocks = rows // blocking * cols // blocking
43
- nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
44
- self.assertEqual(sparse_x.nnz, nnz)
45
-
46
- # Convert back to dense format.
47
- dense_x = stk.ops.to_dense(sparse_x)
48
-
49
- # Validate the shape.
50
- self.assertEqual(dense_x.dim(), 2)
51
- self.assertEqual(dense_x.size()[0], rows)
52
- self.assertEqual(dense_x.size()[1], cols)
53
-
54
- # Validate the sparsity
55
- self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
56
-
57
- # Validate the output.
58
- self.assertTrue(torch.all(torch.eq(x, dense_x)))
59
 
60
 
61
  if __name__ == '__main__':
 
1
  import unittest
2
 
3
+ # from absl.testing import parameterized
4
  import stk
5
  import torch
6
 
7
 
8
+ # @parameterized.parameters(
9
+ # (8, 16, 0.0, 1),
10
+ # (8, 16, 0.5, 1),
11
+ # (8, 16, .95, 1),
12
+ # (16, 8, 0.0, 1),
13
+ # (16, 8, 0.5, 1),
14
+ # (16, 8, .95, 1),
15
+ # (8, 16, 0.0, 8),
16
+ # (8, 16, 0.5, 8),
17
+ # (8, 16, 1.0, 8),
18
+ # (16, 8, 0.0, 8),
19
+ # (16, 8, 0.5, 8),
20
+ # (16, 8, 1.0, 8),
21
+ # (128, 256, 0.5, 16),
22
+ # (256, 128, 0.75, 32),
23
+ # (512, 512, .875, 128))
24
+ # class MatrixOpsTest(parameterized.TestCase):
25
+ #
26
+ # def testMatrixOps_FormatConversion(self, rows, cols, sparsity, blocking):
27
+ # mask = stk.random.dense_mask(rows, cols, sparsity, blocking)
28
+ # x = (torch.randn(rows, cols) * mask).type(torch.float16)
29
+ #
30
+ # # Convert the matrix to sparse format.
31
+ # sparse_x = stk.ops.to_sparse(x, blocking)
32
+ #
33
+ # # Validate the matrix.
34
+ # sparse_x.validate()
35
+ #
36
+ # # Validate the shape.
37
+ # self.assertEqual(sparse_x.dim(), 2)
38
+ # self.assertEqual(sparse_x.size()[0], rows)
39
+ # self.assertEqual(sparse_x.size()[1], cols)
40
+ #
41
+ # # Validate the sparsity.
42
+ # numblocks = rows // blocking * cols // blocking
43
+ # nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
44
+ # self.assertEqual(sparse_x.nnz, nnz)
45
+ #
46
+ # # Convert back to dense format.
47
+ # dense_x = stk.ops.to_dense(sparse_x)
48
+ #
49
+ # # Validate the shape.
50
+ # self.assertEqual(dense_x.dim(), 2)
51
+ # self.assertEqual(dense_x.size()[0], rows)
52
+ # self.assertEqual(dense_x.size()[1], cols)
53
+ #
54
+ # # Validate the sparsity
55
+ # self.assertEqual(torch.count_nonzero(dense_x).item(), nnz)
56
+ #
57
+ # # Validate the output.
58
+ # self.assertTrue(torch.all(torch.eq(x, dense_x)))
59
 
60
 
61
  if __name__ == '__main__':
build/torch210-cxx11-cu130-x86_64-linux/stk/random/random_ops_test.py CHANGED
@@ -1,72 +1,72 @@
1
  import unittest
2
 
3
- from absl.testing import parameterized
4
  from . import random
5
  import torch
6
 
7
 
8
- @parameterized.parameters(
9
- (8, 16, 0.0, 1),
10
- (8, 16, 0.5, 1),
11
- (8, 16, .95, 1),
12
- (16, 8, 0.0, 1),
13
- (16, 8, 0.5, 1),
14
- (16, 8, .95, 1),
15
- (8, 16, 0.0, 8),
16
- (8, 16, 0.5, 8),
17
- (8, 16, 1.0, 8),
18
- (16, 8, 0.0, 8),
19
- (16, 8, 0.5, 8),
20
- (16, 8, 1.0, 8),
21
- (128, 256, 0.5, 16),
22
- (256, 128, 0.75, 32),
23
- (512, 512, .875, 128))
24
- class RandomOpsTest(parameterized.TestCase):
25
-
26
- def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
27
- mask = random.dense_mask(
28
- rows, cols, sparsity, blocking)
29
-
30
- # Validate the shape.
31
- self.assertEqual(mask.dim(), 2)
32
- self.assertEqual(mask.size()[0], rows)
33
- self.assertEqual(mask.size()[1], cols)
34
-
35
- # Validate the sparsity
36
- numblocks = rows // blocking * cols // blocking
37
- nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
38
- self.assertEqual(
39
- torch.count_nonzero(mask).item(),
40
- nnz)
41
-
42
- # Check values are zero or one.
43
- self.assertTrue(
44
- torch.all(torch.logical_or(
45
- torch.eq(mask, 0),
46
- torch.eq(mask, 1))))
47
-
48
- def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
49
- mask = random.mask(
50
- rows, cols, sparsity, blocking)
51
-
52
- # Validate the matrix.
53
- mask.validate()
54
-
55
- # Validate the shape.
56
- self.assertEqual(mask.dim(), 2)
57
- self.assertEqual(mask.size()[0], rows)
58
- self.assertEqual(mask.size()[1], cols)
59
-
60
- # Validate the sparsity.
61
- numblocks = rows // blocking * cols // blocking
62
- nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
63
- self.assertEqual(mask.nnz, nnz)
64
-
65
- # Check values are zero or one.
66
- self.assertTrue(
67
- torch.all(torch.logical_or(
68
- torch.eq(mask.data, 0),
69
- torch.eq(mask.data, 1))))
70
 
71
 
72
  if __name__ == '__main__':
 
1
  import unittest
2
 
3
+ # from absl.testing import parameterized
4
  from . import random
5
  import torch
6
 
7
 
8
+ # @parameterized.parameters(
9
+ # (8, 16, 0.0, 1),
10
+ # (8, 16, 0.5, 1),
11
+ # (8, 16, .95, 1),
12
+ # (16, 8, 0.0, 1),
13
+ # (16, 8, 0.5, 1),
14
+ # (16, 8, .95, 1),
15
+ # (8, 16, 0.0, 8),
16
+ # (8, 16, 0.5, 8),
17
+ # (8, 16, 1.0, 8),
18
+ # (16, 8, 0.0, 8),
19
+ # (16, 8, 0.5, 8),
20
+ # (16, 8, 1.0, 8),
21
+ # (128, 256, 0.5, 16),
22
+ # (256, 128, 0.75, 32),
23
+ # (512, 512, .875, 128))
24
+ # class RandomOpsTest(parameterized.TestCase):
25
+ #
26
+ # def testRandomOps_DenseMask(self, rows, cols, sparsity, blocking):
27
+ # mask = random.dense_mask(
28
+ # rows, cols, sparsity, blocking)
29
+ #
30
+ # # Validate the shape.
31
+ # self.assertEqual(mask.dim(), 2)
32
+ # self.assertEqual(mask.size()[0], rows)
33
+ # self.assertEqual(mask.size()[1], cols)
34
+ #
35
+ # # Validate the sparsity
36
+ # numblocks = rows // blocking * cols // blocking
37
+ # nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
38
+ # self.assertEqual(
39
+ # torch.count_nonzero(mask).item(),
40
+ # nnz)
41
+ #
42
+ # # Check values are zero or one.
43
+ # self.assertTrue(
44
+ # torch.all(torch.logical_or(
45
+ # torch.eq(mask, 0),
46
+ # torch.eq(mask, 1))))
47
+ #
48
+ # def testRandomOps_SparseMask(self, rows, cols, sparsity, blocking):
49
+ # mask = random.mask(
50
+ # rows, cols, sparsity, blocking)
51
+ #
52
+ # # Validate the matrix.
53
+ # mask.validate()
54
+ #
55
+ # # Validate the shape.
56
+ # self.assertEqual(mask.dim(), 2)
57
+ # self.assertEqual(mask.size()[0], rows)
58
+ # self.assertEqual(mask.size()[1], cols)
59
+ #
60
+ # # Validate the sparsity.
61
+ # numblocks = rows // blocking * cols // blocking
62
+ # nnz = round(numblocks * (1 - sparsity)) * blocking ** 2
63
+ # self.assertEqual(mask.nnz, nnz)
64
+ #
65
+ # # Check values are zero or one.
66
+ # self.assertTrue(
67
+ # torch.all(torch.logical_or(
68
+ # torch.eq(mask.data, 0),
69
+ # torch.eq(mask.data, 1))))
70
 
71
 
72
  if __name__ == '__main__':
build/torch210-cxx11-xpu20253-x86_64-linux/{_megablocks_xpu_6e04dec.abi3.so → _megablocks_xpu_a45325d.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:46cfa6050944b0bd6daeaf4848fe5393a68397ae29a5c7f0a04280e287cb0e7d
3
  size 5381760
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:929e28d44de28c212187ee2c71b8427c84a7372157ee7bc815e7e0e1941a9f40
3
  size 5381760