drbh committed
Commit 2059e46 · unverified · 0 Parent(s)

Migrated from kernels-community/mra

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the rest.
Files changed (50)
  1. .gitattributes +116 -0
  2. README.md +16 -0
  3. benchmarks/benchmark.py +128 -0
  4. build.toml +20 -0
  5. build/torch210-cu128-x86_64-windows/__init__.py +25 -0
  6. build/torch210-cu128-x86_64-windows/_mra_cuda_6ec000c.pyd +3 -0
  7. build/torch210-cu128-x86_64-windows/_ops.py +9 -0
  8. build/torch210-cu128-x86_64-windows/metadata.json +20 -0
  9. build/torch210-cu128-x86_64-windows/mra/__init__.py +26 -0
  10. build/torch210-cxx11-cu126-aarch64-linux/__init__.py +25 -0
  11. build/torch210-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so +3 -0
  12. build/torch210-cxx11-cu126-aarch64-linux/_ops.py +9 -0
  13. build/torch210-cxx11-cu126-aarch64-linux/metadata.json +17 -0
  14. build/torch210-cxx11-cu126-aarch64-linux/mra/__init__.py +26 -0
  15. build/torch210-cxx11-cu126-x86_64-linux/__init__.py +25 -0
  16. build/torch210-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so +3 -0
  17. build/torch210-cxx11-cu126-x86_64-linux/_ops.py +9 -0
  18. build/torch210-cxx11-cu126-x86_64-linux/metadata.json +17 -0
  19. build/torch210-cxx11-cu126-x86_64-linux/mra/__init__.py +26 -0
  20. build/torch210-cxx11-cu128-aarch64-linux/__init__.py +25 -0
  21. build/torch210-cxx11-cu128-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so +3 -0
  22. build/torch210-cxx11-cu128-aarch64-linux/_ops.py +9 -0
  23. build/torch210-cxx11-cu128-aarch64-linux/metadata.json +20 -0
  24. build/torch210-cxx11-cu128-aarch64-linux/mra/__init__.py +26 -0
  25. build/torch210-cxx11-cu128-x86_64-linux/__init__.py +25 -0
  26. build/torch210-cxx11-cu128-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so +3 -0
  27. build/torch210-cxx11-cu128-x86_64-linux/_ops.py +9 -0
  28. build/torch210-cxx11-cu128-x86_64-linux/metadata.json +20 -0
  29. build/torch210-cxx11-cu128-x86_64-linux/mra/__init__.py +26 -0
  30. build/torch210-cxx11-cu130-aarch64-linux/__init__.py +25 -0
  31. build/torch210-cxx11-cu130-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so +3 -0
  32. build/torch210-cxx11-cu130-aarch64-linux/_ops.py +9 -0
  33. build/torch210-cxx11-cu130-aarch64-linux/metadata.json +18 -0
  34. build/torch210-cxx11-cu130-aarch64-linux/mra/__init__.py +26 -0
  35. build/torch210-cxx11-cu130-x86_64-linux/__init__.py +25 -0
  36. build/torch210-cxx11-cu130-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so +3 -0
  37. build/torch210-cxx11-cu130-x86_64-linux/_ops.py +9 -0
  38. build/torch210-cxx11-cu130-x86_64-linux/metadata.json +18 -0
  39. build/torch210-cxx11-cu130-x86_64-linux/mra/__init__.py +26 -0
  40. build/torch211-cxx11-cu126-aarch64-linux/__init__.py +25 -0
  41. build/torch211-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so +3 -0
  42. build/torch211-cxx11-cu126-aarch64-linux/_ops.py +9 -0
  43. build/torch211-cxx11-cu126-aarch64-linux/metadata.json +17 -0
  44. build/torch211-cxx11-cu126-aarch64-linux/mra/__init__.py +26 -0
  45. build/torch211-cxx11-cu126-x86_64-linux/__init__.py +25 -0
  46. build/torch211-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so +3 -0
  47. build/torch211-cxx11-cu126-x86_64-linux/_ops.py +9 -0
  48. build/torch211-cxx11-cu126-x86_64-linux/metadata.json +17 -0
  49. build/torch211-cxx11-cu126-x86_64-linux/mra/__init__.py +26 -0
  50. build/torch211-cxx11-cu128-aarch64-linux/__init__.py +25 -0
.gitattributes ADDED
@@ -0,0 +1,116 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ build/torch27-cxx11-cu118-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch27-cxx11-cu126-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch27-cxx11-cu128-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu126-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu128-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu129-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch27-cxx11-cu118-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch27-cxx11-cu126-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch27-cxx11-cu128-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu126-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu128-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu129-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu126-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu128-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu130-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu126-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu128-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu129-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu126-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu128-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu130-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu126-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu128-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu129-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu126-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu128-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu130-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu126-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu128-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu129-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu126-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu128-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu130-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu126-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu128-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu129-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu126-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu128-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu130-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu126-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu128-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu130-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cu128-x86_64-windows/_mra_cuda_6ec000c.pyd filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu128-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu130-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch211-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch211-cxx11-cu128-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch211-cxx11-cu130-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu129-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu128-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu130-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch211-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch211-cxx11-cu128-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch211-cxx11-cu130-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu129-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,16 @@
+ ---
+ tags:
+ - kernels
+ - cuda
+ ---
+ MRA kernels for transformers
+ ### Performance
+
+ <img class="dark:hidden border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_light_animation.svg" />
+ <img class="hidden dark:block border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_dark_animation.svg" />
+
+ <img class="dark:hidden border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_light_latency.svg" />
+ <img class="hidden dark:block border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_dark_latency.svg" />
+
+ <img class="dark:hidden border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_light_throughput.svg" />
+ <img class="hidden dark:block border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_dark_throughput.svg" />
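A minimal usage sketch (not part of the commit): loading this kernel through the Hugging Face `kernels` library and calling `mm_to_sparse` with the same shapes the benchmark below uses. The repo id is assumed from the migration note above.

```python
import torch
from kernels import get_kernel

# Assumed repo id, per "Migrated from kernels-community/mra".
mra = get_kernel("kernels-community/mra")

batch_heads, num_block, head_dim, block = 16, 4, 64, 32
dense_a = torch.randn(batch_heads, num_block, head_dim, block, device="cuda")
dense_b = torch.randn(batch_heads, num_block, head_dim, block, device="cuda")
# Each index flattens an (A block, B block) pair: AB_idx = A_idx * B_num_block + B_idx.
indices = torch.randint(
    0, num_block * num_block, (batch_heads, 4), device="cuda", dtype=torch.int32
)

sparse_c = mra.mm_to_sparse(dense_a, dense_b, indices)  # -> (16, 4, 32, 32)
```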
benchmarks/benchmark.py ADDED
@@ -0,0 +1,128 @@
+ import torch
+
+ from kernels.benchmark import Benchmark
+
+
+ def mm_to_sparse_reference(
+     dense_A: torch.Tensor,
+     dense_B: torch.Tensor,
+     indices: torch.Tensor,
+ ) -> torch.Tensor:
+     batch_size = dense_A.size(0)
+     A_num_block = dense_A.size(1)
+     B_num_block = dense_B.size(1)
+     dim = dense_A.size(2)
+     num_block = indices.size(1)
+
+     # Output: (batch_size, num_block, 32, 32)
+     sparse_C = torch.zeros(
+         batch_size, num_block, 32, 32, device=dense_A.device, dtype=dense_A.dtype
+     )
+
+     for b in range(batch_size):
+         for blk in range(num_block):
+             AB_idx = indices[b, blk].item()
+             A_idx = AB_idx // B_num_block
+             B_idx = AB_idx % B_num_block
+
+             A_block = dense_A[b, A_idx]  # (dim, 32)
+             B_block = dense_B[b, B_idx]  # (dim, 32)
+
+             # Kernel computes C = B.T @ A: (32, dim) @ (dim, 32) = (32, 32)
+             sparse_C[b, blk] = B_block.T @ A_block
+
+     return sparse_C
+
+
+ class MRABenchmark(Benchmark):
+     seed: int = 42
+
+     def setup(self):
+         # Config matching the kernel's expected format
+         batch_size = 2
+         num_heads = 8
+         head_dim = 64
+         block_size = 32  # Fixed by kernel
+
+         A_num_block = 4
+         B_num_block = 4
+         total_blocks = A_num_block * B_num_block
+         indices_per_block = 4  # Must be divisible by 4
+
+         self.batch_heads = batch_size * num_heads
+
+         # dense_A: [batch_size, A_num_block, dim, 32]
+         self.dense_a = torch.randn(
+             self.batch_heads,
+             A_num_block,
+             head_dim,
+             block_size,
+             device=self.device,
+             dtype=torch.float32,
+         )
+         # dense_B: [batch_size, B_num_block, dim, 32]
+         self.dense_b = torch.randn(
+             self.batch_heads,
+             B_num_block,
+             head_dim,
+             block_size,
+             device=self.device,
+             dtype=torch.float32,
+         )
+         # indices: [batch_size, num_block]
+         self.indices = torch.randint(
+             0,
+             total_blocks,
+             (self.batch_heads, indices_per_block),
+             device=self.device,
+             dtype=torch.int32,
+         )
+
+     def benchmark_base(self):
+         self.out = self.kernel.mm_to_sparse(self.dense_a, self.dense_b, self.indices)
+
+     def verify_base(self) -> torch.Tensor:
+         return mm_to_sparse_reference(self.dense_a, self.dense_b, self.indices)
+
+     def setup_large(self):
+         batch_size = 4
+         num_heads = 8
+         head_dim = 64
+         block_size = 32
+
+         A_num_block = 8
+         B_num_block = 8
+         total_blocks = A_num_block * B_num_block
+         indices_per_block = 8  # Must be divisible by 4
+
+         self.batch_heads = batch_size * num_heads
+
+         self.dense_a = torch.randn(
+             self.batch_heads,
+             A_num_block,
+             head_dim,
+             block_size,
+             device=self.device,
+             dtype=torch.float32,
+         )
+         self.dense_b = torch.randn(
+             self.batch_heads,
+             B_num_block,
+             head_dim,
+             block_size,
+             device=self.device,
+             dtype=torch.float32,
+         )
+         self.indices = torch.randint(
+             0,
+             total_blocks,
+             (self.batch_heads, indices_per_block),
+             device=self.device,
+             dtype=torch.int32,
+         )
+
+     def benchmark_large(self):
+         self.out = self.kernel.mm_to_sparse(self.dense_a, self.dense_b, self.indices)
+
+     def verify_large(self) -> torch.Tensor:
+         return mm_to_sparse_reference(self.dense_a, self.dense_b, self.indices)
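The reference above pins down the kernel's index convention: each flattened `AB_idx` decodes to `A_idx = AB_idx // B_num_block` and `B_idx = AB_idx % B_num_block`, and each output block is `B_block.T @ A_block`. A hedged sketch of a standalone correctness check built on that reference (`mra` is assumed to expose `mm_to_sparse` as in the package `__init__.py` files below):

```python
import torch

def check_mm_to_sparse(mra, device: str = "cuda") -> None:
    # Small case: 4 batch-heads, a 4x4 block grid, dim 64, 32x32 output blocks.
    a = torch.randn(4, 4, 64, 32, device=device)
    b = torch.randn(4, 4, 64, 32, device=device)
    idx = torch.randint(0, 16, (4, 4), device=device, dtype=torch.int32)
    out = mra.mm_to_sparse(a, b, idx)
    ref = mm_to_sparse_reference(a, b, idx)  # the reference defined above
    torch.testing.assert_close(out, ref, rtol=1e-4, atol=1e-4)
```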
build.toml ADDED
@@ -0,0 +1,20 @@
+ [general]
+ name = "mra"
+ universal = false
+
+ [torch]
+ src = [
+   "torch-ext/torch_binding.cpp",
+   "torch-ext/cuda_launch.h",
+ ]
+
+
+ [kernel.mra]
+ backend = "cuda"
+ depends = ["torch"]
+ src = [
+   "mra/cuda_kernel.cu",
+   "mra/cuda_kernel.h",
+   "mra/cuda_launch.cu",
+   "mra/cuda_launch.h",
+ ]
build/torch210-cu128-x86_64-windows/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch210-cu128-x86_64-windows/_mra_cuda_6ec000c.pyd ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa6a072526b11ba258ee3c95711b1582a501a40829c22bbd62b493730faee0ee
+ size 795648
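The three lines above are a Git LFS pointer, not the binary itself; the `.pyd` is resolved by its SHA-256 at checkout. For illustration, a tiny parser for this pointer format:

```python
def parse_lfs_pointer(text: str) -> dict:
    # Each pointer line is "key value", e.g. "size 795648".
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "oid": fields["oid"].removeprefix("sha256:"),
        "size": int(fields["size"]),
    }
```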
build/torch210-cu128-x86_64-windows/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_6ec000c
+ ops = torch.ops._mra_cuda_6ec000c
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_6ec000c::{op_name}"
build/torch210-cu128-x86_64-windows/metadata.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "10.0",
+       "10.1",
+       "12.0+PTX",
+       "7.0",
+       "7.2",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0"
+     ]
+   }
+ }
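The `archs` list records the compute capabilities this variant was compiled for; a `+PTX` suffix marks the entry that also embeds forward-compatible PTX. A hedged sketch of checking the running GPU against such a list:

```python
import json
from pathlib import Path

import torch

meta = json.loads(Path("build/torch210-cu128-x86_64-windows/metadata.json").read_text())
archs = {a.removesuffix("+PTX") for a in meta["backend"]["archs"]}
major, minor = torch.cuda.get_device_capability()
cap = f"{major}.{minor}"
print(cap, "has native SASS" if cap in archs else "would rely on PTX JIT")
```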
build/torch210-cu128-x86_64-windows/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import sys
+
+ import importlib
+ from pathlib import Path
+ from types import ModuleType
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
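An illustration of the loader's naming trick above: the `sys.modules` key is the hex-encoded, unsigned hash of the absolute file path, so identical `__init__.py` files shipped in different build variants get distinct module entries instead of clobbering one another.

```python
import ctypes
from pathlib import Path

def variant_module_name(path: str) -> str:
    # Same scheme as _import_from_path: unsigned hash of the absolute path, in hex.
    return "{:x}".format(ctypes.c_size_t(hash(Path(path).absolute())).value)

n1 = variant_module_name("build/torch210-cxx11-cu126-x86_64-linux/__init__.py")
n2 = variant_module_name("build/torch210-cxx11-cu128-x86_64-linux/__init__.py")
assert n1 != n2  # distinct paths yield distinct names (barring a hash collision)
```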
build/torch210-cxx11-cu126-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch210-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de75db12cb29ce706eba61ef07d7e74f00deea71749fdd8b7bf2d56bf7178105
+ size 2567952
build/torch210-cxx11-cu126-aarch64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_c1eaa2d
+ ops = torch.ops._mra_cuda_c1eaa2d
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_c1eaa2d::{op_name}"
build/torch210-cxx11-cu126-aarch64-linux/metadata.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "7.0",
+       "7.2",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0+PTX"
+     ]
+   }
+ }
build/torch210-cxx11-cu126-aarch64-linux/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import importlib.util
+ import sys
+ from pathlib import Path
+ from types import ModuleType
+
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu126-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch210-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7cc021351bfa4e923b15d186877cddf3d935d6223a369f40ffabb12507536e90
+ size 2451480
build/torch210-cxx11-cu126-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_c1eaa2d
+ ops = torch.ops._mra_cuda_c1eaa2d
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_c1eaa2d::{op_name}"
build/torch210-cxx11-cu126-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "7.0",
+       "7.2",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0+PTX"
+     ]
+   }
+ }
build/torch210-cxx11-cu126-x86_64-linux/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import importlib.util
+ import sys
+ from pathlib import Path
+ from types import ModuleType
+
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu128-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch210-cxx11-cu128-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5c94fe47bd01e60165517510cb90d9f8c1afa4b8092c7a7a25ef971c73a11f41
+ size 2830296
build/torch210-cxx11-cu128-aarch64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_c1eaa2d
+ ops = torch.ops._mra_cuda_c1eaa2d
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_c1eaa2d::{op_name}"
build/torch210-cxx11-cu128-aarch64-linux/metadata.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "10.0",
+       "10.1",
+       "12.0+PTX",
+       "7.0",
+       "7.2",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0"
+     ]
+   }
+ }
build/torch210-cxx11-cu128-aarch64-linux/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import importlib.util
+ import sys
+ from pathlib import Path
+ from types import ModuleType
+
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu128-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch210-cxx11-cu128-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b1ce65f7d848240c848986a70ec25bc6bf1bc53c3046df1461649630afb81f8
+ size 2719848
build/torch210-cxx11-cu128-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_c1eaa2d
+ ops = torch.ops._mra_cuda_c1eaa2d
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_c1eaa2d::{op_name}"
build/torch210-cxx11-cu128-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "10.0",
+       "10.1",
+       "12.0+PTX",
+       "7.0",
+       "7.2",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0"
+     ]
+   }
+ }
build/torch210-cxx11-cu128-x86_64-linux/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import importlib.util
+ import sys
+ from pathlib import Path
+ from types import ModuleType
+
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu130-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch210-cxx11-cu130-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1e26fb0737c8f8451d052d2514c36d64150212470214009acf0493b5862fe80
+ size 2767768
build/torch210-cxx11-cu130-aarch64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_c1eaa2d
+ ops = torch.ops._mra_cuda_c1eaa2d
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_c1eaa2d::{op_name}"
build/torch210-cxx11-cu130-aarch64-linux/metadata.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "10.0",
+       "11.0",
+       "12.0+PTX",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0"
+     ]
+   }
+ }
build/torch210-cxx11-cu130-aarch64-linux/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import importlib.util
+ import sys
+ from pathlib import Path
+ from types import ModuleType
+
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu130-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch210-cxx11-cu130-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:26e6338feb8e2e4589397574e56ccf8b1e2761714e6ae0b5a474030b9e95f4f5
+ size 2641368
build/torch210-cxx11-cu130-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_c1eaa2d
+ ops = torch.ops._mra_cuda_c1eaa2d
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_c1eaa2d::{op_name}"
build/torch210-cxx11-cu130-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "10.0",
+       "11.0",
+       "12.0+PTX",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0"
+     ]
+   }
+ }
build/torch210-cxx11-cu130-x86_64-linux/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import importlib.util
+ import sys
+ from pathlib import Path
+ from types import ModuleType
+
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch211-cxx11-cu126-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch211-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eb19769c43d841448daf6deb84ff8358cef905b1df26aed4d60bf38b1ab819e0
+ size 2567952
build/torch211-cxx11-cu126-aarch64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_c1eaa2d
+ ops = torch.ops._mra_cuda_c1eaa2d
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_c1eaa2d::{op_name}"
build/torch211-cxx11-cu126-aarch64-linux/metadata.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "7.0",
+       "7.2",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0+PTX"
+     ]
+   }
+ }
build/torch211-cxx11-cu126-aarch64-linux/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import importlib.util
+ import sys
+ from pathlib import Path
+ from types import ModuleType
+
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch211-cxx11-cu126-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch211-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5dd2ac9defcbaf5d03db15bc1bd55476e4520c3eb91b157a6f2488d37a16f011
+ size 2451480
build/torch211-cxx11-cu126-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_c1eaa2d
+ ops = torch.ops._mra_cuda_c1eaa2d
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_c1eaa2d::{op_name}"
build/torch211-cxx11-cu126-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "7.0",
+       "7.2",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0+PTX"
+     ]
+   }
+ }
build/torch211-cxx11-cu126-x86_64-linux/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import importlib.util
+ import sys
+ from pathlib import Path
+ from types import ModuleType
+
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch211-cxx11-cu128-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]