drbh committed
Commit 2059e46 · unverified · 0 Parent(s)

Migrated from kernels-community/mra

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the rest.
Files changed (50)
  1. .gitattributes +116 -0
  2. README.md +16 -0
  3. benchmarks/benchmark.py +128 -0
  4. build.toml +20 -0
  5. build/torch210-cu128-x86_64-windows/__init__.py +25 -0
  6. build/torch210-cu128-x86_64-windows/_mra_cuda_6ec000c.pyd +3 -0
  7. build/torch210-cu128-x86_64-windows/_ops.py +9 -0
  8. build/torch210-cu128-x86_64-windows/metadata.json +20 -0
  9. build/torch210-cu128-x86_64-windows/mra/__init__.py +26 -0
  10. build/torch210-cxx11-cu126-aarch64-linux/__init__.py +25 -0
  11. build/torch210-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so +3 -0
  12. build/torch210-cxx11-cu126-aarch64-linux/_ops.py +9 -0
  13. build/torch210-cxx11-cu126-aarch64-linux/metadata.json +17 -0
  14. build/torch210-cxx11-cu126-aarch64-linux/mra/__init__.py +26 -0
  15. build/torch210-cxx11-cu126-x86_64-linux/__init__.py +25 -0
  16. build/torch210-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so +3 -0
  17. build/torch210-cxx11-cu126-x86_64-linux/_ops.py +9 -0
  18. build/torch210-cxx11-cu126-x86_64-linux/metadata.json +17 -0
  19. build/torch210-cxx11-cu126-x86_64-linux/mra/__init__.py +26 -0
  20. build/torch210-cxx11-cu128-aarch64-linux/__init__.py +25 -0
  21. build/torch210-cxx11-cu128-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so +3 -0
  22. build/torch210-cxx11-cu128-aarch64-linux/_ops.py +9 -0
  23. build/torch210-cxx11-cu128-aarch64-linux/metadata.json +20 -0
  24. build/torch210-cxx11-cu128-aarch64-linux/mra/__init__.py +26 -0
  25. build/torch210-cxx11-cu128-x86_64-linux/__init__.py +25 -0
  26. build/torch210-cxx11-cu128-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so +3 -0
  27. build/torch210-cxx11-cu128-x86_64-linux/_ops.py +9 -0
  28. build/torch210-cxx11-cu128-x86_64-linux/metadata.json +20 -0
  29. build/torch210-cxx11-cu128-x86_64-linux/mra/__init__.py +26 -0
  30. build/torch210-cxx11-cu130-aarch64-linux/__init__.py +25 -0
  31. build/torch210-cxx11-cu130-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so +3 -0
  32. build/torch210-cxx11-cu130-aarch64-linux/_ops.py +9 -0
  33. build/torch210-cxx11-cu130-aarch64-linux/metadata.json +18 -0
  34. build/torch210-cxx11-cu130-aarch64-linux/mra/__init__.py +26 -0
  35. build/torch210-cxx11-cu130-x86_64-linux/__init__.py +25 -0
  36. build/torch210-cxx11-cu130-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so +3 -0
  37. build/torch210-cxx11-cu130-x86_64-linux/_ops.py +9 -0
  38. build/torch210-cxx11-cu130-x86_64-linux/metadata.json +18 -0
  39. build/torch210-cxx11-cu130-x86_64-linux/mra/__init__.py +26 -0
  40. build/torch211-cxx11-cu126-aarch64-linux/__init__.py +25 -0
  41. build/torch211-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so +3 -0
  42. build/torch211-cxx11-cu126-aarch64-linux/_ops.py +9 -0
  43. build/torch211-cxx11-cu126-aarch64-linux/metadata.json +17 -0
  44. build/torch211-cxx11-cu126-aarch64-linux/mra/__init__.py +26 -0
  45. build/torch211-cxx11-cu126-x86_64-linux/__init__.py +25 -0
  46. build/torch211-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so +3 -0
  47. build/torch211-cxx11-cu126-x86_64-linux/_ops.py +9 -0
  48. build/torch211-cxx11-cu126-x86_64-linux/metadata.json +17 -0
  49. build/torch211-cxx11-cu126-x86_64-linux/mra/__init__.py +26 -0
  50. build/torch211-cxx11-cu128-aarch64-linux/__init__.py +25 -0
.gitattributes ADDED
@@ -0,0 +1,116 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ build/torch27-cxx11-cu118-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch27-cxx11-cu126-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch27-cxx11-cu128-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu126-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu128-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu129-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-x86_64-linux/mra/_mra_e8307c7_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch27-cxx11-cu118-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch27-cxx11-cu126-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch27-cxx11-cu128-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu126-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu128-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu129-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-x86_64-linux/mra/_mra_9e0f4db.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu126-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu128-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu130-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu126-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu128-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu129-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-x86_64-linux/_mra_b91b835.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu126-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu128-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu130-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu126-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu128-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu129-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-x86_64-linux/_mra_c02bdb1.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu126-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu128-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu130-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu126-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu128-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu129-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-x86_64-linux/_mra_7f45e67.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu126-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu128-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu130-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu126-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu128-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch28-cxx11-cu129-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-x86_64-linux/_mra_41ac1dc.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu126-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu128-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu130-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-x86_64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu126-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu128-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu130-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu126-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu128-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu130-aarch64-linux/_mra_cuda_8d73b81.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cu128-x86_64-windows/_mra_cuda_6ec000c.pyd filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu128-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu130-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch211-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch211-cxx11-cu128-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch211-cxx11-cu130-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu129-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu128-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch210-cxx11-cu130-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch211-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch211-cxx11-cu128-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch211-cxx11-cu130-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
+ build/torch29-cxx11-cu129-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,16 @@
+ ---
+ tags:
+ - kernels
+ - cuda
+ ---
+ MRA kernels for transformers
+ ### Performance
+
+ <img class="dark:hidden border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_light_animation.svg" />
+ <img class="hidden dark:block border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_dark_animation.svg" />
+
+ <img class="dark:hidden border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_light_latency.svg" />
+ <img class="hidden dark:block border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_dark_latency.svg" />
+
+ <img class="dark:hidden border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_light_throughput.svg" />
+ <img class="hidden dark:block border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_dark_throughput.svg" />
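A minimal usage sketch (not part of the commit): loading this kernel through the Hugging Face `kernels` library and calling `mm_to_sparse` with the same shapes the benchmark below uses. The repo id is assumed from the migration note above.

```python
import torch
from kernels import get_kernel

# Assumed repo id, per "Migrated from kernels-community/mra".
mra = get_kernel("kernels-community/mra")

batch_heads, num_block, head_dim, block = 16, 4, 64, 32
dense_a = torch.randn(batch_heads, num_block, head_dim, block, device="cuda")
dense_b = torch.randn(batch_heads, num_block, head_dim, block, device="cuda")
# Each index flattens an (A block, B block) pair: AB_idx = A_idx * B_num_block + B_idx.
indices = torch.randint(
    0, num_block * num_block, (batch_heads, 4), device="cuda", dtype=torch.int32
)

sparse_c = mra.mm_to_sparse(dense_a, dense_b, indices)  # -> (16, 4, 32, 32)
```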
benchmarks/benchmark.py ADDED
@@ -0,0 +1,128 @@
+ import torch
+
+ from kernels.benchmark import Benchmark
+
+
+ def mm_to_sparse_reference(
+     dense_A: torch.Tensor,
+     dense_B: torch.Tensor,
+     indices: torch.Tensor,
+ ) -> torch.Tensor:
+     batch_size = dense_A.size(0)
+     A_num_block = dense_A.size(1)
+     B_num_block = dense_B.size(1)
+     dim = dense_A.size(2)
+     num_block = indices.size(1)
+
+     # Output: (batch_size, num_block, 32, 32)
+     sparse_C = torch.zeros(
+         batch_size, num_block, 32, 32, device=dense_A.device, dtype=dense_A.dtype
+     )
+
+     for b in range(batch_size):
+         for blk in range(num_block):
+             AB_idx = indices[b, blk].item()
+             A_idx = AB_idx // B_num_block
+             B_idx = AB_idx % B_num_block
+
+             A_block = dense_A[b, A_idx]  # (dim, 32)
+             B_block = dense_B[b, B_idx]  # (dim, 32)
+
+             # Kernel computes C = B.T @ A: (32, dim) @ (dim, 32) = (32, 32)
+             sparse_C[b, blk] = B_block.T @ A_block
+
+     return sparse_C
+
+
+ class MRABenchmark(Benchmark):
+     seed: int = 42
+
+     def setup(self):
+         # Config matching the kernel's expected format
+         batch_size = 2
+         num_heads = 8
+         head_dim = 64
+         block_size = 32  # Fixed by kernel
+
+         A_num_block = 4
+         B_num_block = 4
+         total_blocks = A_num_block * B_num_block
+         indices_per_block = 4  # Must be divisible by 4
+
+         self.batch_heads = batch_size * num_heads
+
+         # dense_A: [batch_size, A_num_block, dim, 32]
+         self.dense_a = torch.randn(
+             self.batch_heads,
+             A_num_block,
+             head_dim,
+             block_size,
+             device=self.device,
+             dtype=torch.float32,
+         )
+         # dense_B: [batch_size, B_num_block, dim, 32]
+         self.dense_b = torch.randn(
+             self.batch_heads,
+             B_num_block,
+             head_dim,
+             block_size,
+             device=self.device,
+             dtype=torch.float32,
+         )
+         # indices: [batch_size, num_block]
+         self.indices = torch.randint(
+             0,
+             total_blocks,
+             (self.batch_heads, indices_per_block),
+             device=self.device,
+             dtype=torch.int32,
+         )
+
+     def benchmark_base(self):
+         self.out = self.kernel.mm_to_sparse(self.dense_a, self.dense_b, self.indices)
+
+     def verify_base(self) -> torch.Tensor:
+         return mm_to_sparse_reference(self.dense_a, self.dense_b, self.indices)
+
+     def setup_large(self):
+         batch_size = 4
+         num_heads = 8
+         head_dim = 64
+         block_size = 32
+
+         A_num_block = 8
+         B_num_block = 8
+         total_blocks = A_num_block * B_num_block
+         indices_per_block = 8  # Must be divisible by 4
+
+         self.batch_heads = batch_size * num_heads
+
+         self.dense_a = torch.randn(
+             self.batch_heads,
+             A_num_block,
+             head_dim,
+             block_size,
+             device=self.device,
+             dtype=torch.float32,
+         )
+         self.dense_b = torch.randn(
+             self.batch_heads,
+             B_num_block,
+             head_dim,
+             block_size,
+             device=self.device,
+             dtype=torch.float32,
+         )
+         self.indices = torch.randint(
+             0,
+             total_blocks,
+             (self.batch_heads, indices_per_block),
+             device=self.device,
+             dtype=torch.int32,
+         )
+
+     def benchmark_large(self):
+         self.out = self.kernel.mm_to_sparse(self.dense_a, self.dense_b, self.indices)
+
+     def verify_large(self) -> torch.Tensor:
+         return mm_to_sparse_reference(self.dense_a, self.dense_b, self.indices)
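The reference above pins down the kernel's index convention: each flattened `AB_idx` decodes to `A_idx = AB_idx // B_num_block` and `B_idx = AB_idx % B_num_block`, and each output block is `B_block.T @ A_block`. A hedged sketch of a standalone correctness check built on that reference (`mra` is assumed to expose `mm_to_sparse` as in the package `__init__.py` files below):

```python
import torch

def check_mm_to_sparse(mra, device: str = "cuda") -> None:
    # Small case: 4 batch-heads, a 4x4 block grid, dim 64, 32x32 output blocks.
    a = torch.randn(4, 4, 64, 32, device=device)
    b = torch.randn(4, 4, 64, 32, device=device)
    idx = torch.randint(0, 16, (4, 4), device=device, dtype=torch.int32)
    out = mra.mm_to_sparse(a, b, idx)
    ref = mm_to_sparse_reference(a, b, idx)  # the reference defined above
    torch.testing.assert_close(out, ref, rtol=1e-4, atol=1e-4)
```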
build.toml ADDED
@@ -0,0 +1,20 @@
+ [general]
+ name = "mra"
+ universal = false
+
+ [torch]
+ src = [
+   "torch-ext/torch_binding.cpp",
+   "torch-ext/cuda_launch.h",
+ ]
+
+
+ [kernel.mra]
+ backend = "cuda"
+ depends = ["torch"]
+ src = [
+   "mra/cuda_kernel.cu",
+   "mra/cuda_kernel.h",
+   "mra/cuda_launch.cu",
+   "mra/cuda_launch.h",
+ ]
build/torch210-cu128-x86_64-windows/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch210-cu128-x86_64-windows/_mra_cuda_6ec000c.pyd ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa6a072526b11ba258ee3c95711b1582a501a40829c22bbd62b493730faee0ee
+ size 795648
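The three lines above are a Git LFS pointer, not the binary itself; the `.pyd` is resolved by its SHA-256 at checkout. For illustration, a tiny parser for this pointer format:

```python
def parse_lfs_pointer(text: str) -> dict:
    # Each pointer line is "key value", e.g. "size 795648".
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "oid": fields["oid"].removeprefix("sha256:"),
        "size": int(fields["size"]),
    }
```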
build/torch210-cu128-x86_64-windows/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_6ec000c
+ ops = torch.ops._mra_cuda_6ec000c
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_6ec000c::{op_name}"
build/torch210-cu128-x86_64-windows/metadata.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "10.0",
+       "10.1",
+       "12.0+PTX",
+       "7.0",
+       "7.2",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0"
+     ]
+   }
+ }
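The `archs` list records the compute capabilities this variant was compiled for; a `+PTX` suffix marks the entry that also embeds forward-compatible PTX. A hedged sketch of checking the running GPU against such a list:

```python
import json
from pathlib import Path

import torch

meta = json.loads(Path("build/torch210-cu128-x86_64-windows/metadata.json").read_text())
archs = {a.removesuffix("+PTX") for a in meta["backend"]["archs"]}
major, minor = torch.cuda.get_device_capability()
cap = f"{major}.{minor}"
print(cap, "has native SASS" if cap in archs else "would rely on PTX JIT")
```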
build/torch210-cu128-x86_64-windows/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import sys
+
+ import importlib
+ from pathlib import Path
+ from types import ModuleType
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
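An illustration of the loader's naming trick above: the `sys.modules` key is the hex-encoded, unsigned hash of the absolute file path, so identical `__init__.py` files shipped in different build variants get distinct module entries instead of clobbering one another.

```python
import ctypes
from pathlib import Path

def variant_module_name(path: str) -> str:
    # Same scheme as _import_from_path: unsigned hash of the absolute path, in hex.
    return "{:x}".format(ctypes.c_size_t(hash(Path(path).absolute())).value)

n1 = variant_module_name("build/torch210-cxx11-cu126-x86_64-linux/__init__.py")
n2 = variant_module_name("build/torch210-cxx11-cu128-x86_64-linux/__init__.py")
assert n1 != n2  # distinct paths yield distinct names (barring a hash collision)
```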
build/torch210-cxx11-cu126-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch210-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de75db12cb29ce706eba61ef07d7e74f00deea71749fdd8b7bf2d56bf7178105
+ size 2567952
build/torch210-cxx11-cu126-aarch64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_c1eaa2d
+ ops = torch.ops._mra_cuda_c1eaa2d
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_c1eaa2d::{op_name}"
build/torch210-cxx11-cu126-aarch64-linux/metadata.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "7.0",
+       "7.2",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0+PTX"
+     ]
+   }
+ }
build/torch210-cxx11-cu126-aarch64-linux/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import importlib.util
+ import sys
+ from pathlib import Path
+ from types import ModuleType
+
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu126-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch210-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7cc021351bfa4e923b15d186877cddf3d935d6223a369f40ffabb12507536e90
+ size 2451480
build/torch210-cxx11-cu126-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_c1eaa2d
+ ops = torch.ops._mra_cuda_c1eaa2d
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_c1eaa2d::{op_name}"
build/torch210-cxx11-cu126-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "7.0",
+       "7.2",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0+PTX"
+     ]
+   }
+ }
build/torch210-cxx11-cu126-x86_64-linux/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import importlib.util
+ import sys
+ from pathlib import Path
+ from types import ModuleType
+
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu128-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch210-cxx11-cu128-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5c94fe47bd01e60165517510cb90d9f8c1afa4b8092c7a7a25ef971c73a11f41
+ size 2830296
build/torch210-cxx11-cu128-aarch64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_c1eaa2d
+ ops = torch.ops._mra_cuda_c1eaa2d
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_c1eaa2d::{op_name}"
build/torch210-cxx11-cu128-aarch64-linux/metadata.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "10.0",
+       "10.1",
+       "12.0+PTX",
+       "7.0",
+       "7.2",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0"
+     ]
+   }
+ }
build/torch210-cxx11-cu128-aarch64-linux/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import importlib.util
+ import sys
+ from pathlib import Path
+ from types import ModuleType
+
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu128-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch210-cxx11-cu128-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b1ce65f7d848240c848986a70ec25bc6bf1bc53c3046df1461649630afb81f8
+ size 2719848
build/torch210-cxx11-cu128-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_c1eaa2d
+ ops = torch.ops._mra_cuda_c1eaa2d
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_c1eaa2d::{op_name}"
build/torch210-cxx11-cu128-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "10.0",
+       "10.1",
+       "12.0+PTX",
+       "7.0",
+       "7.2",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0"
+     ]
+   }
+ }
build/torch210-cxx11-cu128-x86_64-linux/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import importlib.util
+ import sys
+ from pathlib import Path
+ from types import ModuleType
+
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu130-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch210-cxx11-cu130-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1e26fb0737c8f8451d052d2514c36d64150212470214009acf0493b5862fe80
+ size 2767768
build/torch210-cxx11-cu130-aarch64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_c1eaa2d
+ ops = torch.ops._mra_cuda_c1eaa2d
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_c1eaa2d::{op_name}"
build/torch210-cxx11-cu130-aarch64-linux/metadata.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "10.0",
+       "11.0",
+       "12.0+PTX",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0"
+     ]
+   }
+ }
build/torch210-cxx11-cu130-aarch64-linux/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import importlib.util
+ import sys
+ from pathlib import Path
+ from types import ModuleType
+
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-cxx11-cu130-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch210-cxx11-cu130-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:26e6338feb8e2e4589397574e56ccf8b1e2761714e6ae0b5a474030b9e95f4f5
+ size 2641368
build/torch210-cxx11-cu130-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_c1eaa2d
+ ops = torch.ops._mra_cuda_c1eaa2d
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_c1eaa2d::{op_name}"
build/torch210-cxx11-cu130-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "10.0",
+       "11.0",
+       "12.0+PTX",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0"
+     ]
+   }
+ }
build/torch210-cxx11-cu130-x86_64-linux/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import importlib.util
+ import sys
+ from pathlib import Path
+ from types import ModuleType
+
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch211-cxx11-cu126-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch211-cxx11-cu126-aarch64-linux/_mra_cuda_c1eaa2d.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eb19769c43d841448daf6deb84ff8358cef905b1df26aed4d60bf38b1ab819e0
+ size 2567952
build/torch211-cxx11-cu126-aarch64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_c1eaa2d
+ ops = torch.ops._mra_cuda_c1eaa2d
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_c1eaa2d::{op_name}"
build/torch211-cxx11-cu126-aarch64-linux/metadata.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "7.0",
+       "7.2",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0+PTX"
+     ]
+   }
+ }
build/torch211-cxx11-cu126-aarch64-linux/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import importlib.util
+ import sys
+ from pathlib import Path
+ from types import ModuleType
+
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch211-cxx11-cu126-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]
build/torch211-cxx11-cu126-x86_64-linux/_mra_cuda_c1eaa2d.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5dd2ac9defcbaf5d03db15bc1bd55476e4520c3eb91b157a6f2488d37a16f011
+ size 2451480
build/torch211-cxx11-cu126-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ from . import _mra_cuda_c1eaa2d
+ ops = torch.ops._mra_cuda_c1eaa2d
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_mra_cuda_c1eaa2d::{op_name}"
build/torch211-cxx11-cu126-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "version": 1,
+   "python-depends": [],
+   "backend": {
+     "type": "cuda",
+     "archs": [
+       "7.0",
+       "7.2",
+       "7.5",
+       "8.0",
+       "8.6",
+       "8.7",
+       "8.9",
+       "9.0+PTX"
+     ]
+   }
+ }
build/torch211-cxx11-cu126-x86_64-linux/mra/__init__.py ADDED
@@ -0,0 +1,26 @@
+ import ctypes
+ import importlib.util
+ import sys
+ from pathlib import Path
+ from types import ModuleType
+
+
+ def _import_from_path(file_path: Path) -> ModuleType:
+     # We cannot use the module name as-is, after adding it to `sys.modules`,
+     # it would also be used for other imports. So, we make a module name that
+     # depends on the path for it to be unique using the hex-encoded hash of
+     # the path.
+     path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+     module_name = path_hash
+     spec = importlib.util.spec_from_file_location(module_name, file_path)
+     if spec is None:
+         raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+     module = importlib.util.module_from_spec(spec)
+     if module is None:
+         raise ImportError(f"Cannot load module {module_name} from spec")
+     sys.modules[module_name] = module
+     spec.loader.exec_module(module)  # type: ignore
+     return module
+
+
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch211-cxx11-cu128-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from ._ops import ops
+ import torch
+
+ def index_max(index_vals: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.index_max(index_vals, indices, A_num_block, B_num_block)
+
+ def mm_to_sparse(dense_A: torch.Tensor, dense_B: torch.Tensor, indices: torch.Tensor):
+     return ops.mm_to_sparse(dense_A, dense_B, indices)
+
+ def sparse_dense_mm(sparse_A: torch.Tensor, indices: torch.Tensor, dense_B: torch.Tensor, A_num_block: int):
+     return ops.sparse_dense_mm(sparse_A, indices, dense_B, A_num_block)
+
+ def reduce_sum(sparse_A: torch.Tensor, indices: torch.Tensor, A_num_block: int, B_num_block: int):
+     return ops.reduce_sum(sparse_A, indices, A_num_block, B_num_block)
+
+ def scatter(dense_A: torch.Tensor, indices: torch.Tensor, B_num_block: int):
+     return ops.scatter(dense_A, indices, B_num_block)
+
+ __all__ = [
+     "index_max",
+     "mm_to_sparse",
+     "sparse_dense_mm",
+     "reduce_sum",
+     "scatter",
+ ]