diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..fd52f86ed5c4530b974ed742ff1d8e4c148bd9aa
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,107 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+build/torch27-cxx11-cu118-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch27-cxx11-cu126-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch27-cxx11-cu128-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu126-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu128-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu129-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu126-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu130-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_yoso_ea085fb.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_yoso_ea085fb.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_yoso_ea085fb.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu126-x86_64-linux/_yoso_ea085fb.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu128-x86_64-linux/_yoso_ea085fb.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu129-x86_64-linux/_yoso_ea085fb.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu126-x86_64-linux/_yoso_ea085fb.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-x86_64-linux/_yoso_ea085fb.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu130-x86_64-linux/_yoso_ea085fb.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_yoso_fc1f972.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_yoso_fc1f972.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_yoso_fc1f972.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu126-x86_64-linux/_yoso_fc1f972.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu128-x86_64-linux/_yoso_fc1f972.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu129-x86_64-linux/_yoso_fc1f972.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu126-x86_64-linux/_yoso_fc1f972.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-x86_64-linux/_yoso_fc1f972.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu130-x86_64-linux/_yoso_fc1f972.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_yoso_c11288f.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_yoso_c11288f.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_yoso_c11288f.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu126-x86_64-linux/_yoso_c11288f.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu128-x86_64-linux/_yoso_c11288f.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu129-x86_64-linux/_yoso_c11288f.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu126-x86_64-linux/_yoso_c11288f.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-x86_64-linux/_yoso_c11288f.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu130-x86_64-linux/_yoso_c11288f.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_yoso_dd38bd3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_yoso_dd38bd3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_yoso_dd38bd3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu126-x86_64-linux/_yoso_dd38bd3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu128-x86_64-linux/_yoso_dd38bd3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch28-cxx11-cu129-x86_64-linux/_yoso_dd38bd3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu126-x86_64-linux/_yoso_dd38bd3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-x86_64-linux/_yoso_dd38bd3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu130-x86_64-linux/_yoso_dd38bd3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_yoso_cuda_c42e4a3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_yoso_cuda_c42e4a3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_yoso_cuda_c42e4a3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu126-x86_64-linux/_yoso_cuda_c42e4a3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-x86_64-linux/_yoso_cuda_c42e4a3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu130-x86_64-linux/_yoso_cuda_c42e4a3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-aarch64-linux/_yoso_cuda_c42e4a3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-aarch64-linux/_yoso_cuda_c42e4a3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-aarch64-linux/_yoso_cuda_c42e4a3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu126-aarch64-linux/_yoso_cuda_c42e4a3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu128-aarch64-linux/_yoso_cuda_c42e4a3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu130-aarch64-linux/_yoso_cuda_c42e4a3.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cu128-x86_64-windows/_yoso_cuda_400d834.pyd filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-aarch64-linux/_yoso_cuda_4218a9a.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-aarch64-linux/_yoso_cuda_4218a9a.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-aarch64-linux/_yoso_cuda_4218a9a.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu126-aarch64-linux/_yoso_cuda_4218a9a.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu128-aarch64-linux/_yoso_cuda_4218a9a.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-aarch64-linux/_yoso_cuda_4218a9a.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu129-aarch64-linux/_yoso_cuda_4218a9a.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_yoso_cuda_4218a9a.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_yoso_cuda_4218a9a.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_yoso_cuda_4218a9a.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu126-x86_64-linux/_yoso_cuda_4218a9a.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu128-x86_64-linux/_yoso_cuda_4218a9a.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-x86_64-linux/_yoso_cuda_4218a9a.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu129-x86_64-linux/_yoso_cuda_4218a9a.abi3.so filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cc3a6154dce7ee47dcddbeddd4b0e5c374cdde48
--- /dev/null
+++ b/README.md
@@ -0,0 +1,7 @@
+---
+tags:
+- kernels
+- cuda
+---
+### Performance
+
diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..27ee7ccfe9593be013b7e853a13c60b09887dad2
--- /dev/null
+++ b/benchmarks/benchmark.py
@@ -0,0 +1,268 @@
+import torch
+
+from kernels.benchmark import Benchmark
+
+
+def lsh_weighted_cumulation_reference(
+ query_mask: torch.Tensor,
+ query_hash_code: torch.Tensor,
+ query_weight: torch.Tensor,
+ key_mask: torch.Tensor,
+ key_hash_code: torch.Tensor,
+ key_weight: torch.Tensor,
+ value: torch.Tensor,
+ hashtable_capacity: int,
+) -> torch.Tensor:
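+    """Pure-PyTorch reference for LSH weighted cumulation.
+
+    For each batch b and weight column w, the key contributions
+    key_mask * key_weight[..., w] * value are scatter-added into one
+    hashtable per hash function, indexed by each key's hash code, and
+    then gathered back at each query's hash code. The gathered sums are
+    scaled by query_mask * query_weight[..., w] and averaged over the
+    num_hash_f hash functions. Hash codes are assumed to lie in
+    [0, hashtable_capacity).
+    """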
+ batch_size, num_query, num_hash_f = query_hash_code.shape
+ _, num_key, value_dim = value.shape
+ weight_dim = query_weight.shape[2]
+ device = value.device
+ dtype = value.dtype
+
+ output = torch.zeros(batch_size, num_query, value_dim, device=device, dtype=dtype)
+
+ for b in range(batch_size):
+ for weight_idx in range(weight_dim):
+ # Build hashtables for all hash functions
+ hashtables = torch.zeros(
+ num_hash_f, hashtable_capacity, value_dim, device=device, dtype=dtype
+ )
+
+ k_mask = key_mask[b, :].float() # [num_key]
+ k_weight_val = key_weight[b, :, weight_idx] # [num_key]
+
+ for h in range(num_hash_f):
+ k_hash = key_hash_code[b, :, h].long() # [num_key]
+ # Weighted values: [num_key, value_dim]
+ weighted_values = (
+ k_mask.unsqueeze(-1) * k_weight_val.unsqueeze(-1) * value[b]
+ )
+ k_hash_expanded = k_hash.unsqueeze(-1).expand(-1, value_dim)
+ hashtables[h].scatter_add_(0, k_hash_expanded, weighted_values)
+
+ # Query: sum over all hash functions
+ q_mask = query_mask[b, :].float() # [num_query]
+ q_weight_val = query_weight[b, :, weight_idx] # [num_query]
+
+ sum_val = torch.zeros(num_query, value_dim, device=device, dtype=dtype)
+ for h in range(num_hash_f):
+ q_hash = query_hash_code[b, :, h].long() # [num_query]
+ gathered = hashtables[h][q_hash] # [num_query, value_dim]
+ sum_val += gathered
+
+ # Apply query weight and divide by num_hash_f
+ output[b] += (
+ q_mask.unsqueeze(-1) * q_weight_val.unsqueeze(-1) * sum_val / num_hash_f
+ )
+
+ return output
+
+
+class YosoBenchmark(Benchmark):
+ seed: int = 42
+
+ def setup(self):
+ batch_size = 2
+ num_query = 128
+ num_key = 128
+ dim = 64
+ self.num_hash_f = 32
+ self.hash_code_len = 9
+ self.weight_dim = self.num_hash_f
+ self.value_dim = dim
+ self.hashtable_capacity = 1 << self.hash_code_len
+
+ self.query_mask = torch.ones(
+ batch_size, num_query, device=self.device, dtype=torch.int32
+ )
+ self.query_vector = torch.randn(
+ batch_size, num_query, dim, device=self.device, dtype=torch.float32
+ )
+ self.key_mask = torch.ones(
+ batch_size, num_key, device=self.device, dtype=torch.int32
+ )
+ self.key_vector = torch.randn(
+ batch_size, num_key, dim, device=self.device, dtype=torch.float32
+ )
+ self.value = torch.randn(
+ batch_size, num_key, self.value_dim, device=self.device, dtype=torch.float32
+ )
+ self.query_weight = torch.randn(
+ batch_size,
+ num_query,
+ self.weight_dim,
+ device=self.device,
+ dtype=torch.float32,
+ )
+ self.key_weight = torch.randn(
+ batch_size,
+ num_key,
+ self.weight_dim,
+ device=self.device,
+ dtype=torch.float32,
+ )
+
+ # Pre-compute hash codes for cumulation benchmarks
+ hash_result = self.kernel.fast_hash(
+ self.query_mask,
+ self.query_vector,
+ self.key_mask,
+ self.key_vector,
+ self.num_hash_f,
+ self.hash_code_len,
+ True,
+ 1,
+ )
+ self.query_hash_code = hash_result[0]
+ self.key_hash_code = hash_result[1]
+
+ self.out = torch.empty(
+ batch_size,
+ num_query,
+ self.value_dim,
+ device=self.device,
+ dtype=torch.float32,
+ )
+
+ def benchmark_base(self):
+ self.out = self.kernel.lsh_weighted_cumulation(
+ self.query_mask,
+ self.query_hash_code,
+ self.query_weight,
+ self.key_mask,
+ self.key_hash_code,
+ self.key_weight,
+ self.value,
+ self.hashtable_capacity,
+ True,
+ 1,
+ )
+
+ def verify_base(self) -> torch.Tensor:
+ return lsh_weighted_cumulation_reference(
+ self.query_mask,
+ self.query_hash_code,
+ self.query_weight,
+ self.key_mask,
+ self.key_hash_code,
+ self.key_weight,
+ self.value,
+ self.hashtable_capacity,
+ )
+
+ def setup_large(self):
+ batch_size = 4
+ num_query = 512
+ num_key = 512
+ dim = 128
+ self.num_hash_f = 32
+ self.hash_code_len = 9
+ self.weight_dim = self.num_hash_f
+ self.value_dim = dim
+ self.hashtable_capacity = 1 << self.hash_code_len
+
+ self.query_mask = torch.ones(
+ batch_size, num_query, device=self.device, dtype=torch.int32
+ )
+ self.query_vector = torch.randn(
+ batch_size, num_query, dim, device=self.device, dtype=torch.float32
+ )
+ self.key_mask = torch.ones(
+ batch_size, num_key, device=self.device, dtype=torch.int32
+ )
+ self.key_vector = torch.randn(
+ batch_size, num_key, dim, device=self.device, dtype=torch.float32
+ )
+ self.value = torch.randn(
+ batch_size, num_key, self.value_dim, device=self.device, dtype=torch.float32
+ )
+ self.query_weight = torch.randn(
+ batch_size,
+ num_query,
+ self.weight_dim,
+ device=self.device,
+ dtype=torch.float32,
+ )
+ self.key_weight = torch.randn(
+ batch_size,
+ num_key,
+ self.weight_dim,
+ device=self.device,
+ dtype=torch.float32,
+ )
+
+ hash_result = self.kernel.fast_hash(
+ self.query_mask,
+ self.query_vector,
+ self.key_mask,
+ self.key_vector,
+ self.num_hash_f,
+ self.hash_code_len,
+ True,
+ 1,
+ )
+ self.query_hash_code = hash_result[0]
+ self.key_hash_code = hash_result[1]
+
+ self.out = torch.empty(
+ batch_size,
+ num_query,
+ self.value_dim,
+ device=self.device,
+ dtype=torch.float32,
+ )
+
+ def benchmark_large(self):
+ self.out = self.kernel.lsh_weighted_cumulation(
+ self.query_mask,
+ self.query_hash_code,
+ self.query_weight,
+ self.key_mask,
+ self.key_hash_code,
+ self.key_weight,
+ self.value,
+ self.hashtable_capacity,
+ True,
+ 1,
+ )
+
+ def verify_large(self) -> torch.Tensor:
+ return lsh_weighted_cumulation_reference(
+ self.query_mask,
+ self.query_hash_code,
+ self.query_weight,
+ self.key_mask,
+ self.key_hash_code,
+ self.key_weight,
+ self.value,
+ self.hashtable_capacity,
+ )
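+
+
+# Minimal smoke test for the reference implementation above. This is a
+# sketch, not part of the benchmark harness: it runs on CPU with tiny
+# random inputs, and the shapes and names below are illustrative only.
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    batch, n_q, n_k, n_hash, w_dim, v_dim, capacity = 1, 4, 4, 2, 2, 3, 8
+    out = lsh_weighted_cumulation_reference(
+        torch.ones(batch, n_q, dtype=torch.int32),
+        torch.randint(0, capacity, (batch, n_q, n_hash)),
+        torch.randn(batch, n_q, w_dim),
+        torch.ones(batch, n_k, dtype=torch.int32),
+        torch.randint(0, capacity, (batch, n_k, n_hash)),
+        torch.randn(batch, n_k, w_dim),
+        torch.randn(batch, n_k, v_dim),
+        capacity,
+    )
+    print(out.shape)  # torch.Size([1, 4, 3])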
diff --git a/build.toml b/build.toml
new file mode 100644
index 0000000000000000000000000000000000000000..9e343f57edc79fac7030dfd03ec71bb937490da2
--- /dev/null
+++ b/build.toml
@@ -0,0 +1,33 @@
+[general]
+name = "yoso"
+universal = false
+
+[torch]
+src = [
+ "torch-ext/torch_binding.cpp",
+ "torch-ext/torch_binding.h",
+]
+
+[kernel.yoso]
+depends = ["torch"]
+backend = "cuda"
+cuda-capabilities = [
+ "8.0",
+ "8.9",
+ "9.0",
+ "10.0",
+ "12.0",
+]
+include = ["."]
+src = [
+ "yoso/fast_lsh_cumulation_cuda.cu",
+ "yoso/fast_lsh_cumulation_cuda.h",
+ "yoso/fast_lsh_cumulation_torch.cpp",
+ "yoso/fast_lsh_cumulation.h",
+ "yoso/fast_lsh_cumulation.cu",
+ "yoso/common_cuda.h",
+ "yoso/common.h",
+ "yoso/common_cuda_device.h",
+]
+
diff --git a/build/torch210-cu128-x86_64-windows/__init__.py b/build/torch210-cu128-x86_64-windows/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..824563bb41deb6a540a5813c273d3d609d41c001
--- /dev/null
+++ b/build/torch210-cu128-x86_64-windows/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch210-cu128-x86_64-windows/_ops.py b/build/torch210-cu128-x86_64-windows/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca23ceb1a293ff4dc94b434c80d53bbf4d461519
--- /dev/null
+++ b/build/torch210-cu128-x86_64-windows/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_400d834
+ops = torch.ops._yoso_cuda_400d834
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_400d834::{op_name}"
diff --git a/build/torch210-cu128-x86_64-windows/_yoso_cuda_400d834.pyd b/build/torch210-cu128-x86_64-windows/_yoso_cuda_400d834.pyd
new file mode 100644
index 0000000000000000000000000000000000000000..a4d26e59520f464261154f5b8ed9da2c75b02415
--- /dev/null
+++ b/build/torch210-cu128-x86_64-windows/_yoso_cuda_400d834.pyd
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:698d4c7581229298f15ebcdcbf056ef8331d7e50fc3d510a3b1a9a6a9fdf22ca
+size 1112576
diff --git a/build/torch210-cu128-x86_64-windows/metadata.json b/build/torch210-cu128-x86_64-windows/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..57f200d55c245a3d8df7e45f27810598b0d28011
--- /dev/null
+++ b/build/torch210-cu128-x86_64-windows/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cu128-x86_64-windows/yoso/__init__.py b/build/torch210-cu128-x86_64-windows/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc434ef44e63409acb52a8f3fff54a4adc46ed6a
--- /dev/null
+++ b/build/torch210-cu128-x86_64-windows/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_ops.py b/build/torch210-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d85a3d44eb0827816e9c75b218f92ec64b9bf0e0
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_4218a9a
+ops = torch.ops._yoso_cuda_4218a9a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_4218a9a::{op_name}"
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_yoso_cuda_4218a9a.abi3.so b/build/torch210-cxx11-cu126-aarch64-linux/_yoso_cuda_4218a9a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..a384ede3a144e469263e547e2aa59954b6f36056
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_yoso_cuda_4218a9a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2bb09bdc0593bb01cea62105e034943212b28cee89082a44c7412adc87943215
+size 2569064
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/metadata.json b/build/torch210-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f1f230ee52961bfaab0e071495b170786f009a9
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,12 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/yoso/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_ops.py b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d85a3d44eb0827816e9c75b218f92ec64b9bf0e0
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_4218a9a
+ops = torch.ops._yoso_cuda_4218a9a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_4218a9a::{op_name}"
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_yoso_cuda_4218a9a.abi3.so b/build/torch210-cxx11-cu126-x86_64-linux/_yoso_cuda_4218a9a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..9f15c1713692489dd5de21f384f75101cbd57f5f
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_yoso_cuda_4218a9a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02ce77a1a0e640f167c2542c3902706d15891b05c813b38d4435cf12088e4771
+size 2498248
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/metadata.json b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f1f230ee52961bfaab0e071495b170786f009a9
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,12 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/yoso/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_ops.py b/build/torch210-cxx11-cu128-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d85a3d44eb0827816e9c75b218f92ec64b9bf0e0
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_4218a9a
+ops = torch.ops._yoso_cuda_4218a9a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_4218a9a::{op_name}"
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_yoso_cuda_4218a9a.abi3.so b/build/torch210-cxx11-cu128-aarch64-linux/_yoso_cuda_4218a9a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..119dd3f1aef46cead798a4453dfdcb6ea1801bc7
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_yoso_cuda_4218a9a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91b0d7b3accfb6b76ab9dfbe11942158cf8648fd3ad6f7915b15eb3dc2add36f
+size 3159096
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/metadata.json b/build/torch210-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..20d6fa98528e8834b54b6233c7bc3038ee4b1a15
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/yoso/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_ops.py b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d85a3d44eb0827816e9c75b218f92ec64b9bf0e0
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_4218a9a
+ops = torch.ops._yoso_cuda_4218a9a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_4218a9a::{op_name}"
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_yoso_cuda_4218a9a.abi3.so b/build/torch210-cxx11-cu128-x86_64-linux/_yoso_cuda_4218a9a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..74e114f20724f6011098bad6ad2834fefdf8236d
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_yoso_cuda_4218a9a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65e95d52be57053b7f09b0880484c5debf4e542ac257cb3683105305df39d839
+size 3040824
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/metadata.json b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..20d6fa98528e8834b54b6233c7bc3038ee4b1a15
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/yoso/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_ops.py b/build/torch210-cxx11-cu130-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d85a3d44eb0827816e9c75b218f92ec64b9bf0e0
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_4218a9a
+ops = torch.ops._yoso_cuda_4218a9a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_4218a9a::{op_name}"
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_yoso_cuda_4218a9a.abi3.so b/build/torch210-cxx11-cu130-aarch64-linux/_yoso_cuda_4218a9a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..e730b67bbb6d707a1286be21740b2ce041c3ca01
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_yoso_cuda_4218a9a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18a1cbf153beac65c36a3fcf2fdb16d6c22e8e962ebb3e4b91e7c3a927c98315
+size 3230640
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/metadata.json b/build/torch210-cxx11-cu130-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..20d6fa98528e8834b54b6233c7bc3038ee4b1a15
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/yoso/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_ops.py b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d85a3d44eb0827816e9c75b218f92ec64b9bf0e0
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_4218a9a
+ops = torch.ops._yoso_cuda_4218a9a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_4218a9a::{op_name}"
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_yoso_cuda_4218a9a.abi3.so b/build/torch210-cxx11-cu130-x86_64-linux/_yoso_cuda_4218a9a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..e73f9f7e60c89fcf14eb10dd281a5a112aa3a58b
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_yoso_cuda_4218a9a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a4d07f13664724f51505edd45c4656f658f5d3ed8240f70d896214a8f15cac1
+size 3102168
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/metadata.json b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..20d6fa98528e8834b54b6233c7bc3038ee4b1a15
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/yoso/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_ops.py b/build/torch211-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d85a3d44eb0827816e9c75b218f92ec64b9bf0e0
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_4218a9a
+ops = torch.ops._yoso_cuda_4218a9a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_4218a9a::{op_name}"
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_yoso_cuda_4218a9a.abi3.so b/build/torch211-cxx11-cu126-aarch64-linux/_yoso_cuda_4218a9a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..a948185ab40245de527ede6910415b89afe90e30
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_yoso_cuda_4218a9a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17de742590f0b9c20b51ad23d0fdc7a8d2c6b4d8c5238aeadce31631ac1d8c12
+size 2569064
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/metadata.json b/build/torch211-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f1f230ee52961bfaab0e071495b170786f009a9
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,12 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/yoso/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_ops.py b/build/torch211-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d85a3d44eb0827816e9c75b218f92ec64b9bf0e0
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_4218a9a
+ops = torch.ops._yoso_cuda_4218a9a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_4218a9a::{op_name}"
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_yoso_cuda_4218a9a.abi3.so b/build/torch211-cxx11-cu126-x86_64-linux/_yoso_cuda_4218a9a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..b69215793ddbd58275fd05036b15774e7091af9b
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_yoso_cuda_4218a9a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1567356a387001e1b7c487e2eb191443b9bb46f9e593455609215970c6d96f60
+size 2498248
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/metadata.json b/build/torch211-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f1f230ee52961bfaab0e071495b170786f009a9
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,12 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/yoso/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_ops.py b/build/torch211-cxx11-cu128-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d85a3d44eb0827816e9c75b218f92ec64b9bf0e0
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_4218a9a
+ops = torch.ops._yoso_cuda_4218a9a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_4218a9a::{op_name}"
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_yoso_cuda_4218a9a.abi3.so b/build/torch211-cxx11-cu128-aarch64-linux/_yoso_cuda_4218a9a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..e19f554ed4d6d206a1b147214b2bd63ef77f074c
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_yoso_cuda_4218a9a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4ec369a71705f351bfe2a82729a3c0051c6dc5a89b15fc43a884fd84fed471e
+size 3159096
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/metadata.json b/build/torch211-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..20d6fa98528e8834b54b6233c7bc3038ee4b1a15
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/yoso/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_ops.py b/build/torch211-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d85a3d44eb0827816e9c75b218f92ec64b9bf0e0
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_4218a9a
+ops = torch.ops._yoso_cuda_4218a9a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_4218a9a::{op_name}"
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_yoso_cuda_4218a9a.abi3.so b/build/torch211-cxx11-cu128-x86_64-linux/_yoso_cuda_4218a9a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..85abcdbb9248173170b2894158d8b4a249515a0d
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/_yoso_cuda_4218a9a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2dc6a83288555819f366d42014810ef3f06853e88fd909f34a11810309822f9
+size 3040824
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/metadata.json b/build/torch211-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..20d6fa98528e8834b54b6233c7bc3038ee4b1a15
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu128-x86_64-linux/yoso/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu128-x86_64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_ops.py b/build/torch211-cxx11-cu130-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d85a3d44eb0827816e9c75b218f92ec64b9bf0e0
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_4218a9a
+ops = torch.ops._yoso_cuda_4218a9a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_4218a9a::{op_name}"
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_yoso_cuda_4218a9a.abi3.so b/build/torch211-cxx11-cu130-aarch64-linux/_yoso_cuda_4218a9a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..280c353d7f894f4edb85fc42638d0df23f0d3566
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/_yoso_cuda_4218a9a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5a1a7d10fc43d48db0af70da0c35f63ea174b5b649858b575eb77f976870ffa
+size 3230640
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/metadata.json b/build/torch211-cxx11-cu130-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..20d6fa98528e8834b54b6233c7bc3038ee4b1a15
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu130-aarch64-linux/yoso/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu130-aarch64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we make a module name that
+ # depends on the path for it to be unique using the hex-encoded hash of
+ # the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_ops.py b/build/torch211-cxx11-cu130-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d85a3d44eb0827816e9c75b218f92ec64b9bf0e0
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_4218a9a
+ops = torch.ops._yoso_cuda_4218a9a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_4218a9a::{op_name}"
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_yoso_cuda_4218a9a.abi3.so b/build/torch211-cxx11-cu130-x86_64-linux/_yoso_cuda_4218a9a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..a38a74a85ab75fdddb9fa9e3537c1507dc524469
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/_yoso_cuda_4218a9a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a6483bd3b77aedd6b27d70b07481398a43d1f384aad89788bf17bd575ba33f6
+size 3102168
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/metadata.json b/build/torch211-cxx11-cu130-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..20d6fa98528e8834b54b6233c7bc3038ee4b1a15
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch211-cxx11-cu130-x86_64-linux/yoso/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu130-x86_64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we derive a unique module
+ # name from the hex-encoded hash of the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/yoso/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/yoso/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/yoso/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/yoso/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b53d98292d520c36fefdf5d9d9b9f7364aa6238
Binary files /dev/null and b/build/torch27-cxx11-cu118-x86_64-linux/yoso/__pycache__/__init__.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/yoso/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/yoso/__pycache__/_ops.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..55e87018d3df7b91170be0d11fd558df37e2fa0d
Binary files /dev/null and b/build/torch27-cxx11-cu118-x86_64-linux/yoso/__pycache__/_ops.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/yoso/_ops.py b/build/torch27-cxx11-cu118-x86_64-linux/yoso/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..31d4d73ddbe7bbf56bfa857eabc15341bc736cf5
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/yoso/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_3173620_dirty
+ops = torch.ops._yoso_3173620_dirty
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_3173620_dirty::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu118-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..7b37e39a83702b6f6958ea5b50a32c5308faa2a9
--- /dev/null
+++ b/build/torch27-cxx11-cu118-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a993d93b5c46e4056a29a9c8a72b5499f01d6c852f6a8ee3662716dcf0bfe08a
+size 2465968
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/yoso/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/yoso/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/yoso/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/yoso/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d310693a241d4ef91971482dc7a397ee6ea623be
Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/yoso/__pycache__/__init__.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/yoso/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/yoso/__pycache__/_ops.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86acc79b5ebbe4c27725915220edb5301f3c8c80
Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/yoso/__pycache__/_ops.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/yoso/_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/yoso/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..31d4d73ddbe7bbf56bfa857eabc15341bc736cf5
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/yoso/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_3173620_dirty
+ops = torch.ops._yoso_3173620_dirty
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_3173620_dirty::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu126-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..99fa34c0130ae8e4330528c71c5b52baf5976cea
--- /dev/null
+++ b/build/torch27-cxx11-cu126-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:557df99aef151fadf862a138f84943dd93cb898211a527787668f191e63dc107
+size 2494960
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/yoso/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/yoso/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/yoso/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/yoso/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5fd1746167ad7ba532596099530d204e0eda380d
Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/yoso/__pycache__/__init__.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/yoso/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/yoso/__pycache__/_ops.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6fe303c148bcebe1ac6c69c0257bd4bc83c00af0
Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/yoso/__pycache__/_ops.cpython-313.pyc differ
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/yoso/_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/yoso/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..31d4d73ddbe7bbf56bfa857eabc15341bc736cf5
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/yoso/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_3173620_dirty
+ops = torch.ops._yoso_3173620_dirty
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_3173620_dirty::{op_name}"
\ No newline at end of file
diff --git a/build/torch27-cxx11-cu128-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..9447ed6037ad8b978dfeb2eced36bccc99cf70f8
--- /dev/null
+++ b/build/torch27-cxx11-cu128-x86_64-linux/yoso/_yoso_3173620_dirty.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d316cd26d90f963527aa7a68e2340545635c2201bd69c781a1040fb5ea6aa54b
+size 3045736
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_ops.py b/build/torch28-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..bed6afcf018349b53b90799dcb6d1d60cc5b0b70
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_dd38bd3
+ops = torch.ops._yoso_dd38bd3
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_dd38bd3::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_yoso_dd38bd3.abi3.so b/build/torch28-cxx11-cu126-x86_64-linux/_yoso_dd38bd3.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..bbb1097e224ecfdc0e7efcaa0fdedf942985f8a4
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/_yoso_dd38bd3.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:283e4a04bda6c2b48edf4730f533a2f7fdf4fb94a29421a4d6fe8e44fcbf2c9a
+size 2504360
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/metadata.json b/build/torch28-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,4 @@
+{
+ "version": 1,
+ "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu126-x86_64-linux/yoso/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch28-cxx11-cu126-x86_64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we derive a unique module
+ # name from the hex-encoded hash of the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_ops.py b/build/torch28-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..bed6afcf018349b53b90799dcb6d1d60cc5b0b70
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_dd38bd3
+ops = torch.ops._yoso_dd38bd3
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_dd38bd3::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_yoso_dd38bd3.abi3.so b/build/torch28-cxx11-cu128-x86_64-linux/_yoso_dd38bd3.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..ffd30824b1b023a03710b6db149823f1547208c8
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/_yoso_dd38bd3.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4140e8f7142051a471e8a604602cc36da47fc737afcb902364aa18e553a63275
+size 3051032
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/metadata.json b/build/torch28-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,4 @@
+{
+ "version": 1,
+ "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu128-x86_64-linux/yoso/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch28-cxx11-cu128-x86_64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we derive a unique module
+ # name from the hex-encoded hash of the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_ops.py b/build/torch28-cxx11-cu129-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..bed6afcf018349b53b90799dcb6d1d60cc5b0b70
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_dd38bd3
+ops = torch.ops._yoso_dd38bd3
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_dd38bd3::{op_name}"
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_yoso_dd38bd3.abi3.so b/build/torch28-cxx11-cu129-x86_64-linux/_yoso_dd38bd3.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..33c7edacbcd240f5fba3b3dbfae15a85d05b0148
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/_yoso_dd38bd3.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:99102294e1a67b3ffabb45d37104584fa8c43e8312f17dc4412b19ed98975796
+size 3060240
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/metadata.json b/build/torch28-cxx11-cu129-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/metadata.json
@@ -0,0 +1,4 @@
+{
+ "version": 1,
+ "python-depends": []
+}
\ No newline at end of file
diff --git a/build/torch28-cxx11-cu129-x86_64-linux/yoso/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch28-cxx11-cu129-x86_64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we derive a unique module
+ # name from the hex-encoded hash of the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_ops.py b/build/torch29-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6645737e583ec43ee356a93f1e79545f61bdb9a
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_c42e4a3
+ops = torch.ops._yoso_cuda_c42e4a3
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_c42e4a3::{op_name}"
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/_yoso_cuda_c42e4a3.abi3.so b/build/torch29-cxx11-cu126-aarch64-linux/_yoso_cuda_c42e4a3.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..1c98430126bb4be0552681949de3fad9fa046788
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/_yoso_cuda_c42e4a3.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6606f0bc558627148daf479226a35eabb3809d190db90f6234a8dcfd9d8b9f77
+size 2632008
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/metadata.json b/build/torch29-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f1f230ee52961bfaab0e071495b170786f009a9
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,12 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu126-aarch64-linux/yoso/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu126-aarch64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we derive a unique module
+ # name from the hex-encoded hash of the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_ops.py b/build/torch29-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6645737e583ec43ee356a93f1e79545f61bdb9a
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_c42e4a3
+ops = torch.ops._yoso_cuda_c42e4a3
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_c42e4a3::{op_name}"
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_yoso_cuda_c42e4a3.abi3.so b/build/torch29-cxx11-cu126-x86_64-linux/_yoso_cuda_c42e4a3.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..ea7fc2f2e52e2fbab5a5bafde927b7907870242e
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/_yoso_cuda_c42e4a3.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3218cf0792bcd3ee132ebc36b67ca006c30c5dcd01813fbee47b15edd18a2fe5
+size 2504344
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/metadata.json b/build/torch29-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f1f230ee52961bfaab0e071495b170786f009a9
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,12 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu126-x86_64-linux/yoso/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu126-x86_64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we derive a unique module
+ # name from the hex-encoded hash of the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_ops.py b/build/torch29-cxx11-cu128-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6645737e583ec43ee356a93f1e79545f61bdb9a
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_c42e4a3
+ops = torch.ops._yoso_cuda_c42e4a3
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_c42e4a3::{op_name}"
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/_yoso_cuda_c42e4a3.abi3.so b/build/torch29-cxx11-cu128-aarch64-linux/_yoso_cuda_c42e4a3.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..84480ae3136479e75896cc4ac654199c66a879d9
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/_yoso_cuda_c42e4a3.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eda10b449abe469d3f1c01a6ac7159b4ca1abddf19b302308cba2849b10ab33e
+size 3156424
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/metadata.json b/build/torch29-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..20d6fa98528e8834b54b6233c7bc3038ee4b1a15
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/yoso/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we derive a unique module
+ # name from the hex-encoded hash of the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_ops.py b/build/torch29-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6645737e583ec43ee356a93f1e79545f61bdb9a
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_c42e4a3
+ops = torch.ops._yoso_cuda_c42e4a3
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_c42e4a3::{op_name}"
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_yoso_cuda_c42e4a3.abi3.so b/build/torch29-cxx11-cu128-x86_64-linux/_yoso_cuda_c42e4a3.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..31ab45dcf941b1b5b8b6edd42ca7252bdbc0f89b
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/_yoso_cuda_c42e4a3.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca3fed62c3a6e222f7bd1f3cb35413ac84357c6e8b7e64eb3382a135d40f1cce
+size 3051016
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/metadata.json b/build/torch29-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..20d6fa98528e8834b54b6233c7bc3038ee4b1a15
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu128-x86_64-linux/yoso/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu128-x86_64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we derive a unique module
+ # name from the hex-encoded hash of the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_ops.py b/build/torch29-cxx11-cu129-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d85a3d44eb0827816e9c75b218f92ec64b9bf0e0
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_4218a9a
+ops = torch.ops._yoso_cuda_4218a9a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_4218a9a::{op_name}"
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_yoso_cuda_4218a9a.abi3.so b/build/torch29-cxx11-cu129-aarch64-linux/_yoso_cuda_4218a9a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..9c01bb52c4b8aed04472e05d1e36131cd3901992
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/_yoso_cuda_4218a9a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cdef02f3da39ee5c04897a6c86c75ca947b5d193bcd126ecc71a3d2c51a08f85
+size 3157848
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/metadata.json b/build/torch29-cxx11-cu129-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..20d6fa98528e8834b54b6233c7bc3038ee4b1a15
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu129-aarch64-linux/yoso/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch29-cxx11-cu129-aarch64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we derive a unique module
+ # name from the hex-encoded hash of the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_ops.py b/build/torch29-cxx11-cu129-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d85a3d44eb0827816e9c75b218f92ec64b9bf0e0
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_4218a9a
+ops = torch.ops._yoso_cuda_4218a9a
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_4218a9a::{op_name}"
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_yoso_cuda_4218a9a.abi3.so b/build/torch29-cxx11-cu129-x86_64-linux/_yoso_cuda_4218a9a.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..dcd9803f7dc11b9d715ebb64c77cb657a396456f
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/_yoso_cuda_4218a9a.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7eefc22e760058e5ffc68c84ad7d6b0ceef825ef8ee251ec76a367f582e8a688
+size 3060224
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/metadata.json b/build/torch29-cxx11-cu129-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..20d6fa98528e8834b54b6233c7bc3038ee4b1a15
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu129-x86_64-linux/yoso/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch29-cxx11-cu129-x86_64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we derive a unique module
+ # name from the hex-encoded hash of the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_ops.py b/build/torch29-cxx11-cu130-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6645737e583ec43ee356a93f1e79545f61bdb9a
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_c42e4a3
+ops = torch.ops._yoso_cuda_c42e4a3
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_c42e4a3::{op_name}"
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/_yoso_cuda_c42e4a3.abi3.so b/build/torch29-cxx11-cu130-aarch64-linux/_yoso_cuda_c42e4a3.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..79b405db4cdf7a229b72d84c4d201f59947320f6
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/_yoso_cuda_c42e4a3.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c80457bb97fa77304a20b3c6ed1564c929b500be9f51e755a3982358630cf4aa
+size 3227968
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/metadata.json b/build/torch29-cxx11-cu130-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..20d6fa98528e8834b54b6233c7bc3038ee4b1a15
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu130-aarch64-linux/yoso/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu130-aarch64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we derive a unique module
+ # name from the hex-encoded hash of the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_ops.py b/build/torch29-cxx11-cu130-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6645737e583ec43ee356a93f1e79545f61bdb9a
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _yoso_cuda_c42e4a3
+ops = torch.ops._yoso_cuda_c42e4a3
+
+def add_op_namespace_prefix(op_name: str):
+ """
+ Prefix op by namespace.
+ """
+ return f"_yoso_cuda_c42e4a3::{op_name}"
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/_yoso_cuda_c42e4a3.abi3.so b/build/torch29-cxx11-cu130-x86_64-linux/_yoso_cuda_c42e4a3.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..655249ace76549a73c6c714f94e81a8f5ffff901
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/_yoso_cuda_c42e4a3.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:540d82d8ef48e433133098bffb6d6afd8c210f2a1b76ab79721f47558a0b450a
+size 3104168
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/metadata.json b/build/torch29-cxx11-cu130-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..20d6fa98528e8834b54b6233c7bc3038ee4b1a15
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/metadata.json
@@ -0,0 +1,14 @@
+{
+ "version": 1,
+ "python-depends": [],
+ "backend": {
+ "type": "cuda",
+ "archs": [
+ "10.0",
+ "12.0",
+ "8.0",
+ "8.9",
+ "9.0"
+ ]
+ }
+}
diff --git a/build/torch29-cxx11-cu130-x86_64-linux/yoso/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309
--- /dev/null
+++ b/build/torch29-cxx11-cu130-x86_64-linux/yoso/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import sys
+
+import importlib.util
+from pathlib import Path
+from types import ModuleType
+
+def _import_from_path(file_path: Path) -> ModuleType:
+ # We cannot use the module name as-is: after adding it to `sys.modules`,
+ # it would also be used for other imports. So, we derive a unique module
+ # name from the hex-encoded hash of the path.
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+ module_name = path_hash
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
+ if spec is None:
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+ module = importlib.util.module_from_spec(spec)
+ if module is None:
+ raise ImportError(f"Cannot load module {module_name} from spec")
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module) # type: ignore
+ return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 0000000000000000000000000000000000000000..85b5d60a855bf4c19555cc9b8de8ca88d6fd3ae9
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,168 @@
+{
+ "nodes": {
+ "flake-compat": {
+ "locked": {
+ "lastModified": 1747046372,
+ "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
+ "owner": "edolstra",
+ "repo": "flake-compat",
+ "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
+ "type": "github"
+ },
+ "original": {
+ "owner": "edolstra",
+ "repo": "flake-compat",
+ "type": "github"
+ }
+ },
+ "flake-compat_2": {
+ "locked": {
+ "lastModified": 1747046372,
+ "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
+ "owner": "edolstra",
+ "repo": "flake-compat",
+ "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
+ "type": "github"
+ },
+ "original": {
+ "owner": "edolstra",
+ "repo": "flake-compat",
+ "type": "github"
+ }
+ },
+ "flake-utils": {
+ "inputs": {
+ "systems": "systems"
+ },
+ "locked": {
+ "lastModified": 1731533236,
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+ "type": "github"
+ },
+ "original": {
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "type": "github"
+ }
+ },
+ "flake-utils_2": {
+ "inputs": {
+ "systems": "systems_2"
+ },
+ "locked": {
+ "lastModified": 1731533236,
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+ "type": "github"
+ },
+ "original": {
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "type": "github"
+ }
+ },
+ "hf-nix": {
+ "inputs": {
+ "flake-compat": "flake-compat_2",
+ "flake-utils": "flake-utils_2",
+ "nixpkgs": "nixpkgs"
+ },
+ "locked": {
+ "lastModified": 1759851564,
+ "narHash": "sha256-Xybkhm0FM/VzlZ5WndTYq/X/9MAeddd4EQ2Vz8GdkOA=",
+ "owner": "huggingface",
+ "repo": "hf-nix",
+ "rev": "351655d9f124805ed7c1193aa61550ce245f4570",
+ "type": "github"
+ },
+ "original": {
+ "owner": "huggingface",
+ "repo": "hf-nix",
+ "type": "github"
+ }
+ },
+ "kernel-builder": {
+ "inputs": {
+ "flake-compat": "flake-compat",
+ "flake-utils": "flake-utils",
+ "hf-nix": "hf-nix",
+ "nixpkgs": [
+ "kernel-builder",
+ "hf-nix",
+ "nixpkgs"
+ ]
+ },
+ "locked": {
+ "lastModified": 1760035358,
+ "narHash": "sha256-N5vmCrgwcIluPclf/hmnofLK77EJJYh5PR8SRvw++es=",
+ "owner": "huggingface",
+ "repo": "kernel-builder",
+ "rev": "a48cbd19ae7e425dfc1865188ef06dac43ab9244",
+ "type": "github"
+ },
+ "original": {
+ "owner": "huggingface",
+ "repo": "kernel-builder",
+ "type": "github"
+ }
+ },
+ "nixpkgs": {
+ "locked": {
+ "lastModified": 1755963616,
+ "narHash": "sha256-6yD0ww/S8n+U2uPYcJZ3DRURP8Kx036GRpR2uPNZroE=",
+ "owner": "nixos",
+ "repo": "nixpkgs",
+ "rev": "73e96df7cff5783f45e21342a75a1540c4eddce4",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nixos",
+ "ref": "nixos-unstable-small",
+ "repo": "nixpkgs",
+ "type": "github"
+ }
+ },
+ "root": {
+ "inputs": {
+ "kernel-builder": "kernel-builder"
+ }
+ },
+ "systems": {
+ "locked": {
+ "lastModified": 1681028828,
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+ "owner": "nix-systems",
+ "repo": "default",
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nix-systems",
+ "repo": "default",
+ "type": "github"
+ }
+ },
+ "systems_2": {
+ "locked": {
+ "lastModified": 1681028828,
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+ "owner": "nix-systems",
+ "repo": "default",
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nix-systems",
+ "repo": "default",
+ "type": "github"
+ }
+ }
+ },
+ "root": "root",
+ "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 0000000000000000000000000000000000000000..79fe7dccf689ad010c579fd88910f0425c6cc341
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,17 @@
+{
+ description = "Flake for yoso kernels";
+
+ inputs = {
+ kernel-builder.url = "github:huggingface/kernel-builder/";
+ };
+
+ outputs =
+ {
+ self,
+ kernel-builder,
+ }:
+ kernel-builder.lib.genFlakeOutputs {
+ path = ./.;
+ rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
+ };
+}
diff --git a/media/benches_dark_animation.svg b/media/benches_dark_animation.svg
new file mode 100644
index 0000000000000000000000000000000000000000..0479109ef499c1455b74fb4e0f676950a067d916
--- /dev/null
+++ b/media/benches_dark_animation.svg
@@ -0,0 +1,33 @@
diff --git a/media/benches_dark_latency.svg b/media/benches_dark_latency.svg
new file mode 100644
index 0000000000000000000000000000000000000000..e08f1994db4bd5271c932e9f086b8b05313e1a7e
--- /dev/null
+++ b/media/benches_dark_latency.svg
@@ -0,0 +1,1940 @@
diff --git a/media/benches_dark_throughput.svg b/media/benches_dark_throughput.svg
new file mode 100644
index 0000000000000000000000000000000000000000..fa2c1b3ef7944baaed930b63646d61a2c0e1cec7
--- /dev/null
+++ b/media/benches_dark_throughput.svg
@@ -0,0 +1,2018 @@
diff --git a/media/benches_light_animation.svg b/media/benches_light_animation.svg
new file mode 100644
index 0000000000000000000000000000000000000000..86c0e541192531b68438531cd976d6effdd8f3a8
--- /dev/null
+++ b/media/benches_light_animation.svg
@@ -0,0 +1,33 @@
diff --git a/media/benches_light_latency.svg b/media/benches_light_latency.svg
new file mode 100644
index 0000000000000000000000000000000000000000..57defab8bb09b8504dbd27bd4735f4bd87a97fb0
--- /dev/null
+++ b/media/benches_light_latency.svg
@@ -0,0 +1,1940 @@
diff --git a/media/benches_light_throughput.svg b/media/benches_light_throughput.svg
new file mode 100644
index 0000000000000000000000000000000000000000..bbcc276059e4135822b15a247807986561b23a28
--- /dev/null
+++ b/media/benches_light_throughput.svg
@@ -0,0 +1,2018 @@
diff --git a/torch-ext/torch_binding.cpp b/torch-ext/torch_binding.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bf8ae100ef95d2cf782a2528c45e873de8d00e38
--- /dev/null
+++ b/torch-ext/torch_binding.cpp
@@ -0,0 +1,87 @@
+#include <torch/all.h>
+#include <torch/library.h>
+
+#include "registration.h"
+#include "torch_binding.h"
+
+at::Tensor lsh_cumulation_wrapper(
+ at::Tensor query_mask, // [batch_size, num_query]
+ at::Tensor query_hash_code, // [batch_size, num_query, num_hash_f]
+ at::Tensor key_mask, // [batch_size, num_key]
+ at::Tensor key_hash_code, // [batch_size, num_key, num_hash_f]
+ at::Tensor value, // [batch_size, num_key, value_dim]
+ int64_t hashtable_capacity,
+ bool use_cuda,
+ int64_t version
+) {
+ return lsh_cumulation(
+ query_mask,
+ query_hash_code,
+ key_mask,
+ key_hash_code,
+ value,
+ static_cast<int>(hashtable_capacity),
+ use_cuda,
+ static_cast<int>(version)
+ );
+}
+
+std::vector<at::Tensor> fast_hash_wrapper(
+ at::Tensor query_mask,
+ at::Tensor query_vector,
+ at::Tensor key_mask,
+ at::Tensor key_vector,
+ int64_t num_hash_f,
+ int64_t hash_code_len,
+ bool use_cuda,
+ int64_t version
+) {
+ return fast_hash(
+ query_mask,
+ query_vector,
+ key_mask,
+ key_vector,
+ static_cast<int>(num_hash_f),
+ static_cast<int>(hash_code_len),
+ use_cuda,
+ static_cast<int>(version)
+ );
+}
+
+at::Tensor lsh_weighted_cumulation_wrapper(
+ at::Tensor query_mask, // [batch_size, num_query]
+ at::Tensor query_hash_code, // [batch_size, num_query, num_hash_f]
+ at::Tensor query_weight, // [batch_size, num_query, weight_dim]
+ at::Tensor key_mask, // [batch_size, num_key]
+ at::Tensor key_hash_code, // [batch_size, num_key, num_hash_f]
+ at::Tensor key_weight, // [batch_size, num_key, weight_dim]
+ at::Tensor value, // [batch_size, num_key, value_dim]
+ int64_t hashtable_capacity,
+ bool use_cuda,
+ int64_t version
+) {
+ return lsh_weighted_cumulation(
+ query_mask,
+ query_hash_code,
+ query_weight,
+ key_mask,
+ key_hash_code,
+ key_weight,
+ value,
+ static_cast<int>(hashtable_capacity),
+ use_cuda,
+ static_cast<int>(version)
+ );
+}
+TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+ ops.def("lsh_cumulation(Tensor query_mask, Tensor query_hash_code, Tensor key_mask, Tensor key_hash_code, Tensor value, int hashtable_capacity, bool use_cuda, int version) -> Tensor");
+ ops.impl("lsh_cumulation", torch::kCUDA, &lsh_cumulation_wrapper);
+
+ ops.def("fast_hash(Tensor query_mask, Tensor query_vector, Tensor key_mask, Tensor key_vector, int num_hash_f, int hash_code_len, bool use_cuda, int version) -> Tensor[]");
+ ops.impl("fast_hash", torch::kCUDA, &fast_hash_wrapper);
+
+ ops.def("lsh_weighted_cumulation(Tensor query_mask, Tensor query_hash_code, Tensor query_weight, Tensor key_mask, Tensor key_hash_code, Tensor key_weight, Tensor value, int hashtable_capacity, bool use_cuda, int version) -> Tensor");
+ ops.impl("lsh_weighted_cumulation", torch::kCUDA, &lsh_weighted_cumulation_wrapper);
+}
+
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
\ No newline at end of file
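The `ops.def`/`ops.impl` pairs register a typed schema and bind it to the CUDA wrappers; `int` in a schema surfaces as `int64_t` on the C++ side, which is why the wrappers narrow with `static_cast<int>` before calling into the kernels. Once the shared object is loaded, the schemas are visible from Python. A sketch (the namespace shown is build-specific and taken from one of the `_ops.py` files above):

    import torch
    import yoso  # importing the package loads the .so and registers the ops

    packet = torch.ops._yoso_cuda_c42e4a3.lsh_cumulation  # build-specific namespace
    print(packet.default._schema)
    # -> lsh_cumulation(Tensor query_mask, ..., int hashtable_capacity,
    #                   bool use_cuda, int version) -> Tensor
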
diff --git a/torch-ext/torch_binding.h b/torch-ext/torch_binding.h
new file mode 100644
index 0000000000000000000000000000000000000000..add45f31559e1a60c872caf82787f2d3bc4e0544
--- /dev/null
+++ b/torch-ext/torch_binding.h
@@ -0,0 +1,38 @@
+#include <ATen/ATen.h>
+#include <torch/torch.h>
+#include <vector>
+
+at::Tensor lsh_cumulation(
+ at::Tensor query_mask, // [batch_size, num_query]
+ at::Tensor query_hash_code, // [batch_size, num_query, num_hash_f]
+ at::Tensor key_mask, // [batch_size, num_key]
+ at::Tensor key_hash_code, // [batch_size, num_key, num_hash_f]
+ at::Tensor value, // [batch_size, num_key, value_dim]
+ int hashtable_capacity,
+ bool use_cuda,
+ int version
+);
+
+std::vector<at::Tensor> fast_hash(
+ at::Tensor query_mask,
+ at::Tensor query_vector,
+ at::Tensor key_mask,
+ at::Tensor key_vector,
+ int num_hash_f,
+ int hash_code_len,
+ bool use_cuda,
+ int version
+);
+
+at::Tensor lsh_weighted_cumulation(
+ at::Tensor query_mask, // [batch_size, num_query]
+ at::Tensor query_hash_code, // [batch_size, num_query, num_hash_f]
+ at::Tensor query_weight, // [batch_size, num_query, weight_dim]
+ at::Tensor key_mask, // [batch_size, num_key]
+ at::Tensor key_hash_code, // [batch_size, num_key, num_hash_f]
+ at::Tensor key_weight, // [batch_size, num_key, weight_dim]
+ at::Tensor value, // [batch_size, num_key, value_dim]
+ int hashtable_capacity,
+ bool use_cuda,
+ int version
+);
\ No newline at end of file
diff --git a/torch-ext/yoso/__init__.py b/torch-ext/yoso/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1738b47b313effeb7568d78344fdfd30fefca3e
--- /dev/null
+++ b/torch-ext/yoso/__init__.py
@@ -0,0 +1,11 @@
+from ._ops import ops
+
+fast_hash = ops.fast_hash
+lsh_cumulation = ops.lsh_cumulation
+lsh_weighted_cumulation = ops.lsh_weighted_cumulation
+
+__all__ = [
+ "fast_hash",
+ "lsh_cumulation",
+ "lsh_weighted_cumulation",
+]
\ No newline at end of file
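
Through this module the built kernels are reachable under plain names, and the intended call sequence is hash first, cumulate second. A sketch under the assumptions that the built package imports as `yoso`, masks are `int32`, and the query/key projections double as LSH weights:

```python
import torch
from yoso import fast_hash, lsh_weighted_cumulation  # assumes the built package

batch, n_q, n_k, dim = 2, 128, 128, 64
num_hash_f, hash_code_len = 32, 8
dev = "cuda"

q_mask = torch.ones(batch, n_q, dtype=torch.int32, device=dev)
k_mask = torch.ones(batch, n_k, dtype=torch.int32, device=dev)
q = torch.randn(batch, n_q, dim, device=dev)
k = torch.randn(batch, n_k, dim, device=dev)
v = torch.randn(batch, n_k, dim, device=dev)

# Phase 1: pack hash_code_len sign bits per hash function into integer codes.
q_code, k_code = fast_hash(q_mask, q, k_mask, k, num_hash_f, hash_code_len, True, 1)

# Phase 2: accumulate values over colliding query/key pairs, weighted by <q, k>.
out = lsh_weighted_cumulation(q_mask, q_code, q, k_mask, k_code, k,
                              v, 2 ** hash_code_len, True, 4)
print(out.shape)  # torch.Size([2, 128, 64])
```
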
diff --git a/yoso/common.h b/yoso/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5085c88dd3ea9a12eec264a8c48946bf2b80b23
--- /dev/null
+++ b/yoso/common.h
@@ -0,0 +1,10 @@
+
+#define min(a, b) ((a)<(b)?(a):(b))
+#define max(a, b) ((a)>(b)?(a):(b))
+#define ceil_divide(a, b) ((a)/(b)+((a)%(b)!=0))
+#define select(cond, a, b) ((cond)?(a):(b))
+#define PI 3.141592
+#define EPSILON 1e-8
+#define MAX_VAL 1e12
+#define MIN_VAL -1e12
+#define EMPTY_VALUE -1
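
`ceil_divide` is the one macro worth spelling out: it rounds an integer quotient up, and the host code later uses it to size grids so that a partial block still gets launched. The identity, checked in Python:

```python
def ceil_divide(a: int, b: int) -> int:
    # Same as the macro: truncating quotient, plus one when a remainder exists.
    return a // b + (a % b != 0)

assert ceil_divide(10, 5) == 2
assert ceil_divide(11, 5) == 3
assert ceil_divide(1, 5) == 1  # a partial block never rounds down to zero
```
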
diff --git a/yoso/common_cuda.h b/yoso/common_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..97030870649a2fdac58cb26cf966e8f5c8cc7909
--- /dev/null
+++ b/yoso/common_cuda.h
@@ -0,0 +1,9 @@
+
+#define MAX_THREADS_PER_BLOCK 1024
+#define OPTIMAL_THREADS_PER_BLOCK 256
+#define WARP_SIZE 32
+#define MAX_NUM_BLOCK_X 2147483647
+#define MAX_NUM_BLOCK_Y 65535
+#define MAX_NUM_BLOCK_Z 65535
+#define MAX_SHARED_MEM_PER_BLOCK 48000
+#define FULL_MASK 0xffffffff
diff --git a/yoso/common_cuda_device.h b/yoso/common_cuda_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..6674f93afdc25ab35c5d83881d00028bcf2989fc
--- /dev/null
+++ b/yoso/common_cuda_device.h
@@ -0,0 +1,79 @@
+
+#include "common.h"
+
+template<typename T>
+__device__ int set_insert(T *set, int set_size, T value) {
+ int slot = value % set_size;
+ int start_slot = slot;
+ while (true) {
+ T prev = atomicCAS(&set[slot], EMPTY_VALUE, value);
+ if (prev == EMPTY_VALUE || prev == value) {
+ return slot;
+ }
+ slot = (slot + 1) % set_size;
+ if (slot == start_slot) {
+ return -1;
+ }
+ }
+ return -1;
+}
+
+template<typename T>
+__device__ int set_lookup(T *set, int set_size, T value) {
+ int slot = value % set_size;
+ int start_slot = slot;
+ while (true) {
+ if (set[slot] == value) {
+ return slot;
+ }
+ slot = (slot + 1) % set_size;
+ if (slot == start_slot) {
+ return -1;
+ }
+ }
+ return -1;
+}
+
+template<typename T>
+__device__ void init_buffer(T init_value, T *buffer, int buffer_size, int num_threads, int thread_id) {
+ __syncthreads();
+ for (int i = 0; i < buffer_size; i = i + num_threads) {
+ int offset_idx = i + thread_id;
+ if (offset_idx < buffer_size) {
+ buffer[offset_idx] = init_value;
+ }
+ }
+ __syncthreads();
+}
+
+template<typename T>
+__device__ void copy_data(T *src_pt, T *dist_pt, int data_length, int num_threads, int thread_id) {
+ __syncthreads();
+ for (int i = 0; i < data_length; i = i + num_threads) {
+ int offset_idx = i + thread_id;
+ if (offset_idx < data_length) {
+ dist_pt[offset_idx] = src_pt[offset_idx];
+ }
+ }
+ __syncthreads();
+}
+
+template<typename T>
+__device__ void init_buffer_nonblocking(T init_value, T *buffer, int buffer_size, int num_threads, int thread_id) {
+ for (int i = 0; i < buffer_size; i = i + num_threads) {
+ int offset_idx = i + thread_id;
+ if (offset_idx < buffer_size) {
+ buffer[offset_idx] = init_value;
+ }
+ }
+}
+
+template<typename T>
+__device__ void copy_data_nonblocking(T *src_pt, T *dist_pt, int data_length, int num_threads, int thread_id) {
+ for (int i = 0; i < data_length; i = i + num_threads) {
+ int offset_idx = i + thread_id;
+ if (offset_idx < data_length) {
+ dist_pt[offset_idx] = src_pt[offset_idx];
+ }
+ }
+}
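
`set_insert` and `set_lookup` implement an open-addressing hash set with linear probing over a shared-memory table; `atomicCAS` makes the slot claim safe under concurrent inserts. A host-side Python model of the same probing logic, minus the atomics:

```python
EMPTY_VALUE = -1  # mirrors common.h

def set_insert(table, value):
    """Insert value; return its slot, or -1 if the table is full."""
    size = len(table)
    slot = value % size
    start = slot
    while True:
        if table[slot] in (EMPTY_VALUE, value):  # claim empty slot or match existing
            table[slot] = value
            return slot
        slot = (slot + 1) % size                 # linear probe
        if slot == start:
            return -1                            # wrapped around: table full

def set_lookup(table, value):
    size = len(table)
    slot = value % size
    start = slot
    while True:
        if table[slot] == value:
            return slot
        slot = (slot + 1) % size
        if slot == start:
            return -1

table = [EMPTY_VALUE] * 8
assert set_insert(table, 5) == 5
assert set_insert(table, 13) == 6  # 13 % 8 == 5 collides, probes to slot 6
assert set_lookup(table, 13) == 6
```
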
diff --git a/yoso/fast_lsh_cumulation.cu b/yoso/fast_lsh_cumulation.cu
new file mode 100644
index 0000000000000000000000000000000000000000..83c28decb854848b94bf2b6aff18464858f99156
--- /dev/null
+++ b/yoso/fast_lsh_cumulation.cu
@@ -0,0 +1,588 @@
+// File from https://github.com/mlpen/YOSO/blob/main/encoders/backbones/efficient_attentions/yoso/yoso_v1/cuda/fast_lsh_cumulation.cu
+
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include "fast_lsh_cumulation.h"
+#include "fast_lsh_cumulation_cuda.h"
+#include "common_cuda.h"
+#include "common.h"
+#include <vector>
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+std::vector<at::Tensor> fast_hash_ver1_kernel(
+ at::Tensor query_mask,
+ at::Tensor query_vector,
+ at::Tensor key_mask,
+ at::Tensor key_vector,
+ int num_hash_f,
+ int hash_code_len,
+ bool use_cuda
+) {
+
+ int batch_size = query_vector.size(0);
+ int num_query = query_vector.size(1);
+ int num_key = key_vector.size(1);
+ int vector_dim = query_vector.size(2);
+
+ int num_hash_per_part = vector_dim / hash_code_len;
+ int num_part = max(1, ceil_divide(num_hash_f, num_hash_per_part));
+
+ at::Tensor Dmat = 2 * at::randint(0, 2, {batch_size, 3, num_part, vector_dim}, query_mask.options()) - 1;
+ at::Tensor query_hash_code = at::zeros({batch_size, num_query, num_hash_f}, query_mask.options());
+ at::Tensor key_hash_code = at::zeros({batch_size, num_key, num_hash_f}, key_mask.options());
+
+ int *query_mask_ptr = query_mask.data_ptr<int>();
+ float *query_vector_ptr = query_vector.data_ptr<float>();
+ int *key_mask_ptr = key_mask.data_ptr<int>();
+ float *key_vector_ptr = key_vector.data_ptr<float>();
+
+ int *Dmat_ptr = Dmat.data_ptr<int>();
+
+ int *query_hash_code_ptr = query_hash_code.data_ptr<int>();
+ int *key_hash_code_ptr = key_hash_code.data_ptr<int>();
+
+ if (use_cuda) {
+ {
+ dim3 threads(vector_dim);
+ dim3 blocks(num_part, num_query, batch_size);
+ int shared_mem = vector_dim * sizeof(float);
+ fast_hash_ver1_cuda_kernel<<<blocks, threads, shared_mem>>>(
+ query_mask_ptr,
+ query_vector_ptr,
+ Dmat_ptr,
+ query_hash_code_ptr,
+ batch_size,
+ num_query,
+ vector_dim,
+ num_part,
+ num_hash_f,
+ hash_code_len
+ );
+ }
+ {
+ dim3 threads(vector_dim);
+ dim3 blocks(num_part, num_key, batch_size);
+ int shared_mem = vector_dim * sizeof(float);
+ fast_hash_ver1_cuda_kernel<<<blocks, threads, shared_mem>>>(
+ key_mask_ptr,
+ key_vector_ptr,
+ Dmat_ptr,
+ key_hash_code_ptr,
+ batch_size,
+ num_key,
+ vector_dim,
+ num_part,
+ num_hash_f,
+ hash_code_len
+ );
+ }
+ }
+
+ return {query_hash_code, key_hash_code};
+
+}
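
One Hadamard pass over a `vector_dim`-length vector yields `vector_dim / hash_code_len` packed codes, so the function above needs `num_part` independent sign diagonals (the `Dmat` tensor) to cover all `num_hash_f` hash functions. The shape arithmetic, restated in Python:

```python
def hashing_shapes(vector_dim: int, num_hash_f: int, hash_code_len: int):
    # Codes extracted per transformed vector:
    num_hash_per_part = vector_dim // hash_code_len
    # ceil_divide from common.h: enough parts to cover num_hash_f functions.
    num_part = max(1, num_hash_f // num_hash_per_part
                      + (num_hash_f % num_hash_per_part != 0))
    return num_hash_per_part, num_part

# 64-dim vectors, 32 hash functions of 8 bits: each part yields 8 codes -> 4 parts.
assert hashing_shapes(64, 32, 8) == (8, 4)
# Dmat is then [batch_size, 3, num_part, vector_dim]: one random +-1 diagonal
# per part for each of the three Hadamard passes.
```
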
+
+at::Tensor lsh_cumulation_ver1_kernel(
+ at::Tensor query_mask,
+ at::Tensor query_hash_code,
+ at::Tensor key_mask,
+ at::Tensor key_hash_code,
+ at::Tensor value,
+ int hashtable_capacity,
+ bool use_cuda
+) {
+
+ int batch_size = query_hash_code.size(0);
+ int num_hash_f = query_hash_code.size(2);
+
+ int num_query = query_hash_code.size(1);
+ int num_key = key_hash_code.size(1);
+ int value_dim = value.size(2);
+
+ at::Tensor hashtable_value = at::empty({batch_size, num_hash_f, hashtable_capacity, WARP_SIZE}, value.options());
+ at::Tensor cumulation_value = at::zeros({batch_size, num_query, value_dim}, value.options());
+
+ if (use_cuda) {
+ int threads_x = WARP_SIZE;
+ int threads_y = OPTIMAL_THREADS_PER_BLOCK / WARP_SIZE;
+ int block_x_step1 = num_key / threads_y;
+ int block_x_step2 = num_query / threads_y;
+ int block_y = batch_size;
+
+ dim3 threads(threads_x, threads_y);
+ dim3 blocks_step1(block_x_step1, block_y);
+ dim3 blocks_step2(block_x_step2, block_y);
+
+ int *query_mask_ptr = query_mask.data_ptr<int>();
+ int *query_hash_code_ptr = query_hash_code.data_ptr<int>();
+ int *key_mask_ptr = key_mask.data_ptr<int>();
+ int *key_hash_code_ptr = key_hash_code.data_ptr<int>();
+ float *value_ptr = value.data_ptr<float>();
+ float *hashtable_value_ptr = hashtable_value.data_ptr<float>();
+ float *cumulation_value_ptr = cumulation_value.data_ptr<float>();
+
+ for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) {
+
+ cudaMemset(hashtable_value_ptr, 0, (batch_size * num_hash_f * hashtable_capacity * WARP_SIZE) * sizeof(float));
+
+ lsh_cumulation_ver1_step1_cuda_kernel<<<blocks_step1, threads>>>(
+ key_mask_ptr,
+ key_hash_code_ptr,
+ value_ptr,
+ hashtable_value_ptr,
+ batch_size,
+ num_hash_f,
+ hashtable_capacity,
+ num_key,
+ value_dim,
+ value_offset
+ );
+
+ lsh_cumulation_ver1_step2_cuda_kernel<<<blocks_step2, threads>>>(
+ query_mask_ptr,
+ query_hash_code_ptr,
+ hashtable_value_ptr,
+ cumulation_value_ptr,
+ batch_size,
+ num_hash_f,
+ hashtable_capacity,
+ num_query,
+ value_dim,
+ value_offset
+ );
+ }
+
+ }
+
+ return cumulation_value;
+
+}
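
Functionally, version 1 scatter-adds each key's value slice into one table per hash function (step 1), then gathers each query's matching buckets and averages over hash functions (step 2); the `WARP_SIZE` loop above just tiles the value dimension. A dense PyTorch restatement of that contract, ignoring masks and meant for intuition rather than as a drop-in replacement:

```python
import torch

def lsh_cumulation_reference(q_code, k_code, value, capacity):
    # q_code: [B, Nq, H], k_code: [B, Nk, H] int64 codes; value: [B, Nk, D].
    B, Nq, H = q_code.shape
    D = value.shape[-1]
    table = value.new_zeros(B, H, capacity, D)
    # Step 1: scatter-add every key's value into its bucket, per hash function.
    k_idx = k_code.permute(0, 2, 1).unsqueeze(-1).expand(-1, -1, -1, D)
    table.scatter_add_(2, k_idx, value.unsqueeze(1).expand(-1, H, -1, -1))
    # Step 2: gather each query's bucket, average over the H hash functions.
    q_idx = q_code.permute(0, 2, 1).unsqueeze(-1).expand(-1, -1, -1, D)
    return table.gather(2, q_idx).mean(dim=1)  # [B, Nq, D]

q_code = torch.randint(0, 16, (2, 5, 4))
k_code = torch.randint(0, 16, (2, 7, 4))
value = torch.randn(2, 7, 3)
print(lsh_cumulation_reference(q_code, k_code, value, 16).shape)  # [2, 5, 3]
```
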
+
+at::Tensor lsh_weighted_cumulation_ver1_kernel(
+ at::Tensor query_mask,
+ at::Tensor query_hash_code,
+ at::Tensor query_weight,
+ at::Tensor key_mask,
+ at::Tensor key_hash_code,
+ at::Tensor key_weight,
+ at::Tensor value,
+ int hashtable_capacity,
+ bool use_cuda
+) {
+
+ int batch_size = query_hash_code.size(0);
+ int num_hash_f = query_hash_code.size(2);
+
+ int num_query = query_hash_code.size(1);
+ int num_key = key_hash_code.size(1);
+ int value_dim = value.size(2);
+ int weight_dim = query_weight.size(2);
+
+ at::Tensor hashtable_value = at::zeros({batch_size, num_hash_f, hashtable_capacity, WARP_SIZE}, value.options());
+ at::Tensor cumulation_value = at::zeros({batch_size, num_query, value_dim}, value.options());
+
+ if (use_cuda) {
+ int threads_x = WARP_SIZE;
+ int threads_y = OPTIMAL_THREADS_PER_BLOCK / WARP_SIZE;
+ int block_x_step1 = num_key / threads_y;
+ int block_x_step2 = num_query / threads_y;
+ int block_y = batch_size;
+
+ dim3 threads(threads_x, threads_y);
+ dim3 blocks_step1(block_x_step1, block_y);
+ dim3 blocks_step2(block_x_step2, block_y);
+
+ int *query_mask_ptr = query_mask.data_ptr<int>();
+ int *query_hash_code_ptr = query_hash_code.data_ptr<int>();
+ float *query_weight_ptr = query_weight.data_ptr<float>();
+ int *key_mask_ptr = key_mask.data_ptr<int>();
+ int *key_hash_code_ptr = key_hash_code.data_ptr<int>();
+ float *key_weight_ptr = key_weight.data_ptr<float>();
+ float *value_ptr = value.data_ptr<float>();
+ float *hashtable_value_ptr = hashtable_value.data_ptr<float>();
+ float *cumulation_value_ptr = cumulation_value.data_ptr<float>();
+
+ for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) {
+ for (int weight_idx = 0; weight_idx < weight_dim; weight_idx++) {
+
+ cudaMemset(hashtable_value_ptr, 0, (batch_size * num_hash_f * hashtable_capacity * WARP_SIZE) * sizeof(float));
+
+ lsh_weighted_cumulation_ver1_step1_cuda_kernel<<<blocks_step1, threads>>>(
+ key_mask_ptr,
+ key_hash_code_ptr,
+ key_weight_ptr,
+ value_ptr,
+ hashtable_value_ptr,
+ batch_size,
+ num_hash_f,
+ hashtable_capacity,
+ num_key,
+ value_dim,
+ weight_dim,
+ value_offset,
+ weight_idx
+ );
+
+ lsh_weighted_cumulation_ver1_step2_cuda_kernel<<<blocks_step2, threads>>>(
+ query_mask_ptr,
+ query_hash_code_ptr,
+ query_weight_ptr,
+ hashtable_value_ptr,
+ cumulation_value_ptr,
+ batch_size,
+ num_hash_f,
+ hashtable_capacity,
+ num_query,
+ value_dim,
+ weight_dim,
+ value_offset,
+ weight_idx
+ );
+ }
+ }
+
+ }
+
+ return cumulation_value;
+
+}
+
+at::Tensor lsh_weighted_cumulation_ver2_kernel(
+ at::Tensor query_mask,
+ at::Tensor query_hash_code,
+ at::Tensor query_weight,
+ at::Tensor key_mask,
+ at::Tensor key_hash_code,
+ at::Tensor key_weight,
+ at::Tensor value,
+ int hashtable_capacity,
+ bool use_cuda
+) {
+
+ int batch_size = query_hash_code.size(0);
+ int num_hash_f = query_hash_code.size(2);
+
+ int num_query = query_hash_code.size(1);
+ int num_key = key_hash_code.size(1);
+ int value_dim = value.size(2);
+ int weight_dim = query_weight.size(2);
+
+ at::Tensor count_sort_table = at::zeros({batch_size, num_hash_f, hashtable_capacity}, query_hash_code.options());
+ at::Tensor key_sorted_idxes = at::zeros({batch_size, num_hash_f, num_key}, query_hash_code.options());
+ at::Tensor query_info = at::zeros({batch_size, num_query, 2, num_hash_f}, query_hash_code.options());
+ at::Tensor cumulation_value = at::zeros({batch_size, num_query, value_dim}, value.options());
+
+ if (use_cuda) {
+
+ int *query_mask_ptr = query_mask.data_ptr<int>();
+ int *query_hash_code_ptr = query_hash_code.data_ptr<int>();
+ float *query_weight_ptr = query_weight.data_ptr<float>();
+ int *key_mask_ptr = key_mask.data_ptr<int>();
+ int *key_hash_code_ptr = key_hash_code.data_ptr<int>();
+ float *key_weight_ptr = key_weight.data_ptr<float>();
+ float *value_ptr = value.data_ptr<float>();
+
+ int *count_sort_table_ptr = count_sort_table.data_ptr<int>();
+ int *key_sorted_idxes_ptr = key_sorted_idxes.data_ptr<int>();
+ int *query_info_ptr = query_info.data_ptr<int>();
+
+ float *cumulation_value_ptr = cumulation_value.data_ptr<float>();
+
+ {
+ dim3 threads_step13(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f));
+ dim3 blocks_step13(num_key / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size);
+ dim3 threads_step2(min(hashtable_capacity, OPTIMAL_THREADS_PER_BLOCK));
+ dim3 blocks_step2(num_hash_f, batch_size);
+ int shared_mem = hashtable_capacity * sizeof(float);
+ count_sort_step1_cuda_kernel<<<blocks_step13, threads_step13>>>(
+ key_mask_ptr,
+ key_hash_code_ptr,
+ count_sort_table_ptr,
+ batch_size,
+ num_hash_f,
+ hashtable_capacity,
+ num_key
+ );
+ count_sort_step2_cuda_kernel<<<blocks_step2, threads_step2, shared_mem>>>(
+ count_sort_table_ptr,
+ batch_size,
+ num_hash_f,
+ hashtable_capacity
+ );
+ count_sort_step3_cuda_kernel<<<blocks_step13, threads_step13>>>(
+ key_mask_ptr,
+ key_hash_code_ptr,
+ count_sort_table_ptr,
+ key_sorted_idxes_ptr,
+ batch_size,
+ num_hash_f,
+ hashtable_capacity,
+ num_key
+ );
+ }
+ {
+ dim3 threads(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f));
+ dim3 blocks(num_query / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size);
+ extract_query_info_cuda_kernel<<<blocks, threads>>>(
+ query_mask_ptr,
+ query_hash_code_ptr,
+ count_sort_table_ptr,
+ query_info_ptr,
+ batch_size,
+ num_hash_f,
+ hashtable_capacity,
+ num_query
+ );
+ }
+ {
+ dim3 threads(WARP_SIZE, OPTIMAL_THREADS_PER_BLOCK / WARP_SIZE);
+ dim3 blocks(num_query, num_hash_f, batch_size);
+ int shared_mem = (weight_dim + WARP_SIZE) * sizeof(float);
+ lsh_weighted_cumulation_ver2_step2_cuda_kernel<<<blocks, threads, shared_mem>>>(
+ query_mask_ptr,
+ query_info_ptr,
+ key_sorted_idxes_ptr,
+ query_weight_ptr,
+ key_weight_ptr,
+ value_ptr,
+ cumulation_value_ptr,
+ batch_size,
+ num_hash_f,
+ num_query,
+ num_key,
+ value_dim,
+ weight_dim
+ );
+ }
+ }
+
+ return cumulation_value;
+
+}
+
+at::Tensor lsh_weighted_cumulation_ver3_kernel(
+ at::Tensor query_mask,
+ at::Tensor query_hash_code,
+ at::Tensor query_weight,
+ at::Tensor key_mask,
+ at::Tensor key_hash_code,
+ at::Tensor key_weight,
+ at::Tensor value,
+ int hashtable_capacity,
+ bool use_cuda
+) {
+
+ int batch_size = query_hash_code.size(0);
+ int num_hash_f = query_hash_code.size(2);
+
+ int num_query = query_hash_code.size(1);
+ int num_key = key_hash_code.size(1);
+ int value_dim = value.size(2);
+ int weight_dim = query_weight.size(2);
+
+ at::Tensor count_sort_table = at::zeros({batch_size, num_hash_f, hashtable_capacity}, query_hash_code.options());
+ at::Tensor query_sorted_idxes = at::zeros({batch_size, num_hash_f, num_query}, query_hash_code.options());
+ at::Tensor key_info = at::zeros({batch_size, num_key, 2, num_hash_f}, query_hash_code.options());
+ at::Tensor cumulation_value = at::zeros({batch_size, num_query, value_dim}, value.options());
+
+ if (use_cuda) {
+
+ int *query_mask_ptr = query_mask.data_ptr<int>();
+ int *query_hash_code_ptr = query_hash_code.data_ptr<int>();
+ float *query_weight_ptr = query_weight.data_ptr<float>();
+ int *key_mask_ptr = key_mask.data_ptr<int>();
+ int *key_hash_code_ptr = key_hash_code.data_ptr<int>();
+ float *key_weight_ptr = key_weight.data_ptr<float>();
+ float *value_ptr = value.data_ptr<float>();
+
+ int *count_sort_table_ptr = count_sort_table.data_ptr<int>();
+ int *query_sorted_idxes_ptr = query_sorted_idxes.data_ptr<int>();
+ int *key_info_ptr = key_info.data_ptr<int>();
+
+ float *cumulation_value_ptr = cumulation_value.data_ptr<float>();
+
+ {
+ dim3 threads_step13(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f));
+ dim3 blocks_step13(num_query / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size);
+ dim3 threads_step2(min(hashtable_capacity, OPTIMAL_THREADS_PER_BLOCK));
+ dim3 blocks_step2(num_hash_f, batch_size);
+ int shared_mem = hashtable_capacity * sizeof(float);
+ count_sort_step1_cuda_kernel<<<blocks_step13, threads_step13>>>(
+ query_mask_ptr,
+ query_hash_code_ptr,
+ count_sort_table_ptr,
+ batch_size,
+ num_hash_f,
+ hashtable_capacity,
+ num_query
+ );
+ count_sort_step2_cuda_kernel<<<blocks_step2, threads_step2, shared_mem>>>(
+ count_sort_table_ptr,
+ batch_size,
+ num_hash_f,
+ hashtable_capacity
+ );
+ count_sort_step3_cuda_kernel<<<blocks_step13, threads_step13>>>(
+ query_mask_ptr,
+ query_hash_code_ptr,
+ count_sort_table_ptr,
+ query_sorted_idxes_ptr,
+ batch_size,
+ num_hash_f,
+ hashtable_capacity,
+ num_query
+ );
+ }
+ {
+ dim3 threads(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f));
+ dim3 blocks(num_key / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size);
+ extract_query_info_cuda_kernel<<<blocks, threads>>>(
+ key_mask_ptr,
+ key_hash_code_ptr,
+ count_sort_table_ptr,
+ key_info_ptr,
+ batch_size,
+ num_hash_f,
+ hashtable_capacity,
+ num_key
+ );
+ }
+ {
+ dim3 threads(WARP_SIZE, OPTIMAL_THREADS_PER_BLOCK / WARP_SIZE);
+ dim3 blocks(num_key, num_hash_f, batch_size);
+ int shared_mem = (weight_dim + value_dim + WARP_SIZE) * sizeof(float);
+ lsh_weighted_cumulation_ver3_step2_cuda_kernel<<<blocks, threads, shared_mem>>>(
+ query_sorted_idxes_ptr,
+ key_mask_ptr,
+ key_info_ptr,
+ query_weight_ptr,
+ key_weight_ptr,
+ value_ptr,
+ cumulation_value_ptr,
+ batch_size,
+ num_hash_f,
+ num_query,
+ num_key,
+ value_dim,
+ weight_dim
+ );
+ }
+ }
+
+ return cumulation_value;
+
+}
+
+at::Tensor lsh_weighted_cumulation_ver4_kernel(
+ at::Tensor query_mask,
+ at::Tensor query_hash_code,
+ at::Tensor query_weight,
+ at::Tensor key_mask,
+ at::Tensor key_hash_code,
+ at::Tensor key_weight,
+ at::Tensor value,
+ int hashtable_capacity,
+ bool use_cuda
+) {
+
+ int batch_size = query_hash_code.size(0);
+ int num_hash_f = query_hash_code.size(2);
+
+ int num_query = query_hash_code.size(1);
+ int num_key = key_hash_code.size(1);
+ int value_dim = value.size(2);
+ int weight_dim = query_weight.size(2);
+
+ at::Tensor count_sort_table = at::zeros({batch_size, num_hash_f, hashtable_capacity}, query_hash_code.options());
+ at::Tensor query_sorted_idxes = at::zeros({batch_size, num_hash_f, num_query}, query_hash_code.options());
+ at::Tensor key_info = at::zeros({batch_size, num_key, 2, num_hash_f}, query_hash_code.options());
+ at::Tensor cumulation_value = at::zeros({batch_size, num_query, value_dim}, value.options());
+
+ if (use_cuda) {
+
+ int *query_mask_ptr = query_mask.data_ptr<int>();
+ int *query_hash_code_ptr = query_hash_code.data_ptr<int>();
+ float *query_weight_ptr = query_weight.data_ptr<float>();
+ int *key_mask_ptr = key_mask.data_ptr<int>();
+ int *key_hash_code_ptr = key_hash_code.data_ptr<int>();
+ float *key_weight_ptr = key_weight.data_ptr<float>();
+ float *value_ptr = value.data_ptr<float>();
+
+ int *count_sort_table_ptr = count_sort_table.data_ptr<int>();
+ int *query_sorted_idxes_ptr = query_sorted_idxes.data_ptr<int>();
+ int *key_info_ptr = key_info.data_ptr<int>();
+
+ float *cumulation_value_ptr = cumulation_value.data_ptr<float>();
+
+ {
+ dim3 threads_step13(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f));
+ dim3 blocks_step13(num_query / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size);
+ dim3 threads_step2(min(hashtable_capacity, OPTIMAL_THREADS_PER_BLOCK));
+ dim3 blocks_step2(num_hash_f, batch_size);
+ int shared_mem = hashtable_capacity * sizeof(float);
+ count_sort_step1_cuda_kernel<<<blocks_step13, threads_step13>>>(
+ query_mask_ptr,
+ query_hash_code_ptr,
+ count_sort_table_ptr,
+ batch_size,
+ num_hash_f,
+ hashtable_capacity,
+ num_query
+ );
+ count_sort_step2_cuda_kernel<<<blocks_step2, threads_step2, shared_mem>>>(
+ count_sort_table_ptr,
+ batch_size,
+ num_hash_f,
+ hashtable_capacity
+ );
+ count_sort_step3_cuda_kernel<<<blocks_step13, threads_step13>>>(
+ query_mask_ptr,
+ query_hash_code_ptr,
+ count_sort_table_ptr,
+ query_sorted_idxes_ptr,
+ batch_size,
+ num_hash_f,
+ hashtable_capacity,
+ num_query
+ );
+ }
+ {
+ dim3 threads(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f));
+ dim3 blocks(num_key / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size);
+ extract_query_info_cuda_kernel<<<blocks, threads>>>(
+ key_mask_ptr,
+ key_hash_code_ptr,
+ count_sort_table_ptr,
+ key_info_ptr,
+ batch_size,
+ num_hash_f,
+ hashtable_capacity,
+ num_key
+ );
+ }
+ {
+ dim3 threads(WARP_SIZE, OPTIMAL_THREADS_PER_BLOCK / WARP_SIZE);
+ dim3 blocks(num_key, batch_size);
+ int shared_mem = (weight_dim + value_dim + 2 * num_hash_f) * sizeof(float);
+ lsh_weighted_cumulation_ver4_step2_cuda_kernel<<<blocks, threads, shared_mem>>>(
+ query_sorted_idxes_ptr,
+ key_mask_ptr,
+ key_info_ptr,
+ query_weight_ptr,
+ key_weight_ptr,
+ value_ptr,
+ cumulation_value_ptr,
+ batch_size,
+ num_hash_f,
+ num_query,
+ num_key,
+ value_dim,
+ weight_dim
+ );
+ }
+ }
+
+ return cumulation_value;
+
+}
diff --git a/yoso/fast_lsh_cumulation.h b/yoso/fast_lsh_cumulation.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc0bb0bee9d76d413f01668f247bf25dd2e044a9
--- /dev/null
+++ b/yoso/fast_lsh_cumulation.h
@@ -0,0 +1,71 @@
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include <vector>
+
+std::vector<at::Tensor> fast_hash_ver1_kernel(
+ at::Tensor query_mask,
+ at::Tensor query_vector,
+ at::Tensor key_mask,
+ at::Tensor key_vector,
+ int num_hash_f,
+ int hash_code_len,
+ bool use_cuda
+);
+
+at::Tensor lsh_cumulation_ver1_kernel(
+ at::Tensor query_mask,
+ at::Tensor query_hash_code,
+ at::Tensor key_mask,
+ at::Tensor key_hash_code,
+ at::Tensor value,
+ int hashtable_capacity,
+ bool use_cuda
+);
+
+at::Tensor lsh_weighted_cumulation_ver1_kernel(
+ at::Tensor query_mask,
+ at::Tensor query_hash_code,
+ at::Tensor query_weight,
+ at::Tensor key_mask,
+ at::Tensor key_hash_code,
+ at::Tensor key_weight,
+ at::Tensor value,
+ int hashtable_capacity,
+ bool use_cuda
+);
+
+at::Tensor lsh_weighted_cumulation_ver2_kernel(
+ at::Tensor query_mask,
+ at::Tensor query_hash_code,
+ at::Tensor query_weight,
+ at::Tensor key_mask,
+ at::Tensor key_hash_code,
+ at::Tensor key_weight,
+ at::Tensor value,
+ int hashtable_capacity,
+ bool use_cuda
+);
+
+at::Tensor lsh_weighted_cumulation_ver3_kernel(
+ at::Tensor query_mask,
+ at::Tensor query_hash_code,
+ at::Tensor query_weight,
+ at::Tensor key_mask,
+ at::Tensor key_hash_code,
+ at::Tensor key_weight,
+ at::Tensor value,
+ int hashtable_capacity,
+ bool use_cuda
+);
+
+at::Tensor lsh_weighted_cumulation_ver4_kernel(
+ at::Tensor query_mask,
+ at::Tensor query_hash_code,
+ at::Tensor query_weight,
+ at::Tensor key_mask,
+ at::Tensor key_hash_code,
+ at::Tensor key_weight,
+ at::Tensor value,
+ int hashtable_capacity,
+ bool use_cuda
+);
diff --git a/yoso/fast_lsh_cumulation_cuda.cu b/yoso/fast_lsh_cumulation_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..22944e97044659f896451936c6253d5aadd7a769
--- /dev/null
+++ b/yoso/fast_lsh_cumulation_cuda.cu
@@ -0,0 +1,825 @@
+// File from https://github.com/mlpen/YOSO/blob/main/encoders/backbones/efficient_attentions/yoso/yoso_v1/cuda/fast_lsh_cumulation_cuda.cu
+
+#include "fast_lsh_cumulation_cuda.h"
+#include "common_cuda_device.h"
+#include "common_cuda.h"
+#include "common.h"
+#include <math.h>
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void fast_hadamard_transform(float *vector_buffer, int vector_dim, int dim_idx) {
+ int stride = vector_dim / 2;
+ while (stride > (WARP_SIZE / 2)) {
+ __syncthreads();
+ int sign = 1 - ((dim_idx / stride) % 2) * 2;
+ float val1 = vector_buffer[dim_idx];
+ float val2 = vector_buffer[dim_idx + sign * stride];
+ __syncthreads();
+ vector_buffer[dim_idx] = float(sign) * val1 + val2;
+ stride = stride / 2;
+ }
+
+ float val = vector_buffer[dim_idx];
+ #pragma unroll
+ for (stride = (WARP_SIZE / 2); stride > 0; stride = stride / 2) {
+ int sign = 1 - ((dim_idx / stride) % 2) * 2;
+ val = float(sign) * val + __shfl_xor_sync(FULL_MASK, val, stride);
+ }
+ vector_buffer[dim_idx] = val;
+}
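
The routine above is an in-place fast Walsh-Hadamard transform: block-wide butterflies run through shared memory while the stride exceeds half a warp, and warp XOR-shuffles finish the remaining stages without touching memory. A sequential Python reference of the same butterfly, assuming a power-of-two length:

```python
def fwht(x):
    """Unnormalized fast Walsh-Hadamard transform; len(x) must be a power of two."""
    x = list(x)
    stride = len(x) // 2
    while stride > 0:
        old = x[:]  # snapshot plays the role of the __syncthreads() barriers
        for i in range(len(x)):
            sign = 1 - ((i // stride) % 2) * 2  # +1 in the upper half of each pair
            x[i] = sign * old[i] + old[i + sign * stride]
        stride //= 2
    return x

# A basis vector transforms into the matching +-1 Walsh row.
print(fwht([1.0, 0.0, 0.0, 0.0]))  # [1.0, 1.0, 1.0, 1.0]
```
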
+
+__global__ void fast_hash_ver1_cuda_kernel(
+ int *mask, // [batch_size, num_vector]
+ float *vector, // [batch_size, num_vector, vector_dim]
+ int *Dmat, // [batch_size, 3, num_part, vector_dim]
+ int *hash_code, // [batch_size, num_vector, num_hash_f]
+ int batch_size,
+ int num_vector,
+ int vector_dim,
+ int num_part,
+ int num_hash_f,
+ int hash_code_len
+) {
+
+ int batch_idx = blockIdx.z;
+ int vector_idx = blockIdx.y;
+ int part_idx = blockIdx.x;
+
+ int dim_idx = threadIdx.x;
+
+ int batch_idx__vector_idx = batch_idx * num_vector + vector_idx;
+ if (mask[batch_idx__vector_idx] == 0) {
+ return;
+ }
+
+ extern __shared__ float buffer[];
+ float *vector_buffer = buffer;
+
+ vector_buffer[dim_idx] = vector[batch_idx__vector_idx * vector_dim + dim_idx];
+
+ vector_buffer[dim_idx] = vector_buffer[dim_idx] * (float)Dmat[((batch_idx * 3 + 0) * num_part + part_idx) * vector_dim + dim_idx];
+ fast_hadamard_transform(vector_buffer, vector_dim, dim_idx);
+ vector_buffer[dim_idx] = vector_buffer[dim_idx] * (float)Dmat[((batch_idx * 3 + 1) * num_part + part_idx) * vector_dim + dim_idx];
+ fast_hadamard_transform(vector_buffer, vector_dim, dim_idx);
+ vector_buffer[dim_idx] = vector_buffer[dim_idx] * (float)Dmat[((batch_idx * 3 + 2) * num_part + part_idx) * vector_dim + dim_idx];
+ fast_hadamard_transform(vector_buffer, vector_dim, dim_idx);
+
+ int num_hash_per_part = vector_dim / hash_code_len;
+ if (hash_code_len == 8 || hash_code_len == 16) {
+ int code = select(vector_buffer[dim_idx] > 0, 1 << (dim_idx % hash_code_len), 0);
+ for (int offset = 1; offset < hash_code_len; offset = offset * 2) {
+ code += __shfl_xor_sync(FULL_MASK, code, offset);
+ }
+ if (dim_idx % hash_code_len == 0) {
+ int hash_f_idx = part_idx * num_hash_per_part + dim_idx / hash_code_len;
+ if (hash_f_idx < num_hash_f) {
+ hash_code[batch_idx__vector_idx * num_hash_f + hash_f_idx] = code;
+ }
+ }
+ } else {
+ vector_buffer[dim_idx] = select(vector_buffer[dim_idx] > 0, 1 << (dim_idx % hash_code_len), 0);
+ __syncthreads();
+ if (dim_idx < num_hash_per_part) {
+ int code = 0;
+ for (int i = 0; i < hash_code_len; i++) {
+ code += vector_buffer[dim_idx * hash_code_len + i];
+ }
+ int hash_f_idx = part_idx * num_hash_per_part + dim_idx;
+ if (hash_f_idx < num_hash_f) {
+ hash_code[batch_idx__vector_idx * num_hash_f + hash_f_idx] = code;
+ }
+ }
+ }
+}
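
After the three randomized transform passes, each lane holds one signed coordinate, and the kernel packs `hash_code_len` sign bits into one integer per hash function (via warp XOR-shuffles on the 8/16-bit fast path, via shared memory otherwise). The packing rule itself is just this, in Python:

```python
def pack_hash_codes(coords, hash_code_len):
    """Pack the sign bits of consecutive coordinates into integer hash codes."""
    codes = []
    for start in range(0, len(coords), hash_code_len):
        code = 0
        for bit, c in enumerate(coords[start:start + hash_code_len]):
            if c > 0:  # select(v > 0, 1 << bit, 0) in the kernel
                code |= 1 << bit
        codes.append(code)
    return codes

# 16 coordinates, hash_code_len=8 -> two 8-bit codes in [0, 256).
print(pack_hash_codes([1, -2, 3, -4, 5, -6, 7, -8] * 2, 8))  # [85, 85]
```
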
+
+__global__ void lsh_cumulation_ver1_step1_cuda_kernel(
+ int *key_mask, // [batch_size, num_key]
+ int *key_hash_code, // [batch_size, num_key, num_hash_f]
+ float *value, // [batch_size, num_key, value_dim]
+ float *hashtable_value, // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE]
+ int batch_size,
+ int num_hash_f,
+ int hashtable_capacity,
+ int num_key,
+ int value_dim,
+ int offset_warp
+) {
+
+ int warp_thread_idx = threadIdx.x;
+
+ int batch_idx = blockIdx.y;
+ int key_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+ int batch_idx__key_idx = batch_idx * num_key + key_idx;
+ if (key_mask[batch_idx__key_idx] == 0) {
+ return;
+ }
+
+ if (num_hash_f > WARP_SIZE) {
+ float warp_value = value[batch_idx__key_idx * value_dim + offset_warp + warp_thread_idx];
+ for (int hash_f_start = 0; hash_f_start < num_hash_f; hash_f_start = hash_f_start + WARP_SIZE) {
+ int warp_hashcode = key_hash_code[batch_idx__key_idx * num_hash_f + hash_f_start + warp_thread_idx];
+ #pragma unroll
+ for (int hash_f_offset = 0; hash_f_offset < WARP_SIZE; hash_f_offset++) {
+ int current_hashcode = warp_hashcode;
+ current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_offset);
+ int hashtable_idx = (batch_idx * num_hash_f + (hash_f_start + hash_f_offset)) * hashtable_capacity + current_hashcode;
+ atomicAdd(&hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx], warp_value);
+ }
+ }
+ } else {
+ float warp_value = value[batch_idx__key_idx * value_dim + offset_warp + warp_thread_idx];
+ int warp_hashcode = 0;
+ if (warp_thread_idx < num_hash_f) {
+ warp_hashcode = key_hash_code[batch_idx__key_idx * num_hash_f + warp_thread_idx];
+ }
+ for (int hash_f_idx = 0; hash_f_idx < num_hash_f; hash_f_idx++) {
+ int current_hashcode = warp_hashcode;
+ current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_idx);
+ int hashtable_idx = (batch_idx * num_hash_f + hash_f_idx) * hashtable_capacity + current_hashcode;
+ atomicAdd(&hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx], warp_value);
+ }
+ }
+
+}
+
+__global__ void lsh_cumulation_ver1_step2_cuda_kernel(
+ int *query_mask, // [batch_size, num_query]
+ int *query_hash_code, // [batch_size, num_query, num_hash_f]
+ float *hashtable_value, // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE]
+ float *cumulation_value, // [batch_size, num_query, value_dim]
+ int batch_size,
+ int num_hash_f,
+ int hashtable_capacity,
+ int num_query,
+ int value_dim,
+ int offset_warp
+) {
+
+ int warp_thread_idx = threadIdx.x;
+
+ int batch_idx = blockIdx.y;
+ int query_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+ int batch_idx__query_idx = batch_idx * num_query + query_idx;
+ if (query_mask[batch_idx__query_idx] == 0) {
+ return;
+ }
+
+ if (num_hash_f > WARP_SIZE) {
+ float warp_value = 0;
+ for (int hash_f_start = 0; hash_f_start < num_hash_f; hash_f_start = hash_f_start + WARP_SIZE) {
+ int warp_hashcode = query_hash_code[batch_idx__query_idx * num_hash_f + hash_f_start + warp_thread_idx];
+ #pragma unroll
+ for (int hash_f_offset = 0; hash_f_offset < WARP_SIZE; hash_f_offset++) {
+ int current_hashcode = warp_hashcode;
+ current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_offset);
+ int hashtable_idx = (batch_idx * num_hash_f + (hash_f_start + hash_f_offset)) * hashtable_capacity + current_hashcode;
+ warp_value = warp_value + hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx];
+ }
+ }
+ cumulation_value[batch_idx__query_idx * value_dim + offset_warp + warp_thread_idx] = warp_value / float(num_hash_f);
+ } else {
+ float warp_value = 0;
+ int warp_hashcode = 0;
+ if (warp_thread_idx < num_hash_f) {
+ warp_hashcode = query_hash_code[batch_idx__query_idx * num_hash_f + warp_thread_idx];
+ }
+ for (int hash_f_idx = 0; hash_f_idx < num_hash_f; hash_f_idx++) {
+ int current_hashcode = warp_hashcode;
+ current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_idx);
+ int hashtable_idx = (batch_idx * num_hash_f + hash_f_idx) * hashtable_capacity + current_hashcode;
+ warp_value = warp_value + hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx];
+ }
+ cumulation_value[batch_idx__query_idx * value_dim + offset_warp + warp_thread_idx] = warp_value / float(num_hash_f);
+ }
+
+}
+
+__global__ void lsh_weighted_cumulation_ver1_step1_cuda_kernel(
+ int *key_mask, // [batch_size, num_key]
+ int *key_hash_code, // [batch_size, num_key, num_hash_f]
+ float *key_weight, // [batch_size, num_key, weight_dim]
+ float *value, // [batch_size, num_key, value_dim]
+ float *hashtable_value, // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE]
+ int batch_size,
+ int num_hash_f,
+ int hashtable_capacity,
+ int num_key,
+ int value_dim,
+ int weight_dim,
+ int offset_warp,
+ int weight_idx
+) {
+
+ int warp_thread_idx = threadIdx.x;
+
+ int batch_idx = blockIdx.y;
+ int key_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+ int batch_idx__key_idx = batch_idx * num_key + key_idx;
+ if (key_mask[batch_idx__key_idx] == 0) {
+ return;
+ }
+
+ if (num_hash_f > WARP_SIZE) {
+ float warp_value = key_weight[batch_idx__key_idx * weight_dim + weight_idx] * value[batch_idx__key_idx * value_dim + offset_warp + warp_thread_idx];
+ for (int hash_f_start = 0; hash_f_start < num_hash_f; hash_f_start = hash_f_start + WARP_SIZE) {
+ int warp_hashcode = key_hash_code[batch_idx__key_idx * num_hash_f + hash_f_start + warp_thread_idx];
+ #pragma unroll
+ for (int hash_f_offset = 0; hash_f_offset < WARP_SIZE; hash_f_offset++) {
+ int current_hashcode = warp_hashcode;
+ current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_offset);
+ int hashtable_idx = (batch_idx * num_hash_f + (hash_f_start + hash_f_offset)) * hashtable_capacity + current_hashcode;
+ atomicAdd(&hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx], warp_value);
+ }
+ }
+ } else {
+ float warp_value = key_weight[batch_idx__key_idx * weight_dim + weight_idx] * value[batch_idx__key_idx * value_dim + offset_warp + warp_thread_idx];
+ int warp_hashcode = 0;
+ if (warp_thread_idx < num_hash_f) {
+ warp_hashcode = key_hash_code[batch_idx__key_idx * num_hash_f + warp_thread_idx];
+ }
+ for (int hash_f_idx = 0; hash_f_idx < num_hash_f; hash_f_idx++) {
+ int current_hashcode = warp_hashcode;
+ current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_idx);
+ int hashtable_idx = (batch_idx * num_hash_f + hash_f_idx) * hashtable_capacity + current_hashcode;
+ atomicAdd(&hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx], warp_value);
+ }
+ }
+
+}
+
+__global__ void lsh_weighted_cumulation_ver1_step2_cuda_kernel(
+ int *query_mask, // [batch_size, num_query]
+ int *query_hash_code, // [batch_size, num_query, num_hash_f]
+ float *query_weight, // [batch_size, num_query, weight_dim]
+ float *hashtable_value, // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE]
+ float *cumulation_value, // [batch_size, num_query, value_dim]
+ int batch_size,
+ int num_hash_f,
+ int hashtable_capacity,
+ int num_query,
+ int value_dim,
+ int weight_dim,
+ int offset_warp,
+ int weight_idx
+) {
+
+ int warp_thread_idx = threadIdx.x;
+
+ int batch_idx = blockIdx.y;
+ int query_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+ int batch_idx__query_idx = batch_idx * num_query + query_idx;
+ if (query_mask[batch_idx__query_idx] == 0) {
+ return;
+ }
+
+ if (num_hash_f > WARP_SIZE) {
+ float warp_value = 0;
+ for (int hash_f_start = 0; hash_f_start < num_hash_f; hash_f_start = hash_f_start + WARP_SIZE) {
+ int warp_hashcode = query_hash_code[batch_idx__query_idx * num_hash_f + hash_f_start + warp_thread_idx];
+ #pragma unroll
+ for (int hash_f_offset = 0; hash_f_offset < WARP_SIZE; hash_f_offset++) {
+ int current_hashcode = warp_hashcode;
+ current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_offset);
+ int hashtable_idx = (batch_idx * num_hash_f + (hash_f_start + hash_f_offset)) * hashtable_capacity + current_hashcode;
+ warp_value = warp_value + hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx];
+ }
+ }
+ float warp_weight = query_weight[batch_idx__query_idx * weight_dim + weight_idx];
+ cumulation_value[batch_idx__query_idx * value_dim + offset_warp + warp_thread_idx] += warp_weight * warp_value / float(num_hash_f);
+ } else {
+ float warp_value = 0;
+ int warp_hashcode = 0;
+ if (warp_thread_idx < num_hash_f) {
+ warp_hashcode = query_hash_code[batch_idx__query_idx * num_hash_f + warp_thread_idx];
+ }
+ for (int hash_f_idx = 0; hash_f_idx < num_hash_f; hash_f_idx++) {
+ int current_hashcode = warp_hashcode;
+ current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_idx);
+ int hashtable_idx = (batch_idx * num_hash_f + hash_f_idx) * hashtable_capacity + current_hashcode;
+ warp_value = warp_value + hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx];
+ }
+ float warp_weight = query_weight[batch_idx__query_idx * weight_dim + weight_idx];
+ cumulation_value[batch_idx__query_idx * value_dim + offset_warp + warp_thread_idx] += warp_weight * warp_value / float(num_hash_f);
+ }
+
+}
+
+__global__ void count_sort_step1_cuda_kernel(
+ int *key_mask, // [batch_size, num_key]
+ int *key_hash_code, // [batch_size, num_key, num_hash_f]
+ int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity]
+ int batch_size,
+ int num_hash_f,
+ int hashtable_capacity,
+ int num_key
+) {
+
+ int batch_idx = blockIdx.y;
+ int key_idx = blockIdx.x * blockDim.y + threadIdx.y;
+ int hash_f_idx = threadIdx.x;
+
+ int batch_idx__key_idx = batch_idx * num_key + key_idx;
+ if (key_mask[batch_idx__key_idx] == 0) {
+ return;
+ }
+
+ int hash_code = key_hash_code[batch_idx__key_idx * num_hash_f + hash_f_idx];
+ atomicAdd(&count_sort_table[(batch_idx * num_hash_f + hash_f_idx) * hashtable_capacity + hash_code], 1);
+
+}
+
+__global__ void count_sort_step2_cuda_kernel(
+ int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity]
+ int batch_size,
+ int num_hash_f,
+ int hashtable_capacity
+) {
+
+ int batch_idx = blockIdx.y;
+ int hash_f_idx = blockIdx.x;
+
+ int num_threads = blockDim.x;
+ int thread_id = threadIdx.x;
+
+ int batch_idx__hash_f_idx = batch_idx * num_hash_f + hash_f_idx;
+
+ extern __shared__ float buffer[];
+ int *table_buffer = (int*)buffer;
+
+ if (thread_id == 0) {
+ table_buffer[0] = 0;
+ }
+ copy_data(&count_sort_table[batch_idx__hash_f_idx * hashtable_capacity], &table_buffer[1], hashtable_capacity - 1, num_threads, thread_id);
+
+ for (int table_idx_start = 0; table_idx_start < hashtable_capacity; table_idx_start = table_idx_start + num_threads) {
+ int thread_value = table_buffer[table_idx_start + thread_id];
+ int next_thread_value = 0;
+ for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) {
+ next_thread_value = __shfl_up_sync(FULL_MASK, thread_value, offset);
+ if (thread_id % WARP_SIZE >= offset) {
+ thread_value = thread_value + next_thread_value;
+ }
+ }
+ table_buffer[table_idx_start + thread_id] = thread_value;
+ }
+ __syncthreads();
+
+ if (hashtable_capacity > WARP_SIZE) {
+ if (thread_id < WARP_SIZE) {
+ for (int table_idx_start = WARP_SIZE; table_idx_start < hashtable_capacity; table_idx_start = table_idx_start + WARP_SIZE) {
+ table_buffer[table_idx_start + thread_id] += table_buffer[table_idx_start - 1];
+ }
+ }
+ }
+
+ copy_data(table_buffer, &count_sort_table[batch_idx__hash_f_idx * hashtable_capacity], hashtable_capacity, num_threads, thread_id);
+
+}
+
+
+__global__ void count_sort_step3_cuda_kernel(
+ int *key_mask, // [batch_size, num_key]
+ int *key_hash_code, // [batch_size, num_key, num_hash_f]
+ int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity]
+ int *key_sorted_idxes, // [batch_size, num_hash_f, num_key]
+ int batch_size,
+ int num_hash_f,
+ int hashtable_capacity,
+ int num_key
+) {
+
+ int batch_idx = blockIdx.y;
+ int key_idx = blockIdx.x * blockDim.y + threadIdx.y;
+ int hash_f_idx = threadIdx.x;
+
+ int batch_idx__key_idx = batch_idx * num_key + key_idx;
+ if (key_mask[batch_idx__key_idx] == 0) {
+ return;
+ }
+
+ int batch_idx__hash_f_idx = batch_idx * num_hash_f + hash_f_idx;
+
+ int hash_code = key_hash_code[batch_idx__key_idx * num_hash_f + hash_f_idx];
+ int sort_idx = atomicAdd(&count_sort_table[batch_idx__hash_f_idx * hashtable_capacity + hash_code], 1);
+ key_sorted_idxes[batch_idx__hash_f_idx * num_key + sort_idx] = key_idx;
+
+}
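
Steps 1-3 form a counting sort of key indices by hash code, run independently for every (batch, hash function) pair: histogram, prefix scan, stable scatter. In the kernels, step 2 leaves start offsets in the table and step 3's `atomicAdd` advances them into the cumulative end offsets that `extract_query_info` reads afterwards. A sequential Python model of one such sort:

```python
def count_sort_by_code(codes, capacity):
    """Group key indices by hash code; return (sorted indices, cumulative table)."""
    # Step 1: histogram.
    table = [0] * capacity
    for c in codes:
        table[c] += 1
    # Step 2: inclusive prefix sum -> cumulative end offsets.
    for i in range(1, capacity):
        table[i] += table[i - 1]
    # Step 3: stable scatter from per-bucket start offsets (the kernel instead
    # advances the stored starts with atomicAdd until they equal these ends).
    starts = [0] + table[:-1]
    sorted_idxes = [0] * len(codes)
    for key_idx, c in enumerate(codes):
        sorted_idxes[starts[c]] = key_idx
        starts[c] += 1
    return sorted_idxes, table

idxes, table = count_sort_by_code([3, 1, 3, 0, 1], capacity=4)
print(idxes)  # [3, 1, 4, 0, 2] -- key indices ordered by hash code
print(table)  # [1, 3, 3, 5]    -- what extract_query_info slices offsets from
```
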
+
+__global__ void extract_query_info_cuda_kernel(
+ int *query_mask, // [batch_size, num_query]
+ int *query_hash_code, // [batch_size, num_query, num_hash_f]
+ int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity]
+ int *query_info, // [batch_size, num_query, 2, num_hash_f]
+ int batch_size,
+ int num_hash_f,
+ int hashtable_capacity,
+ int num_query
+) {
+
+ int batch_idx = blockIdx.y;
+ int query_idx = blockIdx.x * blockDim.y + threadIdx.y;
+ int hash_f_idx = threadIdx.x;
+
+ int batch_idx__query_idx = batch_idx * num_query + query_idx;
+ if (query_mask[batch_idx__query_idx] == 0) {
+ return;
+ }
+
+ int hash_code = query_hash_code[batch_idx__query_idx * num_hash_f + hash_f_idx];
+ int batch_idx__hash_f_idx__hash_code = (batch_idx * num_hash_f + hash_f_idx) * hashtable_capacity + hash_code;
+
+ int key_offset = select(hash_code == 0, 0, count_sort_table[batch_idx__hash_f_idx__hash_code - 1]);
+ int key_count = count_sort_table[batch_idx__hash_f_idx__hash_code] - key_offset;
+
+ query_info[batch_idx__query_idx * 2 * num_hash_f + hash_f_idx] = key_offset;
+ query_info[(batch_idx__query_idx * 2 + 1) * num_hash_f + hash_f_idx] = key_count;
+
+}
+
+__global__ void lsh_weighted_cumulation_ver2_step2_cuda_kernel(
+ int *query_mask, // [batch_size, num_query]
+ int *query_info, // [batch_size, num_query, 2, num_hash_f]
+ int *key_sorted_idxes, // [batch_size, num_hash_f, num_key]
+ float *query_weight, // [batch_size, num_query, weight_dim]
+ float *key_weight, // [batch_size, num_key, weight_dim]
+ float *value, // [batch_size, num_key, value_dim]
+ float *cumulation_value, // [batch_size, num_query, value_dim]
+ int batch_size,
+ int num_hash_f,
+ int num_query,
+ int num_key,
+ int value_dim,
+ int weight_dim
+) {
+
+ int batch_idx = blockIdx.z;
+ int hash_f_idx = blockIdx.y;
+ int query_idx = blockIdx.x;
+
+ int num_threads = blockDim.y * blockDim.x;
+ int thread_id = threadIdx.y * blockDim.x + threadIdx.x;
+
+ int num_warps = blockDim.y;
+ int warp_idx = threadIdx.y;
+ int warp_thread_idx = threadIdx.x;
+
+ int batch_idx__query_idx = batch_idx * num_query + query_idx;
+ if (query_mask[batch_idx__query_idx] == 0) {
+ return;
+ }
+
+ int key_offset = query_info[batch_idx__query_idx * 2 * num_hash_f + hash_f_idx];
+ int key_count = query_info[(batch_idx__query_idx * 2 + 1) * num_hash_f + hash_f_idx];
+
+ if (key_count == 0) {
+ return;
+ }
+
+ extern __shared__ float buffer[];
+
+ if (key_count == 1) {
+ if (warp_idx == 0) {
+ int key_idx = key_sorted_idxes[(batch_idx * num_hash_f + hash_f_idx) * num_key + key_offset];
+ int batch_idx__key_idx = batch_idx * num_key + key_idx;
+ float weight = 0;
+ for (int weight_offset = 0; weight_offset < weight_dim; weight_offset = weight_offset + WARP_SIZE) {
+ int weight_dim_idx = weight_offset + warp_thread_idx;
+ float val = query_weight[batch_idx__query_idx * weight_dim + weight_dim_idx] * key_weight[batch_idx__key_idx * weight_dim + weight_dim_idx];
+ #pragma unroll
+ for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) {
+ val += __shfl_xor_sync(FULL_MASK, val, offset);
+ }
+ weight = weight + val;
+ }
+ weight = weight / float(num_hash_f);
+ for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) {
+ int value_dim_idx = value_offset + warp_thread_idx;
+ float val = value[batch_idx__key_idx * value_dim + value_dim_idx];
+ atomicAdd(&cumulation_value[batch_idx__query_idx * value_dim + value_dim_idx], weight * val);
+ }
+ }
+ } else {
+ float *weight_buffer = buffer;
+ int *key_idxes_buffer = (int*)&buffer[weight_dim];
+
+ copy_data_nonblocking(&query_weight[batch_idx__query_idx * weight_dim], weight_buffer, weight_dim, num_threads, thread_id);
+
+ while (key_count > 0) {
+ int work_size = min(WARP_SIZE, key_count);
+ copy_data_nonblocking(&key_sorted_idxes[(batch_idx * num_hash_f + hash_f_idx) * num_key + key_offset], key_idxes_buffer, work_size, num_threads, thread_id);
+ __syncthreads();
+ for (int work_offset = 0; work_offset < WARP_SIZE; work_offset = work_offset + num_warps) {
+ int work_idx = work_offset + warp_idx;
+ if (work_idx < key_count) {
+ int key_idx = key_idxes_buffer[work_idx];
+ int batch_idx__key_idx = batch_idx * num_key + key_idx;
+ float weight = 0;
+ for (int weight_offset = 0; weight_offset < weight_dim; weight_offset = weight_offset + WARP_SIZE) {
+ int weight_dim_idx = weight_offset + warp_thread_idx;
+ float val = weight_buffer[weight_dim_idx] * key_weight[batch_idx__key_idx * weight_dim + weight_dim_idx];
+ #pragma unroll
+ for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) {
+ val += __shfl_xor_sync(FULL_MASK, val, offset);
+ }
+ weight = weight + val;
+ }
+ weight = weight / float(num_hash_f);
+ for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) {
+ int value_dim_idx = value_offset + warp_thread_idx;
+ float val = value[batch_idx__key_idx * value_dim + value_dim_idx];
+ atomicAdd(&cumulation_value[batch_idx__query_idx * value_dim + value_dim_idx], weight * val);
+ }
+ }
+ }
+ key_count = key_count - work_size;
+ key_offset = key_offset + work_size;
+ }
+ }
+
+}
+
+__global__ void lsh_weighted_cumulation_ver3_step2_cuda_kernel(
+ int *query_sorted_idxes, // [batch_size, num_hash_f, num_query]
+ int *key_mask, // [batch_size, num_key]
+ int *key_info, // [batch_size, num_key, 2, num_hash_f]
+ float *query_weight, // [batch_size, num_query, weight_dim]
+ float *key_weight, // [batch_size, num_key, weight_dim]
+ float *value, // [batch_size, num_key, value_dim]
+ float *cumulation_value, // [batch_size, num_query, value_dim]
+ int batch_size,
+ int num_hash_f,
+ int num_query,
+ int num_key,
+ int value_dim,
+ int weight_dim
+) {
+
+ int batch_idx = blockIdx.z;
+ int hash_f_idx = blockIdx.y;
+ int key_idx = blockIdx.x;
+
+ int num_threads = blockDim.y * blockDim.x;
+ int thread_id = threadIdx.y * blockDim.x + threadIdx.x;
+
+ int num_warps = blockDim.y;
+ int warp_idx = threadIdx.y;
+ int warp_thread_idx = threadIdx.x;
+
+ int batch_idx__key_idx = batch_idx * num_key + key_idx;
+ if (key_mask[batch_idx__key_idx] == 0) {
+ return;
+ }
+
+ int query_offset = key_info[batch_idx__key_idx * 2 * num_hash_f + hash_f_idx];
+ int query_count = key_info[(batch_idx__key_idx * 2 + 1) * num_hash_f + hash_f_idx];
+
+ if (query_count == 0) {
+ return;
+ }
+
+ extern __shared__ float buffer[];
+
+ if (query_count == 1) {
+ if (warp_idx == 0) {
+ int query_idx = query_sorted_idxes[(batch_idx * num_hash_f + hash_f_idx) * num_query + query_offset];
+ int batch_idx__query_idx = batch_idx * num_query + query_idx;
+ float weight = 0;
+ for (int weight_offset = 0; weight_offset < weight_dim; weight_offset = weight_offset + WARP_SIZE) {
+ int weight_dim_idx = weight_offset + warp_thread_idx;
+ float val = key_weight[batch_idx__key_idx * weight_dim + weight_dim_idx] * query_weight[batch_idx__query_idx * weight_dim + weight_dim_idx];
+ #pragma unroll
+ for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) {
+ val += __shfl_xor_sync(FULL_MASK, val, offset);
+ }
+ weight = weight + val;
+ }
+ weight = weight / float(num_hash_f);
+ for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) {
+ int value_dim_idx = value_offset + warp_thread_idx;
+ float val = value[batch_idx__key_idx * value_dim + value_dim_idx];
+ atomicAdd(&cumulation_value[batch_idx__query_idx * value_dim + value_dim_idx], weight * val);
+ }
+ }
+ } else {
+ float *weight_buffer = buffer;
+ float *value_buffer = &buffer[weight_dim];
+ int *query_idxes_buffer = (int*)&buffer[weight_dim + value_dim];
+
+ copy_data_nonblocking(&key_weight[batch_idx__key_idx * weight_dim], weight_buffer, weight_dim, num_threads, thread_id);
+ copy_data_nonblocking(&value[batch_idx__key_idx * value_dim], value_buffer, value_dim, num_threads, thread_id);
+
+ while (query_count > 0) {
+ int work_size = min(WARP_SIZE, query_count);
+ copy_data_nonblocking(&query_sorted_idxes[(batch_idx * num_hash_f + hash_f_idx) * num_query + query_offset], query_idxes_buffer, work_size, num_threads, thread_id);
+ __syncthreads();
+ for (int work_offset = 0; work_offset < WARP_SIZE; work_offset = work_offset + num_warps) {
+ int work_idx = work_offset + warp_idx;
+ if (work_idx < query_count) {
+ int query_idx = query_idxes_buffer[work_idx];
+ int batch_idx__query_idx = batch_idx * num_query + query_idx;
+ float weight = 0;
+ for (int weight_offset = 0; weight_offset < weight_dim; weight_offset = weight_offset + WARP_SIZE) {
+ int weight_dim_idx = weight_offset + warp_thread_idx;
+ float val = weight_buffer[weight_dim_idx] * query_weight[batch_idx__query_idx * weight_dim + weight_dim_idx];
+ #pragma unroll
+ for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) {
+ val += __shfl_xor_sync(FULL_MASK, val, offset);
+ }
+ weight = weight + val;
+ }
+ weight = weight / float(num_hash_f);
+ for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) {
+ int value_dim_idx = value_offset + warp_thread_idx;
+ float val = value_buffer[value_dim_idx];
+ atomicAdd(&cumulation_value[batch_idx__query_idx * value_dim + value_dim_idx], weight * val);
+ }
+ }
+ }
+ query_count = query_count - work_size;
+ query_offset = query_offset + work_size;
+ }
+ }
+
+}
+
+__global__ void lsh_weighted_cumulation_ver4_step2_cuda_kernel(
+ int *query_sorted_idxes, // [batch_size, num_hash_f, num_query]
+ int *key_mask, // [batch_size, num_key]
+ int *key_info, // [batch_size, num_key, 2, num_hash_f]
+ float *query_weight, // [batch_size, num_query, weight_dim]
+ float *key_weight, // [batch_size, num_key, weight_dim]
+ float *value, // [batch_size, num_key, value_dim]
+ float *cumulation_value, // [batch_size, num_query, value_dim]
+ int batch_size,
+ int num_hash_f,
+ int num_query,
+ int num_key,
+ int value_dim,
+ int weight_dim
+) {
+
+ int batch_idx = blockIdx.y;
+ int key_idx = blockIdx.x;
+
+ int num_threads = blockDim.y * blockDim.x;
+ int thread_id = threadIdx.y * blockDim.x + threadIdx.x;
+
+ int num_warps = blockDim.y;
+ int warp_idx = threadIdx.y;
+ int warp_thread_idx = threadIdx.x;
+
+ int batch_idx__key_idx = batch_idx * num_key + key_idx;
+ if (key_mask[batch_idx__key_idx] == 0) {
+ return;
+ }
+
+ extern __shared__ float buffer[];
+ float *weight_buffer = buffer;
+ float *value_buffer = &buffer[weight_dim];
+ int *key_info_buffer = (int*)&buffer[weight_dim + value_dim];
+
+ copy_data_nonblocking(&key_weight[batch_idx__key_idx * weight_dim], weight_buffer, weight_dim, num_threads, thread_id);
+ copy_data_nonblocking(&value[batch_idx__key_idx * value_dim], value_buffer, value_dim, num_threads, thread_id);
+ copy_data_nonblocking(&key_info[batch_idx__key_idx * 2 * num_hash_f], key_info_buffer, 2 * num_hash_f, num_threads, thread_id);
+
+ int *query_offset_buffer = key_info_buffer;
+ int *query_count_buffer = &key_info_buffer[num_hash_f];
+
+ const int hashtable_size = 1024 + OPTIMAL_THREADS_PER_BLOCK;
+ __shared__ int hashtable_query[hashtable_size];
+ __shared__ int hashtable_count[hashtable_size];
+ __shared__ int inserted_query[hashtable_size];
+ __shared__ int query_counter[1];
+
+ int hash_f_idx_base = 0;
+
+ while (true) {
+
+ init_buffer_nonblocking(EMPTY_VALUE, hashtable_query, hashtable_size, num_threads, thread_id);
+ init_buffer_nonblocking(0, hashtable_count, hashtable_size, num_threads, thread_id);
+ init_buffer_nonblocking(EMPTY_VALUE, inserted_query, hashtable_size, num_threads, thread_id);
+ init_buffer_nonblocking(0, query_counter, 1, num_threads, thread_id);
+ __syncthreads();
+
+ while (hash_f_idx_base < num_hash_f) {
+
+ int hash_f_idx = hash_f_idx_base + warp_idx;
+ int batch_idx__hash_f_idx = batch_idx * num_hash_f + hash_f_idx;
+
+ int stop_flag = 0;
+
+ int query_offset = query_offset_buffer[hash_f_idx];
+ int query_count = query_count_buffer[hash_f_idx];
+
+ while (query_count > 0) {
+
+ int work_size = min(query_count, WARP_SIZE);
+
+ // try inserting query to set and check whether the query is new
+ int found_new_query = 0;
+ int query_idx = -1;
+ if (warp_thread_idx < work_size) {
+ query_idx = query_sorted_idxes[batch_idx__hash_f_idx * num_query + query_offset + warp_thread_idx];
+ int slot = set_insert(hashtable_query, hashtable_size, query_idx);
+ if (slot >= 0) {
+ found_new_query = atomicAdd(&hashtable_count[slot], 1) == 0;
+ }
+ }
+
+ // compute cumulative offset
+ int position_offset = found_new_query;
+ int next_position_offset = 0;
+ #pragma unroll
+ for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) {
+ next_position_offset = __shfl_up_sync(FULL_MASK, position_offset, offset);
+ if (thread_id % WARP_SIZE >= offset) {
+ position_offset = position_offset + next_position_offset;
+ }
+ }
+
+ // get the inserted query list end index
+ int inserted_query_base = 0;
+ if (thread_id % WARP_SIZE == WARP_SIZE - 1) {
+ inserted_query_base = atomicAdd(query_counter, position_offset);
+ }
+ inserted_query_base = __shfl_sync(FULL_MASK, inserted_query_base, WARP_SIZE - 1);
+
+ // insert new queries to list
+ int insert_idx = inserted_query_base + position_offset - 1;
+ if (found_new_query) {
+ inserted_query[insert_idx] = query_idx;
+ }
+
+ // remove inserted queries from list
+ query_offset_buffer[hash_f_idx] += work_size;
+ query_count_buffer[hash_f_idx] -= work_size;
+ query_offset += work_size;
+ query_count -= work_size;
+
+ // if list is almost full, stop inserting
+ if (inserted_query_base + OPTIMAL_THREADS_PER_BLOCK > hashtable_size) {
+ stop_flag = 1;
+ break;
+ }
+
+ }
+
+ if (stop_flag) {
+ break;
+ }
+
+ hash_f_idx_base = hash_f_idx_base + num_warps;
+
+ }
+
+ __syncthreads();
+
+ int num_distinct_query = query_counter[0];
+
+ if (num_distinct_query > 0) {
+ for (int idx_base = 0; idx_base < num_distinct_query; idx_base = idx_base + num_warps) {
+ int idx = idx_base + warp_idx;
+ if (idx < num_distinct_query) {
+ int query_idx = inserted_query[idx];
+ int batch_idx__query_idx = batch_idx * num_query + query_idx;
+
+ int slot = set_lookup(hashtable_query, hashtable_size, query_idx);
+ int duplicate_count = hashtable_count[slot];
+
+ float weight = 0;
+ for (int weight_idx_base = 0; weight_idx_base < weight_dim; weight_idx_base = weight_idx_base + WARP_SIZE) {
+ int weight_dim_idx = weight_idx_base + warp_thread_idx;
+ float val = weight_buffer[weight_dim_idx] * query_weight[batch_idx__query_idx * weight_dim + weight_dim_idx];
+ #pragma unroll
+ for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) {
+ val += __shfl_xor_sync(FULL_MASK, val, offset);
+ }
+ weight = weight + val;
+ }
+
+ weight = (float)duplicate_count * weight / float(num_hash_f);
+
+ for (int value_idx_base = 0; value_idx_base < value_dim; value_idx_base = value_idx_base + WARP_SIZE) {
+ int value_dim_idx = value_idx_base + warp_thread_idx;
+ float val = value_buffer[value_dim_idx];
+ atomicAdd(&cumulation_value[batch_idx__query_idx * value_dim + value_dim_idx], weight * val);
+ }
+ }
+ }
+ } else {
+
+ // all computation is completed if num_distinct_query == 0
+ break;
+
+ }
+
+ __syncthreads();
+
+ }
+
+}
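
Version 4's step 2 is the subtle one: for a fixed key it sweeps all hash functions, deduplicates the colliding queries in the shared-memory set above, and then processes each distinct query once, folding its collision count into the weight (`duplicate_count * <q, k> / num_hash_f`). The counting part reduces to this, sketched in Python:

```python
from collections import Counter

def ver4_collision_weights(colliding_queries, num_hash_f):
    """Map query index -> duplicate_count / num_hash_f, as in the kernel.

    colliding_queries lists one entry per (hash function, collision) for a
    single key; the dot-product weight <q, k> multiplies this factor later.
    """
    counts = Counter(colliding_queries)
    return {q: c / num_hash_f for q, c in counts.items()}

# A key meeting query 7 under 3 of 4 hash functions and query 2 under 1:
print(ver4_collision_weights([7, 7, 2, 7], num_hash_f=4))  # {7: 0.75, 2: 0.25}
```
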
diff --git a/yoso/fast_lsh_cumulation_cuda.h b/yoso/fast_lsh_cumulation_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e89396d10ce177501d1ace14e0873ba89910308
--- /dev/null
+++ b/yoso/fast_lsh_cumulation_cuda.h
@@ -0,0 +1,158 @@
+__global__ void fast_hash_ver1_cuda_kernel(
+ int *mask, // [batch_size, num_vector]
+ float *vector, // [batch_size, num_vector, vector_dim]
+ int *Dmat, // [batch_size, 3, num_part, vector_dim]
+ int *hash_code, // [batch_size, num_vector, num_hash_f]
+ int batch_size,
+ int num_vector,
+ int vector_dim,
+ int num_part,
+ int num_hash_f,
+ int hash_code_len
+ );
+
+ __global__ void lsh_cumulation_ver1_step1_cuda_kernel(
+ int *key_mask, // [batch_size, num_key]
+ int *key_hash_code, // [batch_size, num_key, num_hash_f]
+ float *value, // [batch_size, num_key, value_dim]
+ float *hashtable_value, // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE]
+ int batch_size,
+ int num_hash_f,
+ int hashtable_capacity,
+ int num_key,
+ int value_dim,
+ int offset_warp
+ );
+
+ __global__ void lsh_cumulation_ver1_step2_cuda_kernel(
+ int *query_mask, // [batch_size, num_query]
+ int *query_hash_code, // [batch_size, num_query, num_hash_f]
+ float *hashtable_value, // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE]
+ float *cumulation_value, // [batch_size, num_query, value_dim]
+ int batch_size,
+ int num_hash_f,
+ int hashtable_capacity,
+ int num_query,
+ int value_dim,
+ int offset_warp
+ );
+
+ __global__ void lsh_weighted_cumulation_ver1_step1_cuda_kernel(
+ int *key_mask, // [batch_size, num_key]
+ int *key_hash_code, // [batch_size, num_key, num_hash_f]
+ float *key_weight, // [batch_size, num_key, weight_dim]
+ float *value, // [batch_size, num_key, value_dim]
+ float *hashtable_value, // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE]
+ int batch_size,
+ int num_hash_f,
+ int hashtable_capacity,
+ int num_key,
+ int value_dim,
+ int weight_dim,
+ int offset_warp,
+ int weight_idx
+ );
+
+ __global__ void lsh_weighted_cumulation_ver1_step2_cuda_kernel(
+ int *query_mask, // [batch_size, num_query]
+ int *query_hash_code, // [batch_size, num_query, num_hash_f]
+ float *query_weight, // [batch_size, num_query, weight_dim]
+ float *hashtable_value, // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE]
+ float *cumulation_value, // [batch_size, num_query, value_dim]
+ int batch_size,
+ int num_hash_f,
+ int hashtable_capacity,
+ int num_query,
+ int value_dim,
+ int weight_dim,
+ int offset_warp,
+ int weight_idx
+ );
+
+ __global__ void count_sort_step1_cuda_kernel(
+ int *key_mask, // [batch_size, num_key]
+ int *key_hash_code, // [batch_size, num_key, num_hash_f]
+ int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity]
+ int batch_size,
+ int num_hash_f,
+ int hashtable_capacity,
+ int num_key
+ );
+
+ __global__ void count_sort_step2_cuda_kernel(
+ int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity]
+ int batch_size,
+ int num_hash_f,
+ int hashtable_capacity
+ );
+
+ __global__ void count_sort_step3_cuda_kernel(
+ int *key_mask, // [batch_size, num_key]
+ int *key_hash_code, // [batch_size, num_key, num_hash_f]
+ int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity]
+ int *key_sorted_idxes, // [batch_size, num_hash_f, num_key]
+ int batch_size,
+ int num_hash_f,
+ int hashtable_capacity,
+ int num_key
+ );
+
+ __global__ void extract_query_info_cuda_kernel(
+ int *query_mask, // [batch_size, num_query]
+ int *query_hash_code, // [batch_size, num_query, num_hash_f]
+ int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity]
+ int *query_info, // [batch_size, num_query, 2, num_hash_f]
+ int batch_size,
+ int num_hash_f,
+ int hashtable_capacity,
+ int num_query
+ );
+
+ __global__ void lsh_weighted_cumulation_ver2_step2_cuda_kernel(
+ int *query_mask, // [batch_size, num_query]
+ int *query_info, // [batch_size, num_query, 2, num_hash_f]
+ int *key_sorted_idxes, // [batch_size, num_hash_f, num_key]
+ float *query_weight, // [batch_size, num_query, weight_dim]
+ float *key_weight, // [batch_size, num_key, weight_dim]
+ float *value, // [batch_size, num_key, value_dim]
+ float *cumulation_value, // [batch_size, num_query, value_dim]
+ int batch_size,
+ int num_hash_f,
+ int num_query,
+ int num_key,
+ int value_dim,
+ int weight_dim
+ );
+
+ __global__ void lsh_weighted_cumulation_ver3_step2_cuda_kernel(
+ int *query_sorted_idxes, // [batch_size, num_hash_f, num_query]
+ int *key_mask, // [batch_size, num_key]
+ int *key_info, // [batch_size, num_key, 2, num_hash_f]
+ float *query_weight, // [batch_size, num_query, weight_dim]
+ float *key_weight, // [batch_size, num_key, weight_dim]
+ float *value, // [batch_size, num_key, value_dim]
+ float *cumulation_value, // [batch_size, num_query, value_dim]
+ int batch_size,
+ int num_hash_f,
+ int num_query,
+ int num_key,
+ int value_dim,
+ int weight_dim
+ );
+
+ __global__ void lsh_weighted_cumulation_ver4_step2_cuda_kernel(
+ int *query_sorted_idxes, // [batch_size, num_hash_f, num_query]
+ int *key_mask, // [batch_size, num_key]
+ int *key_info, // [batch_size, num_key, 2, num_hash_f]
+ float *query_weight, // [batch_size, num_query, weight_dim]
+ float *key_weight, // [batch_size, num_key, weight_dim]
+ float *value, // [batch_size, num_key, value_dim]
+ float *cumulation_value, // [batch_size, num_query, value_dim]
+ int batch_size,
+ int num_hash_f,
+ int num_query,
+ int num_key,
+ int value_dim,
+ int weight_dim
+ );
+
\ No newline at end of file
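
This header only declares the kernels; their launch configurations live in the corresponding .cu translation unit. As a rough sketch of how a host-side wrapper would drive one of these declarations, here is a hypothetical launcher for count_sort_step2_cuda_kernel. The grid/block shapes and the THREADS_PER_BLOCK constant are assumptions made for this sketch, not values taken from the patch:

#include <cuda_runtime.h>
#include "fast_lsh_cumulation_cuda.h"

#define THREADS_PER_BLOCK 256  // assumed block size, not from the patch

// Hypothetical host wrapper: one block per (hash function, batch) pair,
// with the block's threads cooperating over the hashtable_capacity buckets.
void launch_count_sort_step2(
  int *count_sort_table,  // device pointer, [batch_size, num_hash_f, hashtable_capacity]
  int batch_size,
  int num_hash_f,
  int hashtable_capacity
) {
  dim3 grid(num_hash_f, batch_size);
  dim3 block(THREADS_PER_BLOCK);
  count_sort_step2_cuda_kernel<<<grid, block>>>(
      count_sort_table, batch_size, num_hash_f, hashtable_capacity);
}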
diff --git a/yoso/fast_lsh_cumulation_torch.cpp b/yoso/fast_lsh_cumulation_torch.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..82a1e08ec5a2af4407e4d85ec399215340ed76d4
--- /dev/null
+++ b/yoso/fast_lsh_cumulation_torch.cpp
@@ -0,0 +1,122 @@
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include "fast_lsh_cumulation.h"
+#include "common_cuda.h"
+#include <vector>
+
+std::vector<at::Tensor> fast_hash(
+ at::Tensor query_mask,
+ at::Tensor query_vector,
+ at::Tensor key_mask,
+ at::Tensor key_vector,
+ int num_hash_f,
+ int hash_code_len,
+ bool use_cuda,
+ int version
+) {
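+ // Note: the version argument is accepted for API symmetry but ignored;
+ // ver1 is the only fast_hash implementation.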
+ return fast_hash_ver1_kernel(
+ query_mask,
+ query_vector,
+ key_mask,
+ key_vector,
+ num_hash_f,
+ hash_code_len,
+ use_cuda
+ );
+}
+
+at::Tensor lsh_cumulation(
+ at::Tensor query_mask, // [batch_size, num_query]
+ at::Tensor query_hash_code, // [batch_size, num_query, num_hash_f]
+ at::Tensor key_mask, // [batch_size, num_key]
+ at::Tensor key_hash_code, // [batch_size, num_key, num_hash_f]
+ at::Tensor value, // [batch_size, num_key, value_dim]
+ int hashtable_capacity,
+ bool use_cuda,
+ int version
+) {
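+ // As with fast_hash, version is ignored: ver1 is the only unweighted
+ // cumulation kernel.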
+ return lsh_cumulation_ver1_kernel(
+ query_mask,
+ query_hash_code,
+ key_mask,
+ key_hash_code,
+ value,
+ hashtable_capacity,
+ use_cuda
+ );
+}
+
+at::Tensor lsh_weighted_cumulation(
+ at::Tensor query_mask, // [batch_size, num_query]
+ at::Tensor query_hash_code, // [batch_size, num_query, num_hash_f]
+ at::Tensor query_weight, // [batch_size, num_query, weight_dim]
+ at::Tensor key_mask, // [batch_size, num_key]
+ at::Tensor key_hash_code, // [batch_size, num_key, num_hash_f]
+ at::Tensor key_weight, // [batch_size, num_key, weight_dim]
+ at::Tensor value, // [batch_size, num_key, value_dim]
+ int hashtable_capacity,
+ bool use_cuda,
+ int version
+) {
+ if (version == 1) {
+ return lsh_weighted_cumulation_ver1_kernel(
+ query_mask,
+ query_hash_code,
+ query_weight,
+ key_mask,
+ key_hash_code,
+ key_weight,
+ value,
+ hashtable_capacity,
+ use_cuda
+ );
+ } else if (version == 2) {
+ return lsh_weighted_cumulation_ver2_kernel(
+ query_mask,
+ query_hash_code,
+ query_weight,
+ key_mask,
+ key_hash_code,
+ key_weight,
+ value,
+ hashtable_capacity,
+ use_cuda
+ );
+ } else if (version == 3) {
+ return lsh_weighted_cumulation_ver3_kernel(
+ query_mask,
+ query_hash_code,
+ query_weight,
+ key_mask,
+ key_hash_code,
+ key_weight,
+ value,
+ hashtable_capacity,
+ use_cuda
+ );
+ } else if (version == 4) {
+ return lsh_weighted_cumulation_ver4_kernel(
+ query_mask,
+ query_hash_code,
+ query_weight,
+ key_mask,
+ key_hash_code,
+ key_weight,
+ value,
+ hashtable_capacity,
+ use_cuda
+ );
+ } else {
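+ // Any unrecognized version falls back to ver3.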
+ return lsh_weighted_cumulation_ver3_kernel(
+ query_mask,
+ query_hash_code,
+ query_weight,
+ key_mask,
+ key_hash_code,
+ key_weight,
+ value,
+ hashtable_capacity,
+ use_cuda
+ );
+ }
+}
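
This translation unit defines only the three C++ dispatchers; no PYBIND11_MODULE block appears in the file, so the Python-facing bindings are presumably registered in a separate source file of the extension. Purely for illustration, a hypothetical binding block using the standard PyTorch extension mechanism might look like this (the docstrings are placeholders, not from the patch):

#include <torch/extension.h>

// Hypothetical registration of the dispatchers defined above; the real
// extension may register them elsewhere and under different names.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("fast_hash", &fast_hash, "LSH hashing of query/key vectors (CUDA)");
  m.def("lsh_cumulation", &lsh_cumulation, "Unweighted LSH cumulation (CUDA)");
  m.def("lsh_weighted_cumulation", &lsh_weighted_cumulation,
        "Weighted LSH cumulation, versions 1-4 (CUDA)");
}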