Kernels
danieldk HF Staff committed on
Commit
1c9cbe1
·
verified ·
1 Parent(s): 645c13c

Build uploaded using `kernels`.

Browse files
Files changed (18) hide show
  1. build/torch210-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
  2. build/torch210-cxx11-cu126-aarch64-linux/_ops.py +3 -3
  3. build/torch210-cxx11-cu126-aarch64-linux/flash_attn_interface.py +5 -1
  4. build/torch210-cxx11-cu128-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
  5. build/torch210-cxx11-cu128-aarch64-linux/_ops.py +3 -3
  6. build/torch210-cxx11-cu128-aarch64-linux/flash_attn_interface.py +5 -1
  7. build/torch210-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
  8. build/torch210-cxx11-cu130-aarch64-linux/_ops.py +3 -3
  9. build/torch210-cxx11-cu130-aarch64-linux/flash_attn_interface.py +5 -1
  10. build/torch29-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
  11. build/torch29-cxx11-cu126-aarch64-linux/_ops.py +3 -3
  12. build/torch29-cxx11-cu126-aarch64-linux/flash_attn_interface.py +5 -1
  13. build/torch29-cxx11-cu128-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
  14. build/torch29-cxx11-cu128-aarch64-linux/_ops.py +3 -3
  15. build/torch29-cxx11-cu128-aarch64-linux/flash_attn_interface.py +5 -1
  16. build/torch29-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
  17. build/torch29-cxx11-cu130-aarch64-linux/_ops.py +3 -3
  18. build/torch29-cxx11-cu130-aarch64-linux/flash_attn_interface.py +5 -1
build/torch210-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7507abaa188e0334b03057f4fd4a5a50fe8378765edd644d0867fed70444403
3
  size 448533496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ae4aff3e31a85f7126ce66f329308221464637b4aa3bb4f0db454747509f913
3
  size 448533496
build/torch210-cxx11-cu126-aarch64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_9f0ed09
3
- ops = torch.ops._flash_attn2_cuda_9f0ed09
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_9f0ed09::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_abda3a0
3
+ ops = torch.ops._flash_attn2_cuda_abda3a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_abda3a0::{op_name}"
build/torch210-cxx11-cu126-aarch64-linux/flash_attn_interface.py CHANGED
@@ -33,8 +33,12 @@ def _get_device():
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
- # This should match the block sizes in the CUDA kernel
37
  assert head_dim <= 256
 
 
 
 
 
38
  major, minor = torch.cuda.get_device_capability(device)
39
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
40
  is_sm80 = major == 8 and minor == 0
 
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
 
36
  assert head_dim <= 256
37
+
38
+ if device.type == "xpu":
39
+ return 64
40
+
41
+ # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
43
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
44
  is_sm80 = major == 8 and minor == 0
build/torch210-cxx11-cu128-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58331e99449068ce623bb0d837d6a761092278872c0ffcde29691794b554d5da
3
  size 1037990944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b00f987bd969cfdfc618e12a09631d6ad7dd067f9f3d3b346836bd3e205af5a4
3
  size 1037990944
build/torch210-cxx11-cu128-aarch64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_9f0ed09
3
- ops = torch.ops._flash_attn2_cuda_9f0ed09
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_9f0ed09::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_abda3a0
3
+ ops = torch.ops._flash_attn2_cuda_abda3a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_abda3a0::{op_name}"
build/torch210-cxx11-cu128-aarch64-linux/flash_attn_interface.py CHANGED
@@ -33,8 +33,12 @@ def _get_device():
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
- # This should match the block sizes in the CUDA kernel
37
  assert head_dim <= 256
 
 
 
 
 
38
  major, minor = torch.cuda.get_device_capability(device)
39
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
40
  is_sm80 = major == 8 and minor == 0
 
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
 
36
  assert head_dim <= 256
37
+
38
+ if device.type == "xpu":
39
+ return 64
40
+
41
+ # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
43
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
44
  is_sm80 = major == 8 and minor == 0
build/torch210-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b6f8697387e3cc2d5a17e6a9ce0da37e58d6b70193a0f2e9623ab5fbcd3cea4
3
  size 1008644344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fed0b327e31440e1e9b53a58733d699df86d4c54b22b2c949d6b5815a643aca7
3
  size 1008644344
build/torch210-cxx11-cu130-aarch64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_9f0ed09
3
- ops = torch.ops._flash_attn2_cuda_9f0ed09
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_9f0ed09::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_abda3a0
3
+ ops = torch.ops._flash_attn2_cuda_abda3a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_abda3a0::{op_name}"
build/torch210-cxx11-cu130-aarch64-linux/flash_attn_interface.py CHANGED
@@ -33,8 +33,12 @@ def _get_device():
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
- # This should match the block sizes in the CUDA kernel
37
  assert head_dim <= 256
 
 
 
 
 
38
  major, minor = torch.cuda.get_device_capability(device)
39
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
40
  is_sm80 = major == 8 and minor == 0
 
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
 
36
  assert head_dim <= 256
37
+
38
+ if device.type == "xpu":
39
+ return 64
40
+
41
+ # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
43
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
44
  is_sm80 = major == 8 and minor == 0
build/torch29-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce0ad12fec47de04a14310237b258c8f756ba2645ec91fbcc63c7d289f64b2ac
3
  size 448464272
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:597f8c8e32518898fd0439b8e9341474d536692e0d68e3083501bbf0fb35cf6f
3
  size 448464272
build/torch29-cxx11-cu126-aarch64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_9f0ed09
3
- ops = torch.ops._flash_attn2_cuda_9f0ed09
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_9f0ed09::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_abda3a0
3
+ ops = torch.ops._flash_attn2_cuda_abda3a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_abda3a0::{op_name}"
build/torch29-cxx11-cu126-aarch64-linux/flash_attn_interface.py CHANGED
@@ -33,8 +33,12 @@ def _get_device():
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
- # This should match the block sizes in the CUDA kernel
37
  assert head_dim <= 256
 
 
 
 
 
38
  major, minor = torch.cuda.get_device_capability(device)
39
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
40
  is_sm80 = major == 8 and minor == 0
 
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
 
36
  assert head_dim <= 256
37
+
38
+ if device.type == "xpu":
39
+ return 64
40
+
41
+ # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
43
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
44
  is_sm80 = major == 8 and minor == 0
build/torch29-cxx11-cu128-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:faa4e7f179cc4e8c14ef3d3471458f9563072d21b79707c238227adeffe34ad0
3
  size 1037856008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79fbcbef35dfc8d2c35e0e4c7587ffa332ae32c8c98d9727cfa376b51ad644b2
3
  size 1037856008
build/torch29-cxx11-cu128-aarch64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_9f0ed09
3
- ops = torch.ops._flash_attn2_cuda_9f0ed09
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_9f0ed09::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_abda3a0
3
+ ops = torch.ops._flash_attn2_cuda_abda3a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_abda3a0::{op_name}"
build/torch29-cxx11-cu128-aarch64-linux/flash_attn_interface.py CHANGED
@@ -33,8 +33,12 @@ def _get_device():
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
- # This should match the block sizes in the CUDA kernel
37
  assert head_dim <= 256
 
 
 
 
 
38
  major, minor = torch.cuda.get_device_capability(device)
39
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
40
  is_sm80 = major == 8 and minor == 0
 
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
 
36
  assert head_dim <= 256
37
+
38
+ if device.type == "xpu":
39
+ return 64
40
+
41
+ # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
43
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
44
  is_sm80 = major == 8 and minor == 0
build/torch29-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a69d566b56938ac50d625d504fea3c8c781598e4ebb3908b80397879b3cde372
3
  size 1008640480
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8feba56f589a7bd5d90cd68d62668029e2d06f8918c85cefd0112caae3c92c04
3
  size 1008640480
build/torch29-cxx11-cu130-aarch64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_9f0ed09
3
- ops = torch.ops._flash_attn2_cuda_9f0ed09
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_9f0ed09::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_abda3a0
3
+ ops = torch.ops._flash_attn2_cuda_abda3a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_abda3a0::{op_name}"
build/torch29-cxx11-cu130-aarch64-linux/flash_attn_interface.py CHANGED
@@ -33,8 +33,12 @@ def _get_device():
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
- # This should match the block sizes in the CUDA kernel
37
  assert head_dim <= 256
 
 
 
 
 
38
  major, minor = torch.cuda.get_device_capability(device)
39
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
40
  is_sm80 = major == 8 and minor == 0
 
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
 
36
  assert head_dim <= 256
37
+
38
+ if device.type == "xpu":
39
+ return 64
40
+
41
+ # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
43
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
44
  is_sm80 = major == 8 and minor == 0