Kernels
danieldk HF Staff committed on
Commit
5222e7a
·
verified ·
1 Parent(s): 1c9cbe1

Build uploaded using `kernels`.

Browse files
Files changed (30) hide show
  1. build/torch210-cxx11-cpu-x86_64-linux/{_flash_attn2_cpu_9f0ed09.abi3.so → _flash_attn2_cpu_abda3a0.abi3.so} +1 -1
  2. build/torch210-cxx11-cpu-x86_64-linux/_ops.py +3 -3
  3. build/torch210-cxx11-cpu-x86_64-linux/flash_attn_interface.py +5 -1
  4. build/torch210-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
  5. build/torch210-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  6. build/torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py +5 -1
  7. build/torch210-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
  8. build/torch210-cxx11-cu128-x86_64-linux/_ops.py +3 -3
  9. build/torch210-cxx11-cu128-x86_64-linux/flash_attn_interface.py +5 -1
  10. build/torch210-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
  11. build/torch210-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  12. build/torch210-cxx11-cu130-x86_64-linux/flash_attn_interface.py +5 -1
  13. build/{torch29-cxx11-xpu20252-x86_64-linux/_flash_attn2_xpu_9f0ed09.abi3.so → torch210-cxx11-xpu20253-x86_64-linux/_flash_attn2_xpu_abda3a0.abi3.so} +2 -2
  14. build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py +3 -3
  15. build/torch210-cxx11-xpu20253-x86_64-linux/flash_attn_interface.py +5 -1
  16. build/torch29-cxx11-cpu-x86_64-linux/{_flash_attn2_cpu_9f0ed09.abi3.so → _flash_attn2_cpu_abda3a0.abi3.so} +1 -1
  17. build/torch29-cxx11-cpu-x86_64-linux/_ops.py +3 -3
  18. build/torch29-cxx11-cpu-x86_64-linux/flash_attn_interface.py +5 -1
  19. build/torch29-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
  20. build/torch29-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  21. build/torch29-cxx11-cu126-x86_64-linux/flash_attn_interface.py +5 -1
  22. build/torch29-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
  23. build/torch29-cxx11-cu128-x86_64-linux/_ops.py +3 -3
  24. build/torch29-cxx11-cu128-x86_64-linux/flash_attn_interface.py +5 -1
  25. build/torch29-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
  26. build/torch29-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  27. build/torch29-cxx11-cu130-x86_64-linux/flash_attn_interface.py +5 -1
  28. build/{torch210-cxx11-xpu20253-x86_64-linux/_flash_attn2_xpu_9f0ed09.abi3.so → torch29-cxx11-xpu20252-x86_64-linux/_flash_attn2_xpu_abda3a0.abi3.so} +2 -2
  29. build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py +3 -3
  30. build/torch29-cxx11-xpu20252-x86_64-linux/flash_attn_interface.py +5 -1
build/torch210-cxx11-cpu-x86_64-linux/{_flash_attn2_cpu_9f0ed09.abi3.so → _flash_attn2_cpu_abda3a0.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45757a80c809dcbf1a8c75bde0c42dcba171960901f1c085699036d908a81c20
3
  size 1942496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98686f498c9fba8e4ff02197c83bbe4222be43dec5268ea3f4a1b51b50eee64a
3
  size 1942496
build/torch210-cxx11-cpu-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cpu_9f0ed09
3
- ops = torch.ops._flash_attn2_cpu_9f0ed09
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cpu_9f0ed09::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cpu_abda3a0
3
+ ops = torch.ops._flash_attn2_cpu_abda3a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cpu_abda3a0::{op_name}"
build/torch210-cxx11-cpu-x86_64-linux/flash_attn_interface.py CHANGED
@@ -33,8 +33,12 @@ def _get_device():
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
- # This should match the block sizes in the CUDA kernel
37
  assert head_dim <= 256
 
 
 
 
 
38
  major, minor = torch.cuda.get_device_capability(device)
39
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
40
  is_sm80 = major == 8 and minor == 0
 
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
 
36
  assert head_dim <= 256
37
+
38
+ if device.type == "xpu":
39
+ return 64
40
+
41
+ # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
43
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
44
  is_sm80 = major == 8 and minor == 0
build/torch210-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:039e336f68c3efaa02ef0b103a2607b2a544d68a46fa0165e2433464f26223a3
3
  size 448709016
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d009f413f2eb3e85146168654e36cbc61c3733856b03bd743285b63e4084b713
3
  size 448709016
build/torch210-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_9f0ed09
3
- ops = torch.ops._flash_attn2_cuda_9f0ed09
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_9f0ed09::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_abda3a0
3
+ ops = torch.ops._flash_attn2_cuda_abda3a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_abda3a0::{op_name}"
build/torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py CHANGED
@@ -33,8 +33,12 @@ def _get_device():
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
- # This should match the block sizes in the CUDA kernel
37
  assert head_dim <= 256
 
 
 
 
 
38
  major, minor = torch.cuda.get_device_capability(device)
39
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
40
  is_sm80 = major == 8 and minor == 0
 
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
 
36
  assert head_dim <= 256
37
+
38
+ if device.type == "xpu":
39
+ return 64
40
+
41
+ # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
43
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
44
  is_sm80 = major == 8 and minor == 0
build/torch210-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac175faf91cfb9cd9827985bae32380035cb9f880f0bbb702e7f045eee90ae0a
3
  size 1037803408
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:109d07ab749761b297b27783dd9ff702dedacd2b7cdb7a57a67e2cf7ac574516
3
  size 1037803408
build/torch210-cxx11-cu128-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_9f0ed09
3
- ops = torch.ops._flash_attn2_cuda_9f0ed09
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_9f0ed09::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_abda3a0
3
+ ops = torch.ops._flash_attn2_cuda_abda3a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_abda3a0::{op_name}"
build/torch210-cxx11-cu128-x86_64-linux/flash_attn_interface.py CHANGED
@@ -33,8 +33,12 @@ def _get_device():
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
- # This should match the block sizes in the CUDA kernel
37
  assert head_dim <= 256
 
 
 
 
 
38
  major, minor = torch.cuda.get_device_capability(device)
39
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
40
  is_sm80 = major == 8 and minor == 0
 
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
 
36
  assert head_dim <= 256
37
+
38
+ if device.type == "xpu":
39
+ return 64
40
+
41
+ # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
43
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
44
  is_sm80 = major == 8 and minor == 0
build/torch210-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e67e00b79ea3625b0ec32a083544d1808cce682dfe593a7212c525a292d1764f
3
  size 1009055088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7d4d237f20ea46768cde96cc0c982809482b2f874aeb0678d4139d9d8663664
3
  size 1009055088
build/torch210-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_9f0ed09
3
- ops = torch.ops._flash_attn2_cuda_9f0ed09
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_9f0ed09::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_abda3a0
3
+ ops = torch.ops._flash_attn2_cuda_abda3a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_abda3a0::{op_name}"
build/torch210-cxx11-cu130-x86_64-linux/flash_attn_interface.py CHANGED
@@ -33,8 +33,12 @@ def _get_device():
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
- # This should match the block sizes in the CUDA kernel
37
  assert head_dim <= 256
 
 
 
 
 
38
  major, minor = torch.cuda.get_device_capability(device)
39
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
40
  is_sm80 = major == 8 and minor == 0
 
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
 
36
  assert head_dim <= 256
37
+
38
+ if device.type == "xpu":
39
+ return 64
40
+
41
+ # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
43
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
44
  is_sm80 = major == 8 and minor == 0
build/{torch29-cxx11-xpu20252-x86_64-linux/_flash_attn2_xpu_9f0ed09.abi3.so → torch210-cxx11-xpu20253-x86_64-linux/_flash_attn2_xpu_abda3a0.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:262841040bd11d2ea11f317107fdc9484d864db0377b423eb9007f4c8a7eb74f
3
- size 13923672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4fa4fb6b42430c26695ce71f24cfd3562287538e838bd4fd951d1fc238f0a29
3
+ size 15742736
build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_xpu_9f0ed09
3
- ops = torch.ops._flash_attn2_xpu_9f0ed09
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_xpu_9f0ed09::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_xpu_abda3a0
3
+ ops = torch.ops._flash_attn2_xpu_abda3a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_xpu_abda3a0::{op_name}"
build/torch210-cxx11-xpu20253-x86_64-linux/flash_attn_interface.py CHANGED
@@ -33,8 +33,12 @@ def _get_device():
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
- # This should match the block sizes in the CUDA kernel
37
  assert head_dim <= 256
 
 
 
 
 
38
  major, minor = torch.cuda.get_device_capability(device)
39
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
40
  is_sm80 = major == 8 and minor == 0
 
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
 
36
  assert head_dim <= 256
37
+
38
+ if device.type == "xpu":
39
+ return 64
40
+
41
+ # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
43
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
44
  is_sm80 = major == 8 and minor == 0
build/torch29-cxx11-cpu-x86_64-linux/{_flash_attn2_cpu_9f0ed09.abi3.so → _flash_attn2_cpu_abda3a0.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7313920d802d0477ca3d4144bc3b11e3c4761ea0e42b55a9b1b0b05567d23f71
3
  size 1932200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f48a5de0256294c0baf17e17ff3eafa7c818a6b70c869d6eec74cdff07b994f1
3
  size 1932200
build/torch29-cxx11-cpu-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cpu_9f0ed09
3
- ops = torch.ops._flash_attn2_cpu_9f0ed09
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cpu_9f0ed09::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cpu_abda3a0
3
+ ops = torch.ops._flash_attn2_cpu_abda3a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cpu_abda3a0::{op_name}"
build/torch29-cxx11-cpu-x86_64-linux/flash_attn_interface.py CHANGED
@@ -33,8 +33,12 @@ def _get_device():
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
- # This should match the block sizes in the CUDA kernel
37
  assert head_dim <= 256
 
 
 
 
 
38
  major, minor = torch.cuda.get_device_capability(device)
39
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
40
  is_sm80 = major == 8 and minor == 0
 
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
 
36
  assert head_dim <= 256
37
+
38
+ if device.type == "xpu":
39
+ return 64
40
+
41
+ # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
43
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
44
  is_sm80 = major == 8 and minor == 0
build/torch29-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d422421456bf5ac34486ee898a7a6aaea7fff2edda3bce062d0283f69806275
3
  size 448648752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cb31caa7d395686694813cff9742e6a04c492af9bdebf80fd043bb98410d6d2
3
  size 448648752
build/torch29-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_9f0ed09
3
- ops = torch.ops._flash_attn2_cuda_9f0ed09
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_9f0ed09::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_abda3a0
3
+ ops = torch.ops._flash_attn2_cuda_abda3a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_abda3a0::{op_name}"
build/torch29-cxx11-cu126-x86_64-linux/flash_attn_interface.py CHANGED
@@ -33,8 +33,12 @@ def _get_device():
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
- # This should match the block sizes in the CUDA kernel
37
  assert head_dim <= 256
 
 
 
 
 
38
  major, minor = torch.cuda.get_device_capability(device)
39
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
40
  is_sm80 = major == 8 and minor == 0
 
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
 
36
  assert head_dim <= 256
37
+
38
+ if device.type == "xpu":
39
+ return 64
40
+
41
+ # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
43
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
44
  is_sm80 = major == 8 and minor == 0
build/torch29-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29b22b8c1bbd77125b6c82aca5fecfe0416d2f116be7b1e1a4638f76fe542a2e
3
  size 1037644632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d58e62ead713b97bf86390e13e331edb26dd00d83c34ac7e9a5e95968493fa11
3
  size 1037644632
build/torch29-cxx11-cu128-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_9f0ed09
3
- ops = torch.ops._flash_attn2_cuda_9f0ed09
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_9f0ed09::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_abda3a0
3
+ ops = torch.ops._flash_attn2_cuda_abda3a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_abda3a0::{op_name}"
build/torch29-cxx11-cu128-x86_64-linux/flash_attn_interface.py CHANGED
@@ -33,8 +33,12 @@ def _get_device():
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
- # This should match the block sizes in the CUDA kernel
37
  assert head_dim <= 256
 
 
 
 
 
38
  major, minor = torch.cuda.get_device_capability(device)
39
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
40
  is_sm80 = major == 8 and minor == 0
 
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
 
36
  assert head_dim <= 256
37
+
38
+ if device.type == "xpu":
39
+ return 64
40
+
41
+ # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
43
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
44
  is_sm80 = major == 8 and minor == 0
build/torch29-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9e000e77b2d5f5b8554c1ba0e1edfc173bd19d904b95eede3f9cc7ecefbcf89
3
  size 1009019192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52ec533d0801a9757a28bbe0531535a4386d84058a4eb8a2b6a721040eb5fb61
3
  size 1009019192
build/torch29-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_cuda_9f0ed09
3
- ops = torch.ops._flash_attn2_cuda_9f0ed09
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_cuda_9f0ed09::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_cuda_abda3a0
3
+ ops = torch.ops._flash_attn2_cuda_abda3a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_cuda_abda3a0::{op_name}"
build/torch29-cxx11-cu130-x86_64-linux/flash_attn_interface.py CHANGED
@@ -33,8 +33,12 @@ def _get_device():
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
- # This should match the block sizes in the CUDA kernel
37
  assert head_dim <= 256
 
 
 
 
 
38
  major, minor = torch.cuda.get_device_capability(device)
39
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
40
  is_sm80 = major == 8 and minor == 0
 
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
 
36
  assert head_dim <= 256
37
+
38
+ if device.type == "xpu":
39
+ return 64
40
+
41
+ # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
43
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
44
  is_sm80 = major == 8 and minor == 0
build/{torch210-cxx11-xpu20253-x86_64-linux/_flash_attn2_xpu_9f0ed09.abi3.so → torch29-cxx11-xpu20252-x86_64-linux/_flash_attn2_xpu_abda3a0.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:192cfb14df060c1fd913c7a6e5a588da44dec07d0a9adf156ed377d860c8d3c2
3
- size 15436096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a96bf55da7a5422a5ca6f06e2deb147962be6d78db5e9569541e8257460ee19
3
+ size 14205096
build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _flash_attn2_xpu_9f0ed09
3
- ops = torch.ops._flash_attn2_xpu_9f0ed09
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_flash_attn2_xpu_9f0ed09::{op_name}"
 
1
  import torch
2
+ from . import _flash_attn2_xpu_abda3a0
3
+ ops = torch.ops._flash_attn2_xpu_abda3a0
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_flash_attn2_xpu_abda3a0::{op_name}"
build/torch29-cxx11-xpu20252-x86_64-linux/flash_attn_interface.py CHANGED
@@ -33,8 +33,12 @@ def _get_device():
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
36
- # This should match the block sizes in the CUDA kernel
37
  assert head_dim <= 256
 
 
 
 
 
38
  major, minor = torch.cuda.get_device_capability(device)
39
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
40
  is_sm80 = major == 8 and minor == 0
 
33
 
34
 
35
  def _get_block_size_n(device, head_dim, is_dropout, is_causal):
 
36
  assert head_dim <= 256
37
+
38
+ if device.type == "xpu":
39
+ return 64
40
+
41
+ # This should match the block sizes in the CUDA kernel
42
  major, minor = torch.cuda.get_device_capability(device)
43
  is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
44
  is_sm80 = major == 8 and minor == 0