Build uploaded using `kernels`.
Browse files
- build/torch210-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
- build/torch210-cxx11-cu126-aarch64-linux/_ops.py +3 -3
- build/torch210-cxx11-cu126-aarch64-linux/flash_attn_interface.py +5 -1
- build/torch210-cxx11-cu128-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
- build/torch210-cxx11-cu128-aarch64-linux/_ops.py +3 -3
- build/torch210-cxx11-cu128-aarch64-linux/flash_attn_interface.py +5 -1
- build/torch210-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
- build/torch210-cxx11-cu130-aarch64-linux/_ops.py +3 -3
- build/torch210-cxx11-cu130-aarch64-linux/flash_attn_interface.py +5 -1
- build/torch29-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
- build/torch29-cxx11-cu126-aarch64-linux/_ops.py +3 -3
- build/torch29-cxx11-cu126-aarch64-linux/flash_attn_interface.py +5 -1
- build/torch29-cxx11-cu128-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
- build/torch29-cxx11-cu128-aarch64-linux/_ops.py +3 -3
- build/torch29-cxx11-cu128-aarch64-linux/flash_attn_interface.py +5 -1
- build/torch29-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
- build/torch29-cxx11-cu130-aarch64-linux/_ops.py +3 -3
- build/torch29-cxx11-cu130-aarch64-linux/flash_attn_interface.py +5 -1
build/torch210-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 448533496
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4ae4aff3e31a85f7126ce66f329308221464637b4aa3bb4f0db454747509f913
|
| 3 |
size 448533496
|
build/torch210-cxx11-cu126-aarch64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import _flash_attn2_cuda_9f0ed09
|
| 3 |
-
ops = torch.ops._flash_attn2_cuda_9f0ed09
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"_flash_attn2_cuda_9f0ed09::{op_name}"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _flash_attn2_cuda_abda3a0
|
| 3 |
+
ops = torch.ops._flash_attn2_cuda_abda3a0
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_flash_attn2_cuda_abda3a0::{op_name}"
|
build/torch210-cxx11-cu126-aarch64-linux/flash_attn_interface.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_device():
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
| 36 |
-
# This should match the block sizes in the CUDA kernel
|
| 37 |
assert head_dim <= 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
major, minor = torch.cuda.get_device_capability(device)
|
| 39 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 40 |
is_sm80 = major == 8 and minor == 0
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
|
|
|
| 36 |
assert head_dim <= 256
|
| 37 |
+
|
| 38 |
+
if device.type == "xpu":
|
| 39 |
+
return 64
|
| 40 |
+
|
| 41 |
+
# This should match the block sizes in the CUDA kernel
|
| 42 |
major, minor = torch.cuda.get_device_capability(device)
|
| 43 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 44 |
is_sm80 = major == 8 and minor == 0
|
build/torch210-cxx11-cu128-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1037990944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b00f987bd969cfdfc618e12a09631d6ad7dd067f9f3d3b346836bd3e205af5a4
|
| 3 |
size 1037990944
|
build/torch210-cxx11-cu128-aarch64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import _flash_attn2_cuda_9f0ed09
|
| 3 |
-
ops = torch.ops._flash_attn2_cuda_9f0ed09
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"_flash_attn2_cuda_9f0ed09::{op_name}"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _flash_attn2_cuda_abda3a0
|
| 3 |
+
ops = torch.ops._flash_attn2_cuda_abda3a0
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_flash_attn2_cuda_abda3a0::{op_name}"
|
build/torch210-cxx11-cu128-aarch64-linux/flash_attn_interface.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_device():
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
| 36 |
-
# This should match the block sizes in the CUDA kernel
|
| 37 |
assert head_dim <= 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
major, minor = torch.cuda.get_device_capability(device)
|
| 39 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 40 |
is_sm80 = major == 8 and minor == 0
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
|
|
|
| 36 |
assert head_dim <= 256
|
| 37 |
+
|
| 38 |
+
if device.type == "xpu":
|
| 39 |
+
return 64
|
| 40 |
+
|
| 41 |
+
# This should match the block sizes in the CUDA kernel
|
| 42 |
major, minor = torch.cuda.get_device_capability(device)
|
| 43 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 44 |
is_sm80 = major == 8 and minor == 0
|
build/torch210-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1008644344
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fed0b327e31440e1e9b53a58733d699df86d4c54b22b2c949d6b5815a643aca7
|
| 3 |
size 1008644344
|
build/torch210-cxx11-cu130-aarch64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import _flash_attn2_cuda_9f0ed09
|
| 3 |
-
ops = torch.ops._flash_attn2_cuda_9f0ed09
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"_flash_attn2_cuda_9f0ed09::{op_name}"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _flash_attn2_cuda_abda3a0
|
| 3 |
+
ops = torch.ops._flash_attn2_cuda_abda3a0
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_flash_attn2_cuda_abda3a0::{op_name}"
|
build/torch210-cxx11-cu130-aarch64-linux/flash_attn_interface.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_device():
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
| 36 |
-
# This should match the block sizes in the CUDA kernel
|
| 37 |
assert head_dim <= 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
major, minor = torch.cuda.get_device_capability(device)
|
| 39 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 40 |
is_sm80 = major == 8 and minor == 0
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
|
|
|
| 36 |
assert head_dim <= 256
|
| 37 |
+
|
| 38 |
+
if device.type == "xpu":
|
| 39 |
+
return 64
|
| 40 |
+
|
| 41 |
+
# This should match the block sizes in the CUDA kernel
|
| 42 |
major, minor = torch.cuda.get_device_capability(device)
|
| 43 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 44 |
is_sm80 = major == 8 and minor == 0
|
build/torch29-cxx11-cu126-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 448464272
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:597f8c8e32518898fd0439b8e9341474d536692e0d68e3083501bbf0fb35cf6f
|
| 3 |
size 448464272
|
build/torch29-cxx11-cu126-aarch64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import _flash_attn2_cuda_9f0ed09
|
| 3 |
-
ops = torch.ops._flash_attn2_cuda_9f0ed09
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"_flash_attn2_cuda_9f0ed09::{op_name}"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _flash_attn2_cuda_abda3a0
|
| 3 |
+
ops = torch.ops._flash_attn2_cuda_abda3a0
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_flash_attn2_cuda_abda3a0::{op_name}"
|
build/torch29-cxx11-cu126-aarch64-linux/flash_attn_interface.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_device():
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
| 36 |
-
# This should match the block sizes in the CUDA kernel
|
| 37 |
assert head_dim <= 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
major, minor = torch.cuda.get_device_capability(device)
|
| 39 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 40 |
is_sm80 = major == 8 and minor == 0
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
|
|
|
| 36 |
assert head_dim <= 256
|
| 37 |
+
|
| 38 |
+
if device.type == "xpu":
|
| 39 |
+
return 64
|
| 40 |
+
|
| 41 |
+
# This should match the block sizes in the CUDA kernel
|
| 42 |
major, minor = torch.cuda.get_device_capability(device)
|
| 43 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 44 |
is_sm80 = major == 8 and minor == 0
|
build/torch29-cxx11-cu128-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1037856008
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:79fbcbef35dfc8d2c35e0e4c7587ffa332ae32c8c98d9727cfa376b51ad644b2
|
| 3 |
size 1037856008
|
build/torch29-cxx11-cu128-aarch64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import _flash_attn2_cuda_9f0ed09
|
| 3 |
-
ops = torch.ops._flash_attn2_cuda_9f0ed09
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"_flash_attn2_cuda_9f0ed09::{op_name}"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _flash_attn2_cuda_abda3a0
|
| 3 |
+
ops = torch.ops._flash_attn2_cuda_abda3a0
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_flash_attn2_cuda_abda3a0::{op_name}"
|
build/torch29-cxx11-cu128-aarch64-linux/flash_attn_interface.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_device():
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
| 36 |
-
# This should match the block sizes in the CUDA kernel
|
| 37 |
assert head_dim <= 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
major, minor = torch.cuda.get_device_capability(device)
|
| 39 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 40 |
is_sm80 = major == 8 and minor == 0
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
|
|
|
| 36 |
assert head_dim <= 256
|
| 37 |
+
|
| 38 |
+
if device.type == "xpu":
|
| 39 |
+
return 64
|
| 40 |
+
|
| 41 |
+
# This should match the block sizes in the CUDA kernel
|
| 42 |
major, minor = torch.cuda.get_device_capability(device)
|
| 43 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 44 |
is_sm80 = major == 8 and minor == 0
|
build/torch29-cxx11-cu130-aarch64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1008640480
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8feba56f589a7bd5d90cd68d62668029e2d06f8918c85cefd0112caae3c92c04
|
| 3 |
size 1008640480
|
build/torch29-cxx11-cu130-aarch64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import _flash_attn2_cuda_9f0ed09
|
| 3 |
-
ops = torch.ops._flash_attn2_cuda_9f0ed09
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"_flash_attn2_cuda_9f0ed09::{op_name}"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _flash_attn2_cuda_abda3a0
|
| 3 |
+
ops = torch.ops._flash_attn2_cuda_abda3a0
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_flash_attn2_cuda_abda3a0::{op_name}"
|
build/torch29-cxx11-cu130-aarch64-linux/flash_attn_interface.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_device():
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
| 36 |
-
# This should match the block sizes in the CUDA kernel
|
| 37 |
assert head_dim <= 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
major, minor = torch.cuda.get_device_capability(device)
|
| 39 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 40 |
is_sm80 = major == 8 and minor == 0
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
|
|
|
| 36 |
assert head_dim <= 256
|
| 37 |
+
|
| 38 |
+
if device.type == "xpu":
|
| 39 |
+
return 64
|
| 40 |
+
|
| 41 |
+
# This should match the block sizes in the CUDA kernel
|
| 42 |
major, minor = torch.cuda.get_device_capability(device)
|
| 43 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 44 |
is_sm80 = major == 8 and minor == 0
|