Build uploaded using `kernels`.
Browse files
- build/torch210-cxx11-cpu-x86_64-linux/{_flash_attn2_cpu_9f0ed09.abi3.so → _flash_attn2_cpu_abda3a0.abi3.so} +1 -1
- build/torch210-cxx11-cpu-x86_64-linux/_ops.py +3 -3
- build/torch210-cxx11-cpu-x86_64-linux/flash_attn_interface.py +5 -1
- build/torch210-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
- build/torch210-cxx11-cu126-x86_64-linux/_ops.py +3 -3
- build/torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py +5 -1
- build/torch210-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
- build/torch210-cxx11-cu128-x86_64-linux/_ops.py +3 -3
- build/torch210-cxx11-cu128-x86_64-linux/flash_attn_interface.py +5 -1
- build/torch210-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
- build/torch210-cxx11-cu130-x86_64-linux/_ops.py +3 -3
- build/torch210-cxx11-cu130-x86_64-linux/flash_attn_interface.py +5 -1
- build/{torch29-cxx11-xpu20252-x86_64-linux/_flash_attn2_xpu_9f0ed09.abi3.so → torch210-cxx11-xpu20253-x86_64-linux/_flash_attn2_xpu_abda3a0.abi3.so} +2 -2
- build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py +3 -3
- build/torch210-cxx11-xpu20253-x86_64-linux/flash_attn_interface.py +5 -1
- build/torch29-cxx11-cpu-x86_64-linux/{_flash_attn2_cpu_9f0ed09.abi3.so → _flash_attn2_cpu_abda3a0.abi3.so} +1 -1
- build/torch29-cxx11-cpu-x86_64-linux/_ops.py +3 -3
- build/torch29-cxx11-cpu-x86_64-linux/flash_attn_interface.py +5 -1
- build/torch29-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
- build/torch29-cxx11-cu126-x86_64-linux/_ops.py +3 -3
- build/torch29-cxx11-cu126-x86_64-linux/flash_attn_interface.py +5 -1
- build/torch29-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
- build/torch29-cxx11-cu128-x86_64-linux/_ops.py +3 -3
- build/torch29-cxx11-cu128-x86_64-linux/flash_attn_interface.py +5 -1
- build/torch29-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so} +1 -1
- build/torch29-cxx11-cu130-x86_64-linux/_ops.py +3 -3
- build/torch29-cxx11-cu130-x86_64-linux/flash_attn_interface.py +5 -1
- build/{torch210-cxx11-xpu20253-x86_64-linux/_flash_attn2_xpu_9f0ed09.abi3.so → torch29-cxx11-xpu20252-x86_64-linux/_flash_attn2_xpu_abda3a0.abi3.so} +2 -2
- build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py +3 -3
- build/torch29-cxx11-xpu20252-x86_64-linux/flash_attn_interface.py +5 -1
build/torch210-cxx11-cpu-x86_64-linux/{_flash_attn2_cpu_9f0ed09.abi3.so → _flash_attn2_cpu_abda3a0.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1942496
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:98686f498c9fba8e4ff02197c83bbe4222be43dec5268ea3f4a1b51b50eee64a
|
| 3 |
size 1942496
|
build/torch210-cxx11-cpu-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _flash_attn2_cpu_abda3a0
|
| 3 |
+
ops = torch.ops._flash_attn2_cpu_abda3a0
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_flash_attn2_cpu_abda3a0::{op_name}"
|
build/torch210-cxx11-cpu-x86_64-linux/flash_attn_interface.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_device():
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
| 36 |
-
# This should match the block sizes in the CUDA kernel
|
| 37 |
assert head_dim <= 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
major, minor = torch.cuda.get_device_capability(device)
|
| 39 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 40 |
is_sm80 = major == 8 and minor == 0
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
|
|
|
| 36 |
assert head_dim <= 256
|
| 37 |
+
|
| 38 |
+
if device.type == "xpu":
|
| 39 |
+
return 64
|
| 40 |
+
|
| 41 |
+
# This should match the block sizes in the CUDA kernel
|
| 42 |
major, minor = torch.cuda.get_device_capability(device)
|
| 43 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 44 |
is_sm80 = major == 8 and minor == 0
|
build/torch210-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 448709016
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d009f413f2eb3e85146168654e36cbc61c3733856b03bd743285b63e4084b713
|
| 3 |
size 448709016
|
build/torch210-cxx11-cu126-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _flash_attn2_cuda_abda3a0
|
| 3 |
+
ops = torch.ops._flash_attn2_cuda_abda3a0
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_flash_attn2_cuda_abda3a0::{op_name}"
|
build/torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_device():
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
| 36 |
-
# This should match the block sizes in the CUDA kernel
|
| 37 |
assert head_dim <= 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
major, minor = torch.cuda.get_device_capability(device)
|
| 39 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 40 |
is_sm80 = major == 8 and minor == 0
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
|
|
|
| 36 |
assert head_dim <= 256
|
| 37 |
+
|
| 38 |
+
if device.type == "xpu":
|
| 39 |
+
return 64
|
| 40 |
+
|
| 41 |
+
# This should match the block sizes in the CUDA kernel
|
| 42 |
major, minor = torch.cuda.get_device_capability(device)
|
| 43 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 44 |
is_sm80 = major == 8 and minor == 0
|
build/torch210-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1037803408
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:109d07ab749761b297b27783dd9ff702dedacd2b7cdb7a57a67e2cf7ac574516
|
| 3 |
size 1037803408
|
build/torch210-cxx11-cu128-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _flash_attn2_cuda_abda3a0
|
| 3 |
+
ops = torch.ops._flash_attn2_cuda_abda3a0
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_flash_attn2_cuda_abda3a0::{op_name}"
|
build/torch210-cxx11-cu128-x86_64-linux/flash_attn_interface.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_device():
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
| 36 |
-
# This should match the block sizes in the CUDA kernel
|
| 37 |
assert head_dim <= 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
major, minor = torch.cuda.get_device_capability(device)
|
| 39 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 40 |
is_sm80 = major == 8 and minor == 0
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
|
|
|
| 36 |
assert head_dim <= 256
|
| 37 |
+
|
| 38 |
+
if device.type == "xpu":
|
| 39 |
+
return 64
|
| 40 |
+
|
| 41 |
+
# This should match the block sizes in the CUDA kernel
|
| 42 |
major, minor = torch.cuda.get_device_capability(device)
|
| 43 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 44 |
is_sm80 = major == 8 and minor == 0
|
build/torch210-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1009055088
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7d4d237f20ea46768cde96cc0c982809482b2f874aeb0678d4139d9d8663664
|
| 3 |
size 1009055088
|
build/torch210-cxx11-cu130-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _flash_attn2_cuda_abda3a0
|
| 3 |
+
ops = torch.ops._flash_attn2_cuda_abda3a0
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_flash_attn2_cuda_abda3a0::{op_name}"
|
build/torch210-cxx11-cu130-x86_64-linux/flash_attn_interface.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_device():
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
| 36 |
-
# This should match the block sizes in the CUDA kernel
|
| 37 |
assert head_dim <= 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
major, minor = torch.cuda.get_device_capability(device)
|
| 39 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 40 |
is_sm80 = major == 8 and minor == 0
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
|
|
|
| 36 |
assert head_dim <= 256
|
| 37 |
+
|
| 38 |
+
if device.type == "xpu":
|
| 39 |
+
return 64
|
| 40 |
+
|
| 41 |
+
# This should match the block sizes in the CUDA kernel
|
| 42 |
major, minor = torch.cuda.get_device_capability(device)
|
| 43 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 44 |
is_sm80 = major == 8 and minor == 0
|
build/{torch29-cxx11-xpu20252-x86_64-linux/_flash_attn2_xpu_9f0ed09.abi3.so → torch210-cxx11-xpu20253-x86_64-linux/_flash_attn2_xpu_abda3a0.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d4fa4fb6b42430c26695ce71f24cfd3562287538e838bd4fd951d1fc238f0a29
|
| 3 |
+
size 15742736
|
build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _flash_attn2_xpu_abda3a0
|
| 3 |
+
ops = torch.ops._flash_attn2_xpu_abda3a0
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_flash_attn2_xpu_abda3a0::{op_name}"
|
build/torch210-cxx11-xpu20253-x86_64-linux/flash_attn_interface.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_device():
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
| 36 |
-
# This should match the block sizes in the CUDA kernel
|
| 37 |
assert head_dim <= 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
major, minor = torch.cuda.get_device_capability(device)
|
| 39 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 40 |
is_sm80 = major == 8 and minor == 0
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
|
|
|
| 36 |
assert head_dim <= 256
|
| 37 |
+
|
| 38 |
+
if device.type == "xpu":
|
| 39 |
+
return 64
|
| 40 |
+
|
| 41 |
+
# This should match the block sizes in the CUDA kernel
|
| 42 |
major, minor = torch.cuda.get_device_capability(device)
|
| 43 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 44 |
is_sm80 = major == 8 and minor == 0
|
build/torch29-cxx11-cpu-x86_64-linux/{_flash_attn2_cpu_9f0ed09.abi3.so → _flash_attn2_cpu_abda3a0.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1932200
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f48a5de0256294c0baf17e17ff3eafa7c818a6b70c869d6eec74cdff07b994f1
|
| 3 |
size 1932200
|
build/torch29-cxx11-cpu-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _flash_attn2_cpu_abda3a0
|
| 3 |
+
ops = torch.ops._flash_attn2_cpu_abda3a0
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_flash_attn2_cpu_abda3a0::{op_name}"
|
build/torch29-cxx11-cpu-x86_64-linux/flash_attn_interface.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_device():
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
| 36 |
-
# This should match the block sizes in the CUDA kernel
|
| 37 |
assert head_dim <= 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
major, minor = torch.cuda.get_device_capability(device)
|
| 39 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 40 |
is_sm80 = major == 8 and minor == 0
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
|
|
|
| 36 |
assert head_dim <= 256
|
| 37 |
+
|
| 38 |
+
if device.type == "xpu":
|
| 39 |
+
return 64
|
| 40 |
+
|
| 41 |
+
# This should match the block sizes in the CUDA kernel
|
| 42 |
major, minor = torch.cuda.get_device_capability(device)
|
| 43 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 44 |
is_sm80 = major == 8 and minor == 0
|
build/torch29-cxx11-cu126-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 448648752
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9cb31caa7d395686694813cff9742e6a04c492af9bdebf80fd043bb98410d6d2
|
| 3 |
size 448648752
|
build/torch29-cxx11-cu126-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _flash_attn2_cuda_abda3a0
|
| 3 |
+
ops = torch.ops._flash_attn2_cuda_abda3a0
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_flash_attn2_cuda_abda3a0::{op_name}"
|
build/torch29-cxx11-cu126-x86_64-linux/flash_attn_interface.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_device():
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
| 36 |
-
# This should match the block sizes in the CUDA kernel
|
| 37 |
assert head_dim <= 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
major, minor = torch.cuda.get_device_capability(device)
|
| 39 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 40 |
is_sm80 = major == 8 and minor == 0
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
|
|
|
| 36 |
assert head_dim <= 256
|
| 37 |
+
|
| 38 |
+
if device.type == "xpu":
|
| 39 |
+
return 64
|
| 40 |
+
|
| 41 |
+
# This should match the block sizes in the CUDA kernel
|
| 42 |
major, minor = torch.cuda.get_device_capability(device)
|
| 43 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 44 |
is_sm80 = major == 8 and minor == 0
|
build/torch29-cxx11-cu128-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1037644632
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d58e62ead713b97bf86390e13e331edb26dd00d83c34ac7e9a5e95968493fa11
|
| 3 |
size 1037644632
|
build/torch29-cxx11-cu128-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _flash_attn2_cuda_abda3a0
|
| 3 |
+
ops = torch.ops._flash_attn2_cuda_abda3a0
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_flash_attn2_cuda_abda3a0::{op_name}"
|
build/torch29-cxx11-cu128-x86_64-linux/flash_attn_interface.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_device():
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
| 36 |
-
# This should match the block sizes in the CUDA kernel
|
| 37 |
assert head_dim <= 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
major, minor = torch.cuda.get_device_capability(device)
|
| 39 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 40 |
is_sm80 = major == 8 and minor == 0
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
|
|
|
| 36 |
assert head_dim <= 256
|
| 37 |
+
|
| 38 |
+
if device.type == "xpu":
|
| 39 |
+
return 64
|
| 40 |
+
|
| 41 |
+
# This should match the block sizes in the CUDA kernel
|
| 42 |
major, minor = torch.cuda.get_device_capability(device)
|
| 43 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 44 |
is_sm80 = major == 8 and minor == 0
|
build/torch29-cxx11-cu130-x86_64-linux/{_flash_attn2_cuda_9f0ed09.abi3.so → _flash_attn2_cuda_abda3a0.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1009019192
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52ec533d0801a9757a28bbe0531535a4386d84058a4eb8a2b6a721040eb5fb61
|
| 3 |
size 1009019192
|
build/torch29-cxx11-cu130-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _flash_attn2_cuda_abda3a0
|
| 3 |
+
ops = torch.ops._flash_attn2_cuda_abda3a0
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_flash_attn2_cuda_abda3a0::{op_name}"
|
build/torch29-cxx11-cu130-x86_64-linux/flash_attn_interface.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_device():
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
| 36 |
-
# This should match the block sizes in the CUDA kernel
|
| 37 |
assert head_dim <= 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
major, minor = torch.cuda.get_device_capability(device)
|
| 39 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 40 |
is_sm80 = major == 8 and minor == 0
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
|
|
|
| 36 |
assert head_dim <= 256
|
| 37 |
+
|
| 38 |
+
if device.type == "xpu":
|
| 39 |
+
return 64
|
| 40 |
+
|
| 41 |
+
# This should match the block sizes in the CUDA kernel
|
| 42 |
major, minor = torch.cuda.get_device_capability(device)
|
| 43 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 44 |
is_sm80 = major == 8 and minor == 0
|
build/{torch210-cxx11-xpu20253-x86_64-linux/_flash_attn2_xpu_9f0ed09.abi3.so → torch29-cxx11-xpu20252-x86_64-linux/_flash_attn2_xpu_abda3a0.abi3.so}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a96bf55da7a5422a5ca6f06e2deb147962be6d78db5e9569541e8257460ee19
|
| 3 |
+
size 14205096
|
build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import torch
|
| 2 |
-
from . import
|
| 3 |
-
ops = torch.ops.
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
-
return f"
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from . import _flash_attn2_xpu_abda3a0
|
| 3 |
+
ops = torch.ops._flash_attn2_xpu_abda3a0
|
| 4 |
|
| 5 |
def add_op_namespace_prefix(op_name: str):
|
| 6 |
"""
|
| 7 |
Prefix op by namespace.
|
| 8 |
"""
|
| 9 |
+
return f"_flash_attn2_xpu_abda3a0::{op_name}"
|
build/torch29-cxx11-xpu20252-x86_64-linux/flash_attn_interface.py
CHANGED
|
@@ -33,8 +33,12 @@ def _get_device():
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
| 36 |
-
# This should match the block sizes in the CUDA kernel
|
| 37 |
assert head_dim <= 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
major, minor = torch.cuda.get_device_capability(device)
|
| 39 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 40 |
is_sm80 = major == 8 and minor == 0
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _get_block_size_n(device, head_dim, is_dropout, is_causal):
|
|
|
|
| 36 |
assert head_dim <= 256
|
| 37 |
+
|
| 38 |
+
if device.type == "xpu":
|
| 39 |
+
return 64
|
| 40 |
+
|
| 41 |
+
# This should match the block sizes in the CUDA kernel
|
| 42 |
major, minor = torch.cuda.get_device_capability(device)
|
| 43 |
is_sm8x = major == 8 and minor > 0 # Only include sm86 and sm89, exclude sm80 (A100)
|
| 44 |
is_sm80 = major == 8 and minor == 0
|