+
+
+
+
+
+
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the flash attention 3 kernel once, at import time, so that the wrapper
+# below reuses the same kernel object on every benchmarked call.
+# NOTE(review): presumably this fetches the kernel from the Hugging Face Hub
+# repo "kernels-community/flash-attn3"; no revision is pinned here, so
+# confirm whether the benchmark should be tied to a fixed kernel build.
+hf_kernels_flash_attn3 = get_kernel("kernels-community/flash-attn3")
+
+
+def hf_flash_attention3(query, key, value):
+ return hf_kernels_flash_attn3.flash_attn_func(query, key, value, causal=False)[0]
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.ATTENTION,
+ impl_name="hf_kernels_flash_attn3",
+ impl_tags={"family": "hf-kernels", "backend": "flash-attn3", "compile": "none"},
+ impl_func=hf_flash_attention3,
+)
+
+
+
+Running attention benchmark on cuda with 6 workloads. + +====================================================================== +PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + hf_kernels_flash_attn3 3.80% 163.585us 44.55% 1.916ms 1.916ms 0.000us 0.00% 3.598ms 3.598ms 1 + FlashAttnFunc 3.38% 145.315us 40.75% 1.753ms 584.213us 0.000us 0.00% 3.598ms 1.199ms 3 + _flash_attn3_48fe103_dirty::fwd 1.86% 80.133us 37.37% 1.607ms 535.775us 2.702ms 100.00% 3.598ms 1.199ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.704ms 100.05% 2.704ms 2.704ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.702ms 100.00% 2.702ms 900.800us 3 + Activity Buffer Request 33.08% 1.423ms 33.08% 1.423ms 1.423ms 895.776us 33.15% 895.776us 895.776us 1 + aten::empty 1.02% 43.812us 1.02% 43.812us 7.302us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.30% 13.081us 0.30% 13.081us 4.360us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 1.10% 47.211us 1.10% 47.211us 15.737us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 55.45% 2.385ms 55.45% 2.385ms 2.385ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 4.301ms +Self CUDA time total: 2.702ms + + + +====================================================================== +PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + hf_kernels_flash_attn3 2.35% 101.013us 40.06% 1.725ms 1.725ms 0.000us 0.00% 3.751ms 3.751ms 1 + FlashAttnFunc 2.16% 92.983us 37.71% 1.624ms 541.352us 0.000us 0.00% 3.751ms 1.250ms 3 + _flash_attn3_48fe103_dirty::fwd 1.19% 51.175us 35.55% 1.531ms 510.358us 2.802ms 100.00% 3.751ms 1.250ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.803ms 100.06% 2.803ms 2.803ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.802ms 100.00% 2.802ms 933.921us 3 + Activity Buffer Request 32.90% 1.417ms 32.90% 1.417ms 1.417ms 949.686us 33.90% 949.686us 949.686us 1 + aten::empty 0.63% 27.091us 0.63% 27.091us 4.515us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.239us 0.12% 5.239us 1.746us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.72% 30.870us 0.72% 30.870us 10.290us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 59.94% 2.581ms 59.94% 2.581ms 2.581ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 4.306ms +Self CUDA time total: 2.802ms + + + +====================================================================== +PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + hf_kernels_flash_attn3 2.33% 100.994us 40.09% 1.739ms 1.739ms 0.000us 0.00% 3.778ms 3.778ms 1 + FlashAttnFunc 2.19% 94.944us 37.76% 1.638ms 545.852us 0.000us 0.00% 3.778ms 1.259ms 3 + _flash_attn3_48fe103_dirty::fwd 1.20% 52.112us 35.57% 1.543ms 514.204us 2.819ms 100.00% 3.778ms 1.259ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.820ms 100.05% 2.820ms 2.820ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.819ms 100.00% 2.819ms 939.550us 3 + Activity Buffer Request 32.79% 1.422ms 32.79% 1.422ms 1.422ms 959.198us 34.03% 959.198us 959.198us 1 + aten::empty 0.60% 26.051us 0.60% 26.051us 4.342us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.409us 0.12% 5.409us 1.803us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.85% 36.931us 0.85% 36.931us 12.310us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 59.91% 2.599ms 59.91% 2.599ms 2.599ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 4.337ms +Self CUDA time total: 2.819ms + + + +====================================================================== +PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + hf_kernels_flash_attn3 2.88% 135.094us 43.08% 2.020ms 2.020ms 0.000us 0.00% 3.874ms 3.874ms 1 + FlashAttnFunc 2.10% 98.504us 40.20% 1.885ms 628.185us 0.000us 0.00% 3.874ms 1.291ms 3 + _flash_attn3_48fe103_dirty::fwd 1.10% 51.632us 38.10% 1.786ms 595.350us 2.895ms 100.00% 3.874ms 1.291ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.897ms 100.06% 2.897ms 2.897ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.895ms 100.00% 2.895ms 965.011us 3 + Activity Buffer Request 30.58% 1.434ms 30.58% 1.434ms 1.434ms 979.229us 33.82% 979.229us 979.229us 1 + aten::empty 0.58% 27.080us 0.58% 27.080us 4.513us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 5.380us 0.11% 5.380us 1.793us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 5.72% 268.289us 5.72% 268.289us 89.430us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 56.92% 2.668ms 56.92% 2.668ms 2.668ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 4.688ms +Self CUDA time total: 2.895ms + + + +====================================================================== +PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + hf_kernels_flash_attn3 2.52% 128.963us 37.26% 1.903ms 1.903ms 0.000us 0.00% 4.575ms 4.575ms 1 + FlashAttnFunc 1.87% 95.425us 34.74% 1.774ms 591.441us 0.000us 0.00% 4.575ms 1.525ms 3 + _flash_attn3_48fe103_dirty::fwd 1.01% 51.593us 32.87% 1.679ms 559.632us 3.427ms 100.00% 4.575ms 1.525ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.429ms 100.05% 3.429ms 3.429ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.427ms 100.00% 3.427ms 1.142ms 3 + Activity Buffer Request 27.82% 1.421ms 27.82% 1.421ms 1.421ms 1.148ms 33.49% 1.148ms 1.148ms 1 + aten::empty 0.55% 28.251us 0.55% 28.251us 4.709us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.10% 5.249us 0.10% 5.249us 1.750us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.38% 172.866us 3.38% 172.866us 57.622us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 62.74% 3.205ms 62.74% 3.205ms 3.205ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 5.108ms +Self CUDA time total: 3.427ms + + + +====================================================================== +PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + hf_kernels_flash_attn3 2.37% 119.165us 36.69% 1.842ms 1.842ms 0.000us 0.00% 4.545ms 4.545ms 1 + FlashAttnFunc 1.86% 93.463us 34.32% 1.723ms 574.423us 0.000us 0.00% 4.545ms 1.515ms 3 + _flash_attn3_48fe103_dirty::fwd 1.01% 50.561us 32.46% 1.630ms 543.268us 3.398ms 100.00% 4.545ms 1.515ms 3 + hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.400ms 100.05% 3.400ms 3.400ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.398ms 100.00% 3.398ms 1.133ms 3 + Activity Buffer Request 27.47% 1.379ms 27.47% 1.379ms 1.379ms 1.147ms 33.76% 1.147ms 1.147ms 1 + aten::empty 0.56% 28.202us 0.56% 28.202us 4.700us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.10% 5.090us 0.10% 5.090us 1.697us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.32% 166.515us 3.32% 166.515us 55.505us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 63.31% 3.179ms 63.31% 3.179ms 3.179ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 5.022ms +Self CUDA time total: 3.398ms + + +impl wl p50(ms) ok +hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True +hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.96 True +hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True +hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True +hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True +hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True +
+
+▶ UV Install Logs
+
+Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
+Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.06it/s]
+Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.12it/s]
+
+
+