+
+
+
+
+
+
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "kernels",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+from kernels import get_kernel
+
+# Load the flash attention kernel
+hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn")
+
+
+def hf_flash_attention(query, key, value):
+ """HuggingFace Kernels Flash Attention"""
+ return hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)[0]
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.ATTENTION,
+ impl_name="hf_kernels_flash_attn",
+ impl_tags={"family": "hf-kernels", "backend": "flash-attn", "compile": "none"},
+ impl_func=hf_flash_attention,
+)
+
+
+
+Running attention benchmark on cuda with 6 workloads. + +====================================================================== +PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + hf_kernels_flash_attn 3.54% 153.223us 41.10% 1.781ms 1.781ms 0.000us 0.00% 3.710ms 3.710ms 1 + _flash_attn_9e27194::fwd 1.64% 71.013us 37.57% 1.628ms 542.522us 2.765ms 100.00% 3.710ms 1.237ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.766ms 100.05% 2.766ms 2.766ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.765ms 100.00% 2.765ms 921.626us 3 + Activity Buffer Request 32.85% 1.423ms 32.85% 1.423ms 1.423ms 945.530us 34.20% 945.530us 945.530us 1 + cudaDeviceGetAttribute 0.11% 4.920us 0.11% 4.920us 0.328us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.37% 16.201us 1.19% 51.582us 17.194us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.82% 35.381us 0.82% 35.381us 11.794us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.55% 23.891us 0.55% 23.891us 2.655us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.27% 11.501us 0.27% 11.501us 3.834us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.96% 41.661us 0.96% 41.661us 13.887us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 58.90% 2.552ms 58.90% 2.552ms 2.552ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 4.332ms +Self CUDA time total: 2.765ms + + + +====================================================================== +PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + hf_kernels_flash_attn 1.95% 87.173us 36.43% 1.628ms 1.628ms 0.000us 0.00% 3.993ms 3.993ms 1 + _flash_attn_9e27194::fwd 1.10% 49.286us 34.48% 1.541ms 513.554us 2.982ms 100.00% 3.993ms 1.331ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.984ms 100.06% 2.984ms 2.984ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.982ms 100.00% 2.982ms 993.983us 3 + Activity Buffer Request 31.65% 1.414ms 31.65% 1.414ms 1.414ms 1.011ms 33.92% 1.011ms 1.011ms 1 + cudaDeviceGetAttribute 0.09% 3.827us 0.09% 3.827us 0.255us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.16% 7.330us 0.51% 22.831us 7.610us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.35% 15.501us 0.35% 15.501us 5.167us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.46% 20.669us 0.46% 20.669us 2.297us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.08% 3.520us 0.08% 3.520us 1.173us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.59% 26.211us 0.59% 26.211us 8.737us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 63.57% 2.841ms 63.57% 2.841ms 2.841ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 4.469ms +Self CUDA time total: 2.982ms + + + +====================================================================== +PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + hf_kernels_flash_attn 2.39% 107.943us 36.87% 1.664ms 1.664ms 0.000us 0.00% 4.011ms 4.011ms 1 + _flash_attn_9e27194::fwd 1.08% 48.663us 34.47% 1.556ms 518.528us 2.994ms 100.00% 4.011ms 1.337ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.996ms 100.05% 2.996ms 2.996ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.994ms 100.00% 2.994ms 998.054us 3 + Activity Buffer Request 31.64% 1.428ms 31.64% 1.428ms 1.428ms 1.017ms 33.96% 1.017ms 1.017ms 1 + cudaDeviceGetAttribute 0.09% 4.050us 0.09% 4.050us 0.270us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.16% 7.029us 0.54% 24.521us 8.174us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.39% 17.492us 0.39% 17.492us 5.831us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.46% 20.589us 0.46% 20.589us 2.288us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.08% 3.660us 0.08% 3.660us 1.220us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.59% 26.452us 0.59% 26.452us 8.817us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 63.13% 2.849ms 63.13% 2.849ms 2.849ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 4.512ms +Self CUDA time total: 2.994ms + + + +====================================================================== +PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + hf_kernels_flash_attn 2.37% 113.154us 39.04% 1.864ms 1.864ms 0.000us 0.00% 4.086ms 4.086ms 1 + _flash_attn_9e27194::fwd 1.02% 48.863us 36.67% 1.751ms 583.543us 3.059ms 100.00% 4.086ms 1.362ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.060ms 100.05% 3.060ms 3.060ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.059ms 100.00% 3.059ms 1.020ms 3 + Activity Buffer Request 29.92% 1.429ms 29.92% 1.429ms 1.429ms 1.027ms 33.57% 1.027ms 1.027ms 1 + cudaDeviceGetAttribute 0.08% 3.821us 0.08% 3.821us 0.255us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.16% 7.819us 0.54% 25.920us 8.640us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.38% 18.101us 0.38% 18.101us 6.034us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.44% 21.109us 0.44% 21.109us 2.345us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.08% 3.840us 0.08% 3.840us 1.280us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 4.58% 218.538us 4.58% 218.538us 72.846us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 60.96% 2.910ms 60.96% 2.910ms 2.910ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 4.774ms +Self CUDA time total: 3.059ms + + + +====================================================================== +PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + hf_kernels_flash_attn 2.11% 109.115us 34.87% 1.804ms 1.804ms 0.000us 0.00% 4.702ms 4.702ms 1 + _flash_attn_9e27194::fwd 0.94% 48.879us 32.76% 1.695ms 565.076us 3.518ms 100.00% 4.702ms 1.567ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.519ms 100.04% 3.519ms 3.519ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.518ms 100.00% 3.518ms 1.173ms 3 + Activity Buffer Request 27.57% 1.427ms 27.57% 1.427ms 1.427ms 1.184ms 33.66% 1.184ms 1.184ms 1 + cudaDeviceGetAttribute 0.07% 3.810us 0.07% 3.810us 0.254us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.14% 7.040us 0.48% 25.061us 8.354us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.35% 18.021us 0.35% 18.021us 6.007us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.40% 20.762us 0.40% 20.762us 2.307us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.07% 3.731us 0.07% 3.731us 1.244us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.21% 166.285us 3.21% 166.285us 55.428us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 65.13% 3.370ms 65.13% 3.370ms 3.370ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 5.175ms +Self CUDA time total: 3.518ms + + + +====================================================================== +PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + hf_kernels_flash_attn 2.00% 105.404us 33.86% 1.781ms 1.781ms 0.000us 0.00% 4.846ms 4.846ms 1 + _flash_attn_9e27194::fwd 0.97% 50.822us 31.86% 1.675ms 558.446us 3.623ms 100.00% 4.846ms 1.615ms 3 + hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.624ms 100.04% 3.624ms 3.624ms 1 +void flash::flash_fwd_kernel<Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.623ms 100.00% 3.623ms 1.208ms 3 + Activity Buffer Request 26.72% 1.405ms 26.72% 1.405ms 1.405ms 1.223ms 33.77% 1.223ms 1.223ms 1 + cudaDeviceGetAttribute 0.08% 4.369us 0.08% 4.369us 0.291us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty_like 0.15% 7.679us 0.48% 25.141us 8.380us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty_strided 0.33% 17.462us 0.33% 17.462us 5.821us 0.000us 0.00% 0.000us 0.000us 3 + aten::empty 0.40% 21.081us 0.40% 21.081us 2.342us 0.000us 0.00% 0.000us 0.000us 9 + cudaFuncSetAttribute 0.07% 3.770us 0.07% 3.770us 1.257us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.13% 164.746us 3.13% 164.746us 54.915us 0.000us 0.00% 0.000us 0.000us 3 + cudaDeviceSynchronize 66.14% 3.478ms 66.14% 3.478ms 3.478ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 5.259ms +Self CUDA time total: 3.623ms + + +impl wl p50(ms) ok +hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True +hf_kernels_flash_attn cuda_attn_L256_bfloat16 0.99 True +hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.04 True +hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True +hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.21 True +hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.21 True +
+Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
+Fetching 20 files: 10%|█ | 2/20 [00:01<00:16, 1.12it/s]
+Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 11.15it/s]
+
+
+
+