+
+
+
+
+
+
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "numpy",
+# "torch==2.8.0",
+# "kernels-benchmark-tools",
+# "xformers",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+import xformers.ops as xops
+
+
+def xformers_attention(q, k, v):
+ """xFormers memory efficient attention"""
+ # xFormers expects [batch, seq_len, heads, head_dim]
+ return xops.memory_efficient_attention(q, k, v)
+
+
+run_benchmark(
+ kernel_type=KernelTypeEnum.ATTENTION,
+ impl_name="xformers_meff",
+ impl_tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
+ impl_func=xformers_attention,
+)
+
+
+
+Running attention benchmark on cuda with 6 workloads. + +====================================================================== +PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + xformers_meff 10.99% 493.828us 51.93% 2.334ms 2.334ms 0.000us 0.00% 3.600ms 3.600ms 1 + xformers_flash3::flash_fwd 4.32% 194.118us 40.08% 1.801ms 600.437us 0.000us 0.00% 3.600ms 1.200ms 3 + flash_attn_3::fwd 1.81% 81.292us 35.76% 1.607ms 535.731us 2.714ms 100.00% 3.600ms 1.200ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.716ms 100.05% 2.716ms 2.716ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.714ms 100.00% 2.714ms 904.730us 3 + Activity Buffer Request 31.96% 1.436ms 31.96% 1.436ms 1.436ms 885.349us 32.62% 885.349us 885.349us 1 + aten::empty 0.86% 38.850us 0.86% 38.850us 6.475us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.25% 11.022us 0.25% 11.022us 3.674us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.88% 39.751us 0.88% 39.751us 13.250us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.26% 11.630us 0.87% 38.970us 6.495us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.61% 27.340us 0.61% 27.340us 4.557us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 48.07% 2.160ms 48.07% 2.160ms 2.160ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 4.494ms +Self CUDA time total: 2.714ms + + + +====================================================================== +PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + xformers_meff 7.45% 327.551us 47.96% 2.108ms 2.108ms 0.000us 0.00% 3.684ms 3.684ms 1 + xformers_flash3::flash_fwd 3.56% 156.647us 39.91% 1.754ms 584.750us 0.000us 0.00% 3.684ms 1.228ms 3 + flash_attn_3::fwd 1.31% 57.602us 36.35% 1.598ms 532.534us 2.754ms 100.00% 3.684ms 1.228ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.755ms 100.06% 2.755ms 2.755ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.754ms 100.00% 2.754ms 917.895us 3 + Activity Buffer Request 33.31% 1.464ms 33.31% 1.464ms 1.464ms 930.812us 33.80% 930.812us 930.812us 1 + aten::empty 0.76% 33.251us 0.76% 33.251us 5.542us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.14% 6.040us 0.14% 6.040us 2.013us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.83% 36.590us 0.83% 36.590us 12.197us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.23% 10.130us 0.60% 26.441us 4.407us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.37% 16.311us 0.37% 16.311us 2.719us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 52.04% 2.287ms 52.04% 2.287ms 2.287ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 4.395ms +Self CUDA time total: 2.754ms + + + +====================================================================== +PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + xformers_meff 6.93% 309.631us 45.92% 2.051ms 2.051ms 0.000us 0.00% 3.806ms 3.806ms 1 + xformers_flash3::flash_fwd 3.88% 173.206us 38.45% 1.717ms 572.356us 0.000us 0.00% 3.806ms 1.269ms 3 + flash_attn_3::fwd 1.30% 58.031us 34.57% 1.544ms 514.621us 2.838ms 100.00% 3.806ms 1.269ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.840ms 100.06% 2.840ms 2.840ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.838ms 100.00% 2.838ms 945.948us 3 + Activity Buffer Request 31.70% 1.416ms 31.70% 1.416ms 1.416ms 968.572us 34.13% 968.572us 968.572us 1 + aten::empty 0.70% 31.373us 0.70% 31.373us 5.229us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.510us 0.12% 5.510us 1.837us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 0.74% 33.081us 0.74% 33.081us 11.027us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.19% 8.679us 0.54% 24.060us 4.010us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.34% 15.381us 0.34% 15.381us 2.564us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 54.08% 2.416ms 54.08% 2.416ms 2.416ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 4.466ms +Self CUDA time total: 2.838ms + + + +====================================================================== +PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + xformers_meff 6.70% 313.562us 47.60% 2.227ms 2.227ms 0.000us 0.00% 3.863ms 3.863ms 1 + xformers_flash3::flash_fwd 3.24% 151.796us 40.34% 1.888ms 629.212us 0.000us 0.00% 3.863ms 1.288ms 3 + flash_attn_3::fwd 1.25% 58.574us 37.10% 1.736ms 578.613us 2.888ms 100.00% 3.863ms 1.288ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.890ms 100.06% 2.890ms 2.890ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.888ms 100.00% 2.888ms 962.743us 3 + Activity Buffer Request 30.65% 1.434ms 30.65% 1.434ms 1.434ms 974.434us 33.74% 974.434us 974.434us 1 + aten::empty 0.64% 30.051us 0.64% 30.051us 5.008us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 5.730us 0.12% 5.730us 1.910us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 4.43% 207.206us 4.43% 207.206us 69.069us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.22% 10.139us 0.56% 26.119us 4.353us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.34% 15.980us 0.34% 15.980us 2.663us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 52.40% 2.452ms 52.40% 2.452ms 2.452ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 4.679ms +Self CUDA time total: 2.888ms + + + +====================================================================== +PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + xformers_meff 6.05% 310.689us 42.88% 2.201ms 2.201ms 0.000us 0.00% 4.489ms 4.489ms 1 + xformers_flash3::flash_fwd 2.93% 150.475us 36.35% 1.866ms 622.001us 0.000us 0.00% 4.489ms 1.496ms 3 + flash_attn_3::fwd 1.04% 53.593us 33.42% 1.716ms 571.843us 3.365ms 100.00% 4.489ms 1.496ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.367ms 100.05% 3.367ms 3.367ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.365ms 100.00% 3.365ms 1.122ms 3 + Activity Buffer Request 28.02% 1.439ms 28.02% 1.439ms 1.439ms 1.123ms 33.38% 1.123ms 1.123ms 1 + aten::empty 0.59% 30.191us 0.59% 30.191us 5.032us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.12% 6.030us 0.12% 6.030us 2.010us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.65% 187.166us 3.65% 187.166us 62.389us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.18% 9.272us 0.47% 24.322us 4.054us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.29% 15.050us 0.29% 15.050us 2.508us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 57.12% 2.932ms 57.12% 2.932ms 2.932ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 5.133ms +Self CUDA time total: 3.365ms + + + +====================================================================== +PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16 +====================================================================== +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + xformers_meff 6.40% 331.462us 43.16% 2.236ms 2.236ms 0.000us 0.00% 4.557ms 4.557ms 1 + xformers_flash3::flash_fwd 2.99% 154.686us 36.26% 1.879ms 626.255us 0.000us 0.00% 4.557ms 1.519ms 3 + flash_attn_3::fwd 1.13% 58.511us 33.27% 1.724ms 574.693us 3.413ms 100.00% 4.557ms 1.519ms 3 + xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.415ms 100.05% 3.415ms 3.415ms 1 +void cutlass::device_kernel<flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.413ms 100.00% 3.413ms 1.138ms 3 + Activity Buffer Request 27.70% 1.435ms 27.70% 1.435ms 1.435ms 1.144ms 33.52% 1.144ms 1.144ms 1 + aten::empty 0.61% 31.572us 0.61% 31.572us 5.262us 0.000us 0.00% 0.000us 0.000us 6 + cudaFuncSetAttribute 0.11% 5.890us 0.11% 5.890us 1.963us 0.000us 0.00% 0.000us 0.000us 3 + cudaLaunchKernel 3.72% 192.906us 3.72% 192.906us 64.302us 0.000us 0.00% 0.000us 0.000us 3 + aten::reshape 0.18% 9.270us 0.50% 26.000us 4.333us 0.000us 0.00% 0.000us 0.000us 6 + aten::view 0.32% 16.730us 0.32% 16.730us 2.788us 0.000us 0.00% 0.000us 0.000us 6 + cudaDeviceSynchronize 56.84% 2.946ms 56.84% 2.946ms 2.946ms 0.000us 0.00% 0.000us 0.000us 1 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 5.182ms +Self CUDA time total: 3.413ms + + +impl wl p50(ms) ok +xformers_meff cuda_attn_L128_bfloat16 0.98 True +xformers_meff cuda_attn_L256_bfloat16 1.02 True +xformers_meff cuda_attn_L320_bfloat16 1.07 True +xformers_meff cuda_attn_L384_bfloat16 1.08 True +xformers_meff cuda_attn_L448_bfloat16 1.24 True +xformers_meff cuda_attn_L512_bfloat16 1.23 True +
+
+▶ UV Install Logs
+
+
+
+