# Build configuration for the paged_attention kernel extension.
[general]
name = "paged_attention"
# NOTE(review): presumably `universal = true` would mark an
# architecture-independent build; this kernel compiles per-backend
# sources, so it is disabled — confirm against the builder's docs.
universal = false
# Torch extension binding sources shared by all backends.
[torch]
src = [
    "torch-ext/torch_binding.cpp",
    "torch-ext/torch_binding.h",
]
# CUDA utility kernels (no dependency on the torch bindings).
[kernel.cuda_utils]
backend = "cuda"
src = [
    "cuda-utils/cuda_utils.h",
    "cuda-utils/cuda_utils_kernels.cu",
]
depends = []
# ROCm build of the same CUDA utility sources (HIP-compatible .cu files).
[kernel.cuda_utils_rocm]
backend = "rocm"
# Target AMD GPU architectures.
rocm-archs = [
    "gfx906",
    "gfx908",
    "gfx90a",
    "gfx940",
    "gfx941",
    "gfx942",
    "gfx1030",
    "gfx1100",
    "gfx1101",
]
src = [
    "cuda-utils/cuda_utils.h",
    "cuda-utils/cuda_utils_kernels.cu",
]
# NOTE(review): the CUDA variant of this kernel declares `depends = []`
# for the identical sources — confirm whether the torch dependency is
# really required here or is a copy-paste inconsistency.
depends = ["torch"]
# Main paged-attention CUDA kernel: attention v1/v2, KV-cache kernels,
# and fp8 quantization helpers.
[kernel.paged_attention]
backend = "cuda"
src = [
    "cuda-utils/cuda_utils.h",
    "paged-attention/attention/attention_dtypes.h",
    "paged-attention/attention/attention_generic.cuh",
    "paged-attention/attention/attention_kernels.cuh",
    "paged-attention/attention/attention_utils.cuh",
    "paged-attention/attention/dtype_bfloat16.cuh",
    "paged-attention/attention/dtype_float16.cuh",
    "paged-attention/attention/dtype_float32.cuh",
    "paged-attention/attention/dtype_fp8.cuh",
    "paged-attention/attention/paged_attention_v1.cu",
    "paged-attention/attention/paged_attention_v2.cu",
    "paged-attention/cache_kernels.cu",
    "paged-attention/cuda_compat.h",
    "paged-attention/dispatch_utils.h",
    "paged-attention/quantization/fp8/amd/quant_utils.cuh",
    "paged-attention/quantization/fp8/nvidia/quant_utils.cuh",
]
include = ["cuda-utils", "paged-attention"]
depends = ["torch"]
# ROCm build of the paged-attention kernel: same source list as the CUDA
# variant (the .cu/.cuh files are HIP-compatible), restricted to the
# listed AMD GPU architectures.
[kernel.paged_attention_rocm]
backend = "rocm"
rocm-archs = [
    "gfx906",
    "gfx908",
    "gfx90a",
    "gfx940",
    "gfx941",
    "gfx942",
    "gfx1030",
    "gfx1100",
    "gfx1101",
]
src = [
    "cuda-utils/cuda_utils.h",
    "paged-attention/attention/attention_dtypes.h",
    "paged-attention/attention/attention_generic.cuh",
    "paged-attention/attention/attention_kernels.cuh",
    "paged-attention/attention/attention_utils.cuh",
    "paged-attention/attention/dtype_bfloat16.cuh",
    "paged-attention/attention/dtype_float16.cuh",
    "paged-attention/attention/dtype_float32.cuh",
    "paged-attention/attention/dtype_fp8.cuh",
    "paged-attention/attention/paged_attention_v1.cu",
    "paged-attention/attention/paged_attention_v2.cu",
    "paged-attention/cache_kernels.cu",
    "paged-attention/cuda_compat.h",
    "paged-attention/dispatch_utils.h",
    "paged-attention/quantization/fp8/amd/quant_utils.cuh",
    "paged-attention/quantization/fp8/nvidia/quant_utils.cuh",
]
include = ["cuda-utils", "paged-attention"]
depends = ["torch"]
# Metal (Apple GPU) backend: .metal shader sources plus Objective-C++
# (.mm) host-side glue.
[kernel.paged_attention_metal]
backend = "metal"
src = [
    "paged-attention-metal/attention/paged_attention.metal",
    "paged-attention-metal/cache/copy_blocks.metal",
    "paged-attention-metal/cache/reshape_and_cache.metal",
    "paged-attention-metal/convert_fp8.metal",
    "paged-attention-metal/float8.metal",
    "paged-attention-metal/utils.metal",
    "paged-attention-metal/paged_attention.mm",
    "paged-attention-metal/cache.mm",
    "paged-attention-metal/convert_fp8.mm",
    "paged-attention-metal/device.mm",
]
include = ["."]
depends = ["torch"]