| |
| * Adapted from |
| * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h |
| * Copyright (c) 2023, The vLLM team. |
| * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| #pragma once |
|
|
| #include <stdint.h> |
|
|
| namespace vllm { |
|
|
| |
| template <typename T, int VEC_SIZE> |
| struct Vec {}; |
|
|
| |
| template <typename T> |
| struct FloatVec {}; |
|
|
| |
| template <typename Acc, typename A, typename B> |
| inline __device__ Acc mul(A a, B b); |
|
|
| template <typename T> |
| inline __device__ float sum(T v); |
|
|
| template <typename T> |
| inline __device__ float dot(T a, T b) { |
| return sum(mul<T, T, T>(a, b)); |
| } |
|
|
| template <typename A, typename T> |
| inline __device__ float dot(T a, T b) { |
| return sum(mul<A, T, T>(a, b)); |
| } |
|
|
| template <typename T> |
| inline __device__ void zero(T& dst) { |
| constexpr int WORDS = sizeof(T) / 4; |
| union { |
| T raw; |
| uint32_t words[WORDS]; |
| } tmp; |
|
|
| #pragma unroll |
| for (int ii = 0; ii < WORDS; ++ii) { |
| tmp.words[ii] = 0u; |
| } |
| dst = tmp.raw; |
| } |
|
|
| } |
|
|