RedHatAI
/

quantization

Model card Files Files and versions

quantization / attention /attention_generic.cuh

danieldk's picture

danieldk HF Staff

Sync to vLLM 20250627

8aa00a3 10 months ago

history blame contribute delete

1.73 kB

	/*
	* Adapted from
	* https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
	* Copyright (c) 2023, The vLLM team.
	* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	#pragma once

	#include <stdint.h>

	namespace vllm {

	// A vector type to store Q, K, V elements.
	template <typename T, int VEC_SIZE>
	struct Vec {};

	// A vector type to store FP32 accumulators.
	template <typename T>
	struct FloatVec {};

	// Template vector operations.
	template <typename Acc, typename A, typename B>
	inline __device__ Acc mul(A a, B b);

	template <typename T>
	inline __device__ float sum(T v);

	template <typename T>
	inline __device__ float dot(T a, T b) {
	return sum(mul<T, T, T>(a, b));
	}

	template <typename A, typename T>
	inline __device__ float dot(T a, T b) {
	return sum(mul<A, T, T>(a, b));
	}

	template <typename T>
	inline __device__ void zero(T& dst) {
	constexpr int WORDS = sizeof(T) / 4;
	union {
	T raw;
	uint32_t words[WORDS];
	} tmp;

	#pragma unroll
	for (int ii = 0; ii < WORDS; ++ii) {
	tmp.words[ii] = 0u;
	}
	dst = tmp.raw;
	}

	} // namespace vllm