add some activation kernels

#3
by medmekk - opened
This view is limited to 50 files because it contains too many changes. See the raw diff here.
Files changed (50) hide show
  1. .gitattributes +0 -6
  2. README.md +3 -19
  3. activation/activation_kernels.cu +225 -0
  4. activation/cuda_compat.h +49 -0
  5. activation/dispatch_utils.h +83 -0
  6. benchmarks/benchmark.py +0 -5
  7. build.toml +18 -0
  8. build/torch210-cu128-x86_64-windows/activation/__init__.py +0 -75
  9. build/torch210-cu128-x86_64-windows/activation/_ops.py +0 -9
  10. build/torch210-cu128-x86_64-windows/activation/layers.py +0 -201
  11. build/torch210-cu128-x86_64-windows/metadata.json +0 -4
  12. build/torch210-cxx11-cu126-aarch64-linux/activation/__init__.py +0 -26
  13. build/torch210-cxx11-cu126-aarch64-linux/layers.py +0 -201
  14. build/torch210-cxx11-cu126-aarch64-linux/metadata.json +0 -18
  15. build/torch210-cxx11-cu126-x86_64-linux/activation/__init__.py +0 -26
  16. build/torch210-cxx11-cu126-x86_64-linux/layers.py +0 -201
  17. build/torch210-cxx11-cu126-x86_64-linux/metadata.json +0 -18
  18. build/torch210-cxx11-cu128-aarch64-linux/activation/__init__.py +0 -26
  19. build/torch210-cxx11-cu128-aarch64-linux/layers.py +0 -201
  20. build/torch210-cxx11-cu128-aarch64-linux/metadata.json +0 -21
  21. build/torch210-cxx11-cu128-x86_64-linux/_activation_cuda_ccf9ce9.abi3.so +0 -3
  22. build/torch210-cxx11-cu128-x86_64-linux/_ops.py +0 -9
  23. build/torch210-cxx11-cu128-x86_64-linux/activation/__init__.py +0 -26
  24. build/torch210-cxx11-cu128-x86_64-linux/layers.py +0 -201
  25. build/torch210-cxx11-cu128-x86_64-linux/metadata.json +0 -21
  26. build/torch210-cxx11-cu130-aarch64-linux/__init__.py +0 -75
  27. build/torch210-cxx11-cu130-aarch64-linux/_activation_cuda_ccf9ce9.abi3.so +0 -3
  28. build/torch210-cxx11-cu130-aarch64-linux/_ops.py +0 -9
  29. build/torch210-cxx11-cu130-aarch64-linux/activation/__init__.py +0 -26
  30. build/torch210-cxx11-cu130-aarch64-linux/layers.py +0 -201
  31. build/torch210-cxx11-cu130-aarch64-linux/metadata.json +0 -19
  32. build/torch210-cxx11-cu130-x86_64-linux/__init__.py +0 -75
  33. build/torch210-cxx11-cu130-x86_64-linux/_activation_cuda_ccf9ce9.abi3.so +0 -3
  34. build/torch210-cxx11-cu130-x86_64-linux/_ops.py +0 -9
  35. build/torch210-cxx11-cu130-x86_64-linux/activation/__init__.py +0 -26
  36. build/torch210-cxx11-cu130-x86_64-linux/layers.py +0 -201
  37. build/torch210-cxx11-cu130-x86_64-linux/metadata.json +0 -19
  38. build/torch210-metal-aarch64-darwin/__init__.py +0 -75
  39. build/torch210-metal-aarch64-darwin/_activation_metal_ccf9ce9.abi3.so +0 -3
  40. build/torch210-metal-aarch64-darwin/_ops.py +0 -9
  41. build/torch210-metal-aarch64-darwin/activation/__init__.py +0 -26
  42. build/torch210-metal-aarch64-darwin/layers.py +0 -201
  43. build/torch210-metal-aarch64-darwin/metadata.json +0 -8
  44. build/torch211-cu128-x86_64-windows/__init__.py +0 -75
  45. build/torch211-cu128-x86_64-windows/_activation_cuda_ce29f4e.pyd +0 -3
  46. build/torch211-cu128-x86_64-windows/_ops.py +0 -9
  47. build/torch211-cu128-x86_64-windows/activation/__init__.py +0 -26
  48. build/torch211-cu128-x86_64-windows/layers.py +0 -201
  49. build/torch211-cu128-x86_64-windows/metadata.json +0 -21
  50. build/torch211-cxx11-cu126-aarch64-linux/__init__.py +0 -75
.gitattributes CHANGED
@@ -34,9 +34,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  *.so filter=lfs diff=lfs merge=lfs -text
37
- build/torch210-cu128-x86_64-windows/activation/_activation_e1b4b08.pyd filter=lfs diff=lfs merge=lfs -text
38
- media/benches.gif filter=lfs diff=lfs merge=lfs -text
39
- media/benches.mp4 filter=lfs diff=lfs merge=lfs -text
40
- build/torch211-cu128-x86_64-windows/_activation_cuda_47d6ce5.pyd filter=lfs diff=lfs merge=lfs -text
41
- build/torch211-cu128-x86_64-windows/_activation_cuda_edabc93.pyd filter=lfs diff=lfs merge=lfs -text
42
- build/torch211-cu128-x86_64-windows/_activation_cuda_ce29f4e.pyd filter=lfs diff=lfs merge=lfs -text
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  *.so filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
README.md CHANGED
@@ -1,26 +1,10 @@
1
  ---
2
  tags:
3
- - kernels
4
  ---
5
 
 
6
 
7
  ## Activation
8
 
9
- Activation kernels from [vLLM](https://github.com/vllm-project/vllm/blob/main/csrc/activation_kernels.cu).
10
-
11
- Kernel source: https://github.com/huggingface/kernels-community/tree/main/activation
12
-
13
- ### Performance
14
-
15
-
16
- <img class="dark:hidden border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_light_animation.svg" />
17
- <img class="hidden dark:block border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_dark_animation.svg" />
18
-
19
- <img class="dark:hidden border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_light_latency.svg" />
20
- <img class="hidden dark:block border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_dark_latency.svg" />
21
-
22
- <img class="dark:hidden border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_light_throughput.svg" />
23
- <img class="hidden dark:block border border-gray-200 dark:border-gray-700 rounded-lg" src="media/benches_dark_throughput.svg" />
24
-
25
-
26
-
 
1
  ---
2
  tags:
3
+ - kernel
4
  ---
5
 
6
+ ![Status](https://hubwebhook.dholtz.com/shield?repo=kernels-community/activation)
7
 
8
  ## Activation
9
 
10
+ Activation kernels from [vLLM](https://github.com/vllm-project/vllm/blob/main/csrc/activation_kernels.cu).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
activation/activation_kernels.cu ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/cuda/CUDAContext.h>
2
+ #include <torch/all.h>
3
+ #include <c10/cuda/CUDAGuard.h>
4
+
5
+ #include <cmath>
6
+
7
+ #include "cuda_compat.h"
8
+ #include "dispatch_utils.h"
9
+
10
namespace vllm {

// Combines the two operands of a gated activation.
// With act_first == true the activation is applied to x and the result is
// multiplied by y; otherwise x is multiplied by the activated y.
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
          bool act_first>
__device__ __forceinline__ scalar_t compute(const scalar_t& x,
                                            const scalar_t& y) {
  if (act_first) {
    return ACT_FN(x) * y;
  }
  return x * ACT_FN(y);
}

// Activation and gating kernel template.
// blockIdx.x selects the token (row); the threads of the block step through
// the d output columns in increments of blockDim.x. For each column the
// kernel reads one element from the first half and one from the second half
// of the input row and stores compute(x, y).
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
          bool act_first>
__global__ void act_and_mul_kernel(
    scalar_t* __restrict__ out,          // [..., d]
    const scalar_t* __restrict__ input,  // [..., 2, d]
    const int d) {
  const int64_t row = blockIdx.x;
  const int64_t in_base = row * 2 * d;
  const int64_t out_base = row * d;
  for (int64_t col = threadIdx.x; col < d; col += blockDim.x) {
    const scalar_t first = VLLM_LDG(&input[in_base + col]);
    const scalar_t second = VLLM_LDG(&input[in_base + d + col]);
    out[out_base + col] = compute<scalar_t, ACT_FN, act_first>(first, second);
  }
}

// SiLU: x * sigmoid(x), evaluated in float and cast back to T.
template <typename T>
__device__ __forceinline__ T silu_kernel(const T& x) {
  const float value = (float)x;
  // The negation happens on the T value before the cast.
  const float negated = (float)-x;
  return (T)(value / (1.0f + expf(negated)));
}

// Equivalent to PyTorch GELU with 'none' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
template <typename T>
__device__ __forceinline__ T gelu_kernel(const T& x) {
  constexpr float kInvSqrt2 = M_SQRT1_2;
  const float value = (float)x;
  const float erf_term = 1.0f + ::erf(value * kInvSqrt2);
  return (T)(value * 0.5f * erf_term);
}

// Equivalent to PyTorch GELU with 'tanh' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
template <typename T>
__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
  constexpr float kBeta = M_SQRT2 * M_2_SQRTPI * 0.5f;
  constexpr float kKappa = 0.044715;
  const float value = (float)x;
  const float cubed = value * value * value;
  const float tanh_arg = kBeta * (value + kKappa * cubed);
  const float tanh_term = 1.0f + ::tanhf(tanh_arg);
  return (T)(0.5f * value * tanh_term);
}

}  // namespace vllm
64
+
65
// Launch activation and gating kernel.
// Use ACT_FIRST (bool) indicating whether to apply the activation function
// first.
//
// Expects tensors named `out` and `input` in the enclosing function and
// declares `d`, `num_tokens`, `grid` and `block` in that scope. One thread
// block is launched per token, with min(d, 1024) threads.
//
// Empty work returns before any size arithmetic: when `input` has no elements
// or its last dimension is shorter than 2 there is nothing to write, and
// continuing would either divide by a zero-length last dimension or launch a
// kernel with a zero-sized grid/block (an invalid launch configuration).
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, ACT_FIRST)                 \
  if (input.numel() == 0 || input.size(-1) < 2) {                        \
    return;                                                              \
  }                                                                      \
  int d = input.size(-1) / 2;                                            \
  int64_t num_tokens = input.numel() / input.size(-1);                   \
  dim3 grid(num_tokens);                                                 \
  dim3 block(std::min(d, 1024));                                         \
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));      \
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();          \
  VLLM_DISPATCH_FLOATING_TYPES(                                          \
      input.scalar_type(), "act_and_mul_kernel", [&] {                   \
        vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>, ACT_FIRST>  \
            <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),       \
                                         input.data_ptr<scalar_t>(), d); \
      });
84
+
85
// Gated SiLU: launches the gating kernel with vllm::silu_kernel and
// ACT_FIRST = true, so the activation is applied to the first half of the
// last dimension of `input` and the result is multiplied by the second half.
//   out:   [..., d]
//   input: [..., 2 * d]
void silu_and_mul(torch::Tensor& out, torch::Tensor& input) {
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, true);
}
90
+
91
// Counterpart of silu_and_mul with ACT_FIRST = false: the SiLU is applied to
// the latter half of the last dimension of `input`, and the first half is
// used as the plain multiplier.
//   out:   [..., d]
//   input: [..., 2 * d]
void mul_and_silu(torch::Tensor& out, torch::Tensor& input) {
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, false);
}
98
+
99
// Gated GELU (erf form): launches the gating kernel with vllm::gelu_kernel
// and ACT_FIRST = true, i.e. GELU of the first half of the last dimension
// multiplied by the second half.
//   out:   [..., d]
//   input: [..., 2 * d]
void gelu_and_mul(torch::Tensor& out, torch::Tensor& input) {
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, true);
}
104
+
105
// Gated GELU (tanh approximation): launches the gating kernel with
// vllm::gelu_tanh_kernel and ACT_FIRST = true, i.e. the activation of the
// first half of the last dimension multiplied by the second half.
//   out:   [..., d]
//   input: [..., 2 * d]
void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input) {
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel, true);
}
110
+
111
namespace vllm {

// Thresholded ReLU: returns x when it is strictly greater than `threshold`
// and zero otherwise. The comparison is carried out in float.
template <typename T>
__device__ __forceinline__ T fatrelu_kernel(const T& x, const float threshold) {
  const float value = (float)x;
  if (value > threshold) {
    return (T)value;
  }
  return (T)0.0f;
}

// Gating kernel for activations that take one extra float parameter.
// blockIdx.x selects the token (row); the threads of the block step through
// the d output columns in increments of blockDim.x and store
// ACT_FN(first_half, param) * second_half.
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&, const float)>
__global__ void act_and_mul_kernel_with_param(
    scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
    const float param) {
  const int64_t row = blockIdx.x;
  const int64_t in_base = row * 2 * d;
  const int64_t out_base = row * d;
  for (int64_t col = threadIdx.x; col < d; col += blockDim.x) {
    const scalar_t first = VLLM_LDG(&input[in_base + col]);
    const scalar_t second = VLLM_LDG(&input[in_base + d + col]);
    out[out_base + col] = ACT_FN(first, param) * second;
  }
}

}  // namespace vllm
132
+
133
// Launch the parameterised activation and gating kernel.
//
// Expects tensors named `out` and `input` in the enclosing function and
// declares `d`, `num_tokens`, `grid` and `block` in that scope. PARAM is
// forwarded to the kernel's `float` parameter. One thread block is launched
// per token, with min(d, 1024) threads.
//
// Empty work returns before any size arithmetic, matching
// LAUNCH_ACTIVATION_GATE_KERNEL: when `input` has no elements or its last
// dimension is shorter than 2 there is nothing to write, and continuing would
// either divide by a zero-length last dimension or launch a kernel with a
// zero-sized grid/block (an invalid launch configuration).
#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM)         \
  if (input.numel() == 0 || input.size(-1) < 2) {                       \
    return;                                                             \
  }                                                                     \
  int d = input.size(-1) / 2;                                           \
  int64_t num_tokens = input.numel() / input.size(-1);                  \
  dim3 grid(num_tokens);                                                \
  dim3 block(std::min(d, 1024));                                        \
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));     \
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();         \
  VLLM_DISPATCH_FLOATING_TYPES(                                         \
      input.scalar_type(), "act_and_mul_kernel_with_param", [&] {       \
        vllm::act_and_mul_kernel_with_param<scalar_t, KERNEL<scalar_t>> \
            <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),      \
                                         input.data_ptr<scalar_t>(), d, \
                                         PARAM);                        \
      });
147
+
148
// Gated FATReLU: launches the parameterised gating kernel with
// vllm::fatrelu_kernel. `threshold` arrives as a double and is converted to
// the kernel's float parameter at the launch.
//   out:   [..., d]
//   input: [..., 2 * d]
void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input,
                     double threshold) {
  LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold);
}
153
namespace vllm {

// Element-wise activation kernel template.
// blockIdx.x selects the token (row); the threads of the block step through
// the d columns in increments of blockDim.x and store ACT_FN(input) at the
// same position in `out`.
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
__global__ void activation_kernel(
    scalar_t* __restrict__ out,          // [..., d]
    const scalar_t* __restrict__ input,  // [..., d]
    const int d) {
  const int64_t row = blockIdx.x;
  const int64_t base = row * d;
  for (int64_t col = threadIdx.x; col < d; col += blockDim.x) {
    const scalar_t value = VLLM_LDG(&input[base + col]);
    out[base + col] = ACT_FN(value);
  }
}

}  // namespace vllm
169
+
170
// Launch element-wise activation kernel.
//
// Expects tensors named `out` and `input` in the enclosing function and
// declares `d`, `num_tokens`, `grid` and `block` in that scope. One thread
// block is launched per token, with min(d, 1024) threads.
//
// An input without elements returns immediately: it has nothing to compute,
// and continuing would divide by a zero-length last dimension or launch a
// kernel with a zero-sized grid/block (an invalid launch configuration).
#define LAUNCH_ACTIVATION_KERNEL(KERNEL)                                       \
  if (input.numel() == 0) {                                                    \
    return;                                                                    \
  }                                                                            \
  int d = input.size(-1);                                                      \
  int64_t num_tokens = input.numel() / d;                                      \
  dim3 grid(num_tokens);                                                       \
  dim3 block(std::min(d, 1024));                                               \
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));            \
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();                \
  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \
    vllm::activation_kernel<scalar_t, KERNEL<scalar_t>>                        \
        <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),                 \
                                     input.data_ptr<scalar_t>(), d);           \
  });
183
+
184
namespace vllm {

// "New" GELU: 0.5 * x * (1 + tanh(0.79788456 * (x + 0.044715 * x^3))).
// Intermediate results are rounded to T at the same points as the reference
// expression (the cube, the scaled cube, the sum and the tanh argument).
template <typename T>
__device__ __forceinline__ T gelu_new_kernel(const T& x) {
  const float cubed = (float)(x * x * x);
  const T scaled_cube = (T)(0.044715f * cubed);
  const T poly = x + scaled_cube;
  const T tanh_arg = (T)(0.79788456f * (float)poly);
  const T t = (T)tanhf(tanh_arg);
  const T half_x = ((T)0.5) * x;
  return half_x * (((T)1.0) + t);
}

// "Fast" GELU: 0.5 * x * (1 + tanh(0.79788456 * x * (1 + 0.044715 * x * x))).
// The two float products are rounded to T before being combined in T.
template <typename T>
__device__ __forceinline__ T gelu_fast_kernel(const T& x) {
  const float value = (float)x;
  const T scaled = (T)(value * 0.79788456f);
  const T quad_coeff = (T)(0.044715f * value);
  const T inner = ((T)1.0) + quad_coeff * x;
  const T t = (T)tanhf(scaled * inner);
  const T half_x = ((T)0.5) * x;
  return half_x * (((T)1.0) + t);
}

// "Quick" GELU: x * sigmoid(1.702 * x), evaluated in float.
template <typename T>
__device__ __forceinline__ T gelu_quick_kernel(const T& x) {
  const float value = (float)x;
  const float denom = 1.0f + expf(-1.702f * value);
  return (T)(value / denom);
}

}  // namespace vllm
208
+
209
// Element-wise "new" GELU: launches the activation kernel with
// vllm::gelu_new_kernel.
//   out:   [..., d]
//   input: [..., d]
void gelu_new(torch::Tensor& out, torch::Tensor& input) {
  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
}
214
+
215
// Element-wise "fast" GELU: launches the activation kernel with
// vllm::gelu_fast_kernel.
//   out:   [..., d]
//   input: [..., d]
void gelu_fast(torch::Tensor& out, torch::Tensor& input) {
  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
}
220
+
221
// Element-wise "quick" GELU: launches the activation kernel with
// vllm::gelu_quick_kernel.
//   out:   [..., d]
//   input: [..., d]
void gelu_quick(torch::Tensor& out, torch::Tensor& input) {
  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel);
}
activation/cuda_compat.h ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #ifdef USE_ROCM
4
+ #include <hip/hip_runtime.h>
5
+ #endif
6
+
7
+ #if defined(USE_ROCM) && defined(__GFX9__)
8
+ #define WARP_SIZE 64
9
+ #else
10
+ #define WARP_SIZE 32
11
+ #endif
12
+
13
// Load helper used by the kernels for read-only inputs. Without USE_ROCM it
// expands to __ldg(arg); with USE_ROCM it is a plain dereference.
// The dereference is fully parenthesized so the macro always acts as a single
// expression: with a bare `*(arg)`, a use such as VLLM_LDG(p).field or
// VLLM_LDG(p)[i] would bind the postfix operator to `(arg)` before the `*`.
#ifndef USE_ROCM
  #define VLLM_LDG(arg) __ldg(arg)
#else
  #define VLLM_LDG(arg) (*(arg))
#endif
18
+
19
+ #ifndef USE_ROCM
20
+ #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
21
+ __shfl_xor_sync(uint32_t(-1), var, lane_mask)
22
+ #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
23
+ __shfl_xor_sync(uint32_t(-1), var, lane_mask, width)
24
+ #else
25
+ #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
26
+ #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
27
+ __shfl_xor(var, lane_mask, width)
28
+ #endif
29
+
30
+ #ifndef USE_ROCM
31
+ #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane)
32
+ #else
33
+ #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane)
34
+ #endif
35
+
36
+ #ifndef USE_ROCM
37
+ #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \
38
+ __shfl_down_sync(uint32_t(-1), var, lane_delta)
39
+ #else
40
+ #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta)
41
+ #endif
42
+
43
+ #ifndef USE_ROCM
44
+ #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
45
+ cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL)
46
+ #else
47
+ #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
48
+ hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL)
49
+ #endif
activation/dispatch_utils.h ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Adapted from
3
+ * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h
4
+ */
5
+ #pragma once
6
+
7
+ #include <torch/all.h>
8
+
9
+ // Need a special dispatch case macro since we will nest the FP8 dispatch.
10
+ // Instead of the usual 'scalar_t', this names the dispatched type 'fp8_t'.
11
+ #define AT_DISPATCH_FP8_CASE(enum_type, ...) \
12
+ AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, fp8_t, __VA_ARGS__)
13
+
14
+ #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
15
+ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
16
+ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
17
+ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
18
+
19
+ #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
20
+ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
21
+
22
+ // ROCm devices might use either fn or fnuz, so set up dispatch table for both.
23
+ // A host-based check at runtime will create a preferred FP8 type for ROCm
24
+ // such that the correct kernel is dispatched.
25
+ #ifdef USE_ROCM
26
+ #define VLLM_DISPATCH_CASE_FP8_TYPES(...) \
27
+ AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
28
+ AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fnuz, __VA_ARGS__)
29
+
30
+ #define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \
31
+ AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
32
+ AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fnuz, __VA_ARGS__) \
33
+ AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)
34
+ #else
35
+ #define VLLM_DISPATCH_CASE_FP8_TYPES(...) \
36
+ AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)
37
+
38
+ #define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \
39
+ AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
40
+ AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)
41
+ #endif
42
+
43
+ // When using this dispatch macro, the type is 'fp8_t' not 'scalar_t'.
44
+ // See AT_DISPATCH_FP8_CASE above.
45
+ #define VLLM_DISPATCH_FP8_TYPES(TYPE, NAME, ...) \
46
+ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FP8_TYPES(__VA_ARGS__))
47
+
48
+ #define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) \
49
+ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__))
50
+
51
+ #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \
52
+ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
53
+ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
54
+ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
55
+ AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
56
+
57
+ #define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \
58
+ AT_DISPATCH_SWITCH(TYPE, NAME, \
59
+ VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__))
60
+
61
+ #define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \
62
+ AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \
63
+ AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \
64
+ AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \
65
+ AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \
66
+ AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
67
+
68
+ #define VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(...) \
69
+ AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \
70
+ AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \
71
+ AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \
72
+ AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \
73
+ AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) \
74
+ AT_DISPATCH_CASE(at::ScalarType::UInt16, __VA_ARGS__) \
75
+ AT_DISPATCH_CASE(at::ScalarType::UInt32, __VA_ARGS__) \
76
+ AT_DISPATCH_CASE(at::ScalarType::UInt64, __VA_ARGS__)
77
+
78
+ #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
79
+ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
80
+
81
+ #define VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(TYPE, NAME, ...) \
82
+ AT_DISPATCH_SWITCH( \
83
+ TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(__VA_ARGS__))
benchmarks/benchmark.py DELETED
@@ -1,5 +0,0 @@
1
- from kernels.benchmarks import SiluAndMulBenchmark
2
-
3
-
4
- class SiluWorkloads(SiluAndMulBenchmark):
5
- pass
 
 
 
 
 
 
build.toml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [general]
2
+ name = "activation"
3
+ universal = false
4
+
5
+ [torch]
6
+ src = [
7
+ "torch-ext/torch_binding.cpp",
8
+ "torch-ext/torch_binding.h",
9
+ ]
10
+
11
+ [kernel.activation]
12
+ backend = "cuda"
13
+ depends = ["torch"]
14
+ src = [
15
+ "activation/activation_kernels.cu",
16
+ "activation/cuda_compat.h",
17
+ "activation/dispatch_utils.h",
18
+ ]
build/torch210-cu128-x86_64-windows/activation/__init__.py DELETED
@@ -1,75 +0,0 @@
1
- import torch
2
-
3
- from ._ops import ops
4
-
5
- from . import layers
6
-
7
-
8
- def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
9
- ops.silu_and_mul(out, x)
10
- return out
11
-
12
-
13
- def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None:
14
- ops.mul_and_silu(out, x)
15
- return out
16
-
17
-
18
- def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
19
- ops.gelu_and_mul(out, x)
20
- return out
21
-
22
-
23
- def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
24
- ops.gelu_tanh_and_mul(out, x)
25
- return out
26
-
27
-
28
- def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None:
29
- ops.fatrelu_and_mul(out, x, threshold)
30
- return out
31
-
32
-
33
- def gelu(out: torch.Tensor, x: torch.Tensor) -> None:
34
- ops.gelu(out, x)
35
- return out
36
-
37
- def silu(out: torch.Tensor, x: torch.Tensor) -> None:
38
- ops.silu(out, x)
39
- return out
40
-
41
-
42
- def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None:
43
- ops.gelu_tanh(out, x)
44
- return out
45
-
46
-
47
- def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
48
- ops.gelu_fast(out, x)
49
- return out
50
-
51
-
52
- def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
53
- ops.gelu_new(out, x)
54
- return out
55
-
56
-
57
- def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
58
- ops.gelu_quick(out, x)
59
- return out
60
-
61
-
62
- __all__ = [
63
- "silu_and_mul",
64
- "mul_and_silu",
65
- "gelu_and_mul",
66
- "gelu_tanh_and_mul",
67
- "fatrelu_and_mul",
68
- "gelu_fast",
69
- "gelu_new",
70
- "gelu_quick",
71
- "gelu_tanh",
72
- "silu",
73
- "gelu",
74
- "layers",
75
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cu128-x86_64-windows/activation/_ops.py DELETED
@@ -1,9 +0,0 @@
1
- import torch
2
- from . import _activation_e1b4b08
3
- ops = torch.ops._activation_e1b4b08
4
-
5
- def add_op_namespace_prefix(op_name: str):
6
- """
7
- Prefix op by namespace.
8
- """
9
- return f"_activation_e1b4b08::{op_name}"
 
 
 
 
 
 
 
 
 
 
build/torch210-cu128-x86_64-windows/activation/layers.py DELETED
@@ -1,201 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
-
4
- from ._ops import ops
5
-
6
-
7
- class SiluAndMul(nn.Module):
8
- """An activation function for SwiGLU.
9
-
10
- The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
11
-
12
- Shapes:
13
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
14
- return: (num_tokens, d) or (batch_size, seq_len, d)
15
- """
16
-
17
- can_torch_compile: bool = True
18
-
19
- def forward(self, x: torch.Tensor):
20
- if not x.is_contiguous():
21
- x = x.contiguous()
22
- d = x.shape[-1] // 2
23
- output_shape = x.shape[:-1] + (d,)
24
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
25
- ops.silu_and_mul(out, x)
26
- return out
27
-
28
- class Silu(nn.Module):
29
- """An activation function for SiLU.
30
-
31
- The function computes x -> silu(x).
32
-
33
- Shapes:
34
- x: (num_tokens, d) or (batch_size, seq_len, d)
35
- return: (num_tokens, d) or (batch_size, seq_len, d)
36
- """
37
-
38
- can_torch_compile: bool = True
39
-
40
- def forward(self, x: torch.Tensor):
41
- if not x.is_contiguous():
42
- x = x.contiguous()
43
- out = torch.empty_like(x)
44
- ops.silu(out, x)
45
- return out
46
-
47
- class Gelu(nn.Module):
48
- """An activation function for GELU.
49
-
50
- The function computes x -> gelu(x).
51
-
52
- Shapes:
53
- x: (num_tokens, d) or (batch_size, seq_len, d)
54
- return: (num_tokens, d) or (batch_size, seq_len, d)
55
- """
56
-
57
- can_torch_compile: bool = True
58
-
59
- def forward(self, x: torch.Tensor):
60
- if not x.is_contiguous():
61
- x = x.contiguous()
62
- out = torch.empty_like(x)
63
- ops.gelu(out, x)
64
- return out
65
-
66
- class GeluTanh(nn.Module):
67
- """An activation function for GELU with `tanh` approximation.
68
-
69
- The function computes x -> gelu_tanh(x).
70
-
71
- Shapes:
72
- x: (num_tokens, d) or (batch_size, seq_len, d)
73
- return: (num_tokens, d) or (batch_size, seq_len, d)
74
- """
75
-
76
- can_torch_compile: bool = True
77
-
78
- def forward(self, x: torch.Tensor):
79
- if not x.is_contiguous():
80
- x = x.contiguous()
81
- out = torch.empty_like(x)
82
- ops.gelu_tanh(out, x)
83
- return out
84
-
85
-
86
- class MulAndSilu(nn.Module):
87
- """An activation function for SwiGLU.
88
-
89
- The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.
90
-
91
- Shapes:
92
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
93
- return: (num_tokens, d) or (batch_size, seq_len, d)
94
- """
95
-
96
- can_torch_compile: bool = True
97
-
98
- def forward(self, x: torch.Tensor) -> torch.Tensor:
99
- if not x.is_contiguous():
100
- x = x.contiguous()
101
- d = x.shape[-1] // 2
102
- output_shape = x.shape[:-1] + (d,)
103
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
104
- ops.mul_and_silu(out, x)
105
- return out
106
-
107
-
108
- class GeluAndMul(nn.Module):
109
- """An activation function for GeGLU.
110
-
111
- The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
112
-
113
- Shapes:
114
- x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
115
- return: (batch_size, seq_len, d) or (num_tokens, d)
116
- """
117
-
118
- can_torch_compile: bool = True
119
-
120
- def forward(self, x: torch.Tensor):
121
- if not x.is_contiguous():
122
- x = x.contiguous()
123
- d = x.shape[-1] // 2
124
- output_shape = x.shape[:-1] + (d,)
125
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
126
- ops.gelu_and_mul(out, x)
127
- return out
128
-
129
-
130
- class GeluTanhAndMul(nn.Module):
131
- can_torch_compile: bool = True
132
-
133
- def forward(self, x: torch.Tensor):
134
- if not x.is_contiguous():
135
- x = x.contiguous()
136
- d = x.shape[-1] // 2
137
- output_shape = x.shape[:-1] + (d,)
138
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
139
- ops.gelu_tanh_and_mul(out, x)
140
- return out
141
-
142
-
143
- class FatreluAndMul(nn.Module):
144
- """An activation function for FATReLU.
145
-
146
- The function computes x -> FATReLU(x[:d]) * x[d:] where
147
- d = x.shape[-1] // 2.
148
- This is used in openbmb/MiniCPM-S-1B-sft.
149
-
150
- Shapes:
151
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
152
- return: (num_tokens, d) or (batch_size, seq_len, d)
153
- """
154
-
155
- can_torch_compile: bool = True
156
-
157
- def __init__(self, threshold: float = 0.0):
158
- super().__init__()
159
- self.threshold = threshold
160
-
161
- def forward(self, x: torch.Tensor):
162
- if not x.is_contiguous():
163
- x = x.contiguous()
164
- d = x.shape[-1] // 2
165
- output_shape = x.shape[:-1] + (d,)
166
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
167
- ops.fatrelu_and_mul(out, x, self.threshold)
168
- return out
169
-
170
-
171
- class FastGELU(nn.Module):
172
- can_torch_compile: bool = True
173
-
174
- def forward(self, x: torch.Tensor) -> torch.Tensor:
175
- if not x.is_contiguous():
176
- x = x.contiguous()
177
- out = torch.empty_like(x)
178
- ops.gelu_fast(out, x)
179
- return out
180
-
181
-
182
- class NewGELU(nn.Module):
183
- can_torch_compile: bool = True
184
-
185
- def forward(self, x: torch.Tensor) -> torch.Tensor:
186
- if not x.is_contiguous():
187
- x = x.contiguous()
188
- out = torch.empty_like(x)
189
- ops.gelu_new(out, x)
190
- return out
191
-
192
-
193
- class QuickGELU(nn.Module):
194
- can_torch_compile: bool = True
195
-
196
- def forward(self, x: torch.Tensor) -> torch.Tensor:
197
- if not x.is_contiguous():
198
- x = x.contiguous()
199
- out = torch.empty_like(x)
200
- ops.gelu_quick(out, x)
201
- return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cu128-x86_64-windows/metadata.json DELETED
@@ -1,4 +0,0 @@
1
- {
2
- "version": 1,
3
- "python-depends": []
4
- }
 
 
 
 
 
build/torch210-cxx11-cu126-aarch64-linux/activation/__init__.py DELETED
@@ -1,26 +0,0 @@
1
- import ctypes
2
- import importlib.util
3
- import sys
4
- from pathlib import Path
5
- from types import ModuleType
6
-
7
-
8
- def _import_from_path(file_path: Path) -> ModuleType:
9
- # We cannot use the module name as-is, after adding it to `sys.modules`,
10
- # it would also be used for other imports. So, we make a module name that
11
- # depends on the path for it to be unique using the hex-encoded hash of
12
- # the path.
13
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
- module_name = path_hash
15
- spec = importlib.util.spec_from_file_location(module_name, file_path)
16
- if spec is None:
17
- raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
- module = importlib.util.module_from_spec(spec)
19
- if module is None:
20
- raise ImportError(f"Cannot load module {module_name} from spec")
21
- sys.modules[module_name] = module
22
- spec.loader.exec_module(module) # type: ignore
23
- return module
24
-
25
-
26
- globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu126-aarch64-linux/layers.py DELETED
@@ -1,201 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
-
4
- from ._ops import ops
5
-
6
-
7
- class SiluAndMul(nn.Module):
8
- """An activation function for SwiGLU.
9
-
10
- The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
11
-
12
- Shapes:
13
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
14
- return: (num_tokens, d) or (batch_size, seq_len, d)
15
- """
16
-
17
- can_torch_compile: bool = True
18
-
19
- def forward(self, x: torch.Tensor):
20
- if not x.is_contiguous():
21
- x = x.contiguous()
22
- d = x.shape[-1] // 2
23
- output_shape = x.shape[:-1] + (d,)
24
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
25
- ops.silu_and_mul(out, x)
26
- return out
27
-
28
- class Silu(nn.Module):
29
- """An activation function for SiLU.
30
-
31
- The function computes x -> silu(x).
32
-
33
- Shapes:
34
- x: (num_tokens, d) or (batch_size, seq_len, d)
35
- return: (num_tokens, d) or (batch_size, seq_len, d)
36
- """
37
-
38
- can_torch_compile: bool = True
39
-
40
- def forward(self, x: torch.Tensor):
41
- if not x.is_contiguous():
42
- x = x.contiguous()
43
- out = torch.empty_like(x)
44
- ops.silu(out, x)
45
- return out
46
-
47
- class Gelu(nn.Module):
48
- """An activation function for GELU.
49
-
50
- The function computes x -> gelu(x).
51
-
52
- Shapes:
53
- x: (num_tokens, d) or (batch_size, seq_len, d)
54
- return: (num_tokens, d) or (batch_size, seq_len, d)
55
- """
56
-
57
- can_torch_compile: bool = True
58
-
59
- def forward(self, x: torch.Tensor):
60
- if not x.is_contiguous():
61
- x = x.contiguous()
62
- out = torch.empty_like(x)
63
- ops.gelu(out, x)
64
- return out
65
-
66
- class GeluTanh(nn.Module):
67
- """An activation function for GELU with `tanh` approximation.
68
-
69
- The function computes x -> gelu_tanh(x).
70
-
71
- Shapes:
72
- x: (num_tokens, d) or (batch_size, seq_len, d)
73
- return: (num_tokens, d) or (batch_size, seq_len, d)
74
- """
75
-
76
- can_torch_compile: bool = True
77
-
78
- def forward(self, x: torch.Tensor):
79
- if not x.is_contiguous():
80
- x = x.contiguous()
81
- out = torch.empty_like(x)
82
- ops.gelu_tanh(out, x)
83
- return out
84
-
85
-
86
- class MulAndSilu(nn.Module):
87
- """An activation function for SwiGLU.
88
-
89
- The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.
90
-
91
- Shapes:
92
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
93
- return: (num_tokens, d) or (batch_size, seq_len, d)
94
- """
95
-
96
- can_torch_compile: bool = True
97
-
98
- def forward(self, x: torch.Tensor) -> torch.Tensor:
99
- if not x.is_contiguous():
100
- x = x.contiguous()
101
- d = x.shape[-1] // 2
102
- output_shape = x.shape[:-1] + (d,)
103
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
104
- ops.mul_and_silu(out, x)
105
- return out
106
-
107
-
108
- class GeluAndMul(nn.Module):
109
- """An activation function for GeGLU.
110
-
111
- The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
112
-
113
- Shapes:
114
- x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
115
- return: (batch_size, seq_len, d) or (num_tokens, d)
116
- """
117
-
118
- can_torch_compile: bool = True
119
-
120
- def forward(self, x: torch.Tensor):
121
- if not x.is_contiguous():
122
- x = x.contiguous()
123
- d = x.shape[-1] // 2
124
- output_shape = x.shape[:-1] + (d,)
125
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
126
- ops.gelu_and_mul(out, x)
127
- return out
128
-
129
-
130
- class GeluTanhAndMul(nn.Module):
131
- can_torch_compile: bool = True
132
-
133
- def forward(self, x: torch.Tensor):
134
- if not x.is_contiguous():
135
- x = x.contiguous()
136
- d = x.shape[-1] // 2
137
- output_shape = x.shape[:-1] + (d,)
138
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
139
- ops.gelu_tanh_and_mul(out, x)
140
- return out
141
-
142
-
143
- class FatreluAndMul(nn.Module):
144
- """An activation function for FATReLU.
145
-
146
- The function computes x -> FATReLU(x[:d]) * x[d:] where
147
- d = x.shape[-1] // 2.
148
- This is used in openbmb/MiniCPM-S-1B-sft.
149
-
150
- Shapes:
151
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
152
- return: (num_tokens, d) or (batch_size, seq_len, d)
153
- """
154
-
155
- can_torch_compile: bool = True
156
-
157
- def __init__(self, threshold: float = 0.0):
158
- super().__init__()
159
- self.threshold = threshold
160
-
161
- def forward(self, x: torch.Tensor):
162
- if not x.is_contiguous():
163
- x = x.contiguous()
164
- d = x.shape[-1] // 2
165
- output_shape = x.shape[:-1] + (d,)
166
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
167
- ops.fatrelu_and_mul(out, x, self.threshold)
168
- return out
169
-
170
-
171
- class FastGELU(nn.Module):
172
- can_torch_compile: bool = True
173
-
174
- def forward(self, x: torch.Tensor) -> torch.Tensor:
175
- if not x.is_contiguous():
176
- x = x.contiguous()
177
- out = torch.empty_like(x)
178
- ops.gelu_fast(out, x)
179
- return out
180
-
181
-
182
- class NewGELU(nn.Module):
183
- can_torch_compile: bool = True
184
-
185
- def forward(self, x: torch.Tensor) -> torch.Tensor:
186
- if not x.is_contiguous():
187
- x = x.contiguous()
188
- out = torch.empty_like(x)
189
- ops.gelu_new(out, x)
190
- return out
191
-
192
-
193
- class QuickGELU(nn.Module):
194
- can_torch_compile: bool = True
195
-
196
- def forward(self, x: torch.Tensor) -> torch.Tensor:
197
- if not x.is_contiguous():
198
- x = x.contiguous()
199
- out = torch.empty_like(x)
200
- ops.gelu_quick(out, x)
201
- return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu126-aarch64-linux/metadata.json DELETED
@@ -1,18 +0,0 @@
1
- {
2
- "version": 1,
3
- "license": "Apache-2.0",
4
- "python-depends": [],
5
- "backend": {
6
- "type": "cuda",
7
- "archs": [
8
- "7.0",
9
- "7.2",
10
- "7.5",
11
- "8.0",
12
- "8.6",
13
- "8.7",
14
- "8.9",
15
- "9.0+PTX"
16
- ]
17
- }
18
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu126-x86_64-linux/activation/__init__.py DELETED
@@ -1,26 +0,0 @@
1
- import ctypes
2
- import importlib.util
3
- import sys
4
- from pathlib import Path
5
- from types import ModuleType
6
-
7
-
8
- def _import_from_path(file_path: Path) -> ModuleType:
9
- # We cannot use the module name as-is, after adding it to `sys.modules`,
10
- # it would also be used for other imports. So, we make a module name that
11
- # depends on the path for it to be unique using the hex-encoded hash of
12
- # the path.
13
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
- module_name = path_hash
15
- spec = importlib.util.spec_from_file_location(module_name, file_path)
16
- if spec is None:
17
- raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
- module = importlib.util.module_from_spec(spec)
19
- if module is None:
20
- raise ImportError(f"Cannot load module {module_name} from spec")
21
- sys.modules[module_name] = module
22
- spec.loader.exec_module(module) # type: ignore
23
- return module
24
-
25
-
26
- globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu126-x86_64-linux/layers.py DELETED
@@ -1,201 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
-
4
- from ._ops import ops
5
-
6
-
7
- class SiluAndMul(nn.Module):
8
- """An activation function for SwiGLU.
9
-
10
- The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
11
-
12
- Shapes:
13
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
14
- return: (num_tokens, d) or (batch_size, seq_len, d)
15
- """
16
-
17
- can_torch_compile: bool = True
18
-
19
- def forward(self, x: torch.Tensor):
20
- if not x.is_contiguous():
21
- x = x.contiguous()
22
- d = x.shape[-1] // 2
23
- output_shape = x.shape[:-1] + (d,)
24
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
25
- ops.silu_and_mul(out, x)
26
- return out
27
-
28
- class Silu(nn.Module):
29
- """An activation function for SiLU.
30
-
31
- The function computes x -> silu(x).
32
-
33
- Shapes:
34
- x: (num_tokens, d) or (batch_size, seq_len, d)
35
- return: (num_tokens, d) or (batch_size, seq_len, d)
36
- """
37
-
38
- can_torch_compile: bool = True
39
-
40
- def forward(self, x: torch.Tensor):
41
- if not x.is_contiguous():
42
- x = x.contiguous()
43
- out = torch.empty_like(x)
44
- ops.silu(out, x)
45
- return out
46
-
47
- class Gelu(nn.Module):
48
- """An activation function for GELU.
49
-
50
- The function computes x -> gelu(x).
51
-
52
- Shapes:
53
- x: (num_tokens, d) or (batch_size, seq_len, d)
54
- return: (num_tokens, d) or (batch_size, seq_len, d)
55
- """
56
-
57
- can_torch_compile: bool = True
58
-
59
- def forward(self, x: torch.Tensor):
60
- if not x.is_contiguous():
61
- x = x.contiguous()
62
- out = torch.empty_like(x)
63
- ops.gelu(out, x)
64
- return out
65
-
66
- class GeluTanh(nn.Module):
67
- """An activation function for GELU with `tanh` approximation.
68
-
69
- The function computes x -> gelu_tanh(x).
70
-
71
- Shapes:
72
- x: (num_tokens, d) or (batch_size, seq_len, d)
73
- return: (num_tokens, d) or (batch_size, seq_len, d)
74
- """
75
-
76
- can_torch_compile: bool = True
77
-
78
- def forward(self, x: torch.Tensor):
79
- if not x.is_contiguous():
80
- x = x.contiguous()
81
- out = torch.empty_like(x)
82
- ops.gelu_tanh(out, x)
83
- return out
84
-
85
-
86
- class MulAndSilu(nn.Module):
87
- """An activation function for SwiGLU.
88
-
89
- The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.
90
-
91
- Shapes:
92
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
93
- return: (num_tokens, d) or (batch_size, seq_len, d)
94
- """
95
-
96
- can_torch_compile: bool = True
97
-
98
- def forward(self, x: torch.Tensor) -> torch.Tensor:
99
- if not x.is_contiguous():
100
- x = x.contiguous()
101
- d = x.shape[-1] // 2
102
- output_shape = x.shape[:-1] + (d,)
103
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
104
- ops.mul_and_silu(out, x)
105
- return out
106
-
107
-
108
- class GeluAndMul(nn.Module):
109
- """An activation function for GeGLU.
110
-
111
- The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
112
-
113
- Shapes:
114
- x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
115
- return: (batch_size, seq_len, d) or (num_tokens, d)
116
- """
117
-
118
- can_torch_compile: bool = True
119
-
120
- def forward(self, x: torch.Tensor):
121
- if not x.is_contiguous():
122
- x = x.contiguous()
123
- d = x.shape[-1] // 2
124
- output_shape = x.shape[:-1] + (d,)
125
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
126
- ops.gelu_and_mul(out, x)
127
- return out
128
-
129
-
130
- class GeluTanhAndMul(nn.Module):
131
- can_torch_compile: bool = True
132
-
133
- def forward(self, x: torch.Tensor):
134
- if not x.is_contiguous():
135
- x = x.contiguous()
136
- d = x.shape[-1] // 2
137
- output_shape = x.shape[:-1] + (d,)
138
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
139
- ops.gelu_tanh_and_mul(out, x)
140
- return out
141
-
142
-
143
- class FatreluAndMul(nn.Module):
144
- """An activation function for FATReLU.
145
-
146
- The function computes x -> FATReLU(x[:d]) * x[d:] where
147
- d = x.shape[-1] // 2.
148
- This is used in openbmb/MiniCPM-S-1B-sft.
149
-
150
- Shapes:
151
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
152
- return: (num_tokens, d) or (batch_size, seq_len, d)
153
- """
154
-
155
- can_torch_compile: bool = True
156
-
157
- def __init__(self, threshold: float = 0.0):
158
- super().__init__()
159
- self.threshold = threshold
160
-
161
- def forward(self, x: torch.Tensor):
162
- if not x.is_contiguous():
163
- x = x.contiguous()
164
- d = x.shape[-1] // 2
165
- output_shape = x.shape[:-1] + (d,)
166
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
167
- ops.fatrelu_and_mul(out, x, self.threshold)
168
- return out
169
-
170
-
171
- class FastGELU(nn.Module):
172
- can_torch_compile: bool = True
173
-
174
- def forward(self, x: torch.Tensor) -> torch.Tensor:
175
- if not x.is_contiguous():
176
- x = x.contiguous()
177
- out = torch.empty_like(x)
178
- ops.gelu_fast(out, x)
179
- return out
180
-
181
-
182
- class NewGELU(nn.Module):
183
- can_torch_compile: bool = True
184
-
185
- def forward(self, x: torch.Tensor) -> torch.Tensor:
186
- if not x.is_contiguous():
187
- x = x.contiguous()
188
- out = torch.empty_like(x)
189
- ops.gelu_new(out, x)
190
- return out
191
-
192
-
193
- class QuickGELU(nn.Module):
194
- can_torch_compile: bool = True
195
-
196
- def forward(self, x: torch.Tensor) -> torch.Tensor:
197
- if not x.is_contiguous():
198
- x = x.contiguous()
199
- out = torch.empty_like(x)
200
- ops.gelu_quick(out, x)
201
- return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu126-x86_64-linux/metadata.json DELETED
@@ -1,18 +0,0 @@
1
- {
2
- "version": 1,
3
- "license": "Apache-2.0",
4
- "python-depends": [],
5
- "backend": {
6
- "type": "cuda",
7
- "archs": [
8
- "7.0",
9
- "7.2",
10
- "7.5",
11
- "8.0",
12
- "8.6",
13
- "8.7",
14
- "8.9",
15
- "9.0+PTX"
16
- ]
17
- }
18
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu128-aarch64-linux/activation/__init__.py DELETED
@@ -1,26 +0,0 @@
1
- import ctypes
2
- import importlib.util
3
- import sys
4
- from pathlib import Path
5
- from types import ModuleType
6
-
7
-
8
- def _import_from_path(file_path: Path) -> ModuleType:
9
- # We cannot use the module name as-is, after adding it to `sys.modules`,
10
- # it would also be used for other imports. So, we make a module name that
11
- # depends on the path for it to be unique using the hex-encoded hash of
12
- # the path.
13
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
- module_name = path_hash
15
- spec = importlib.util.spec_from_file_location(module_name, file_path)
16
- if spec is None:
17
- raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
- module = importlib.util.module_from_spec(spec)
19
- if module is None:
20
- raise ImportError(f"Cannot load module {module_name} from spec")
21
- sys.modules[module_name] = module
22
- spec.loader.exec_module(module) # type: ignore
23
- return module
24
-
25
-
26
- globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu128-aarch64-linux/layers.py DELETED
@@ -1,201 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
-
4
- from ._ops import ops
5
-
6
-
7
- class SiluAndMul(nn.Module):
8
- """An activation function for SwiGLU.
9
-
10
- The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
11
-
12
- Shapes:
13
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
14
- return: (num_tokens, d) or (batch_size, seq_len, d)
15
- """
16
-
17
- can_torch_compile: bool = True
18
-
19
- def forward(self, x: torch.Tensor):
20
- if not x.is_contiguous():
21
- x = x.contiguous()
22
- d = x.shape[-1] // 2
23
- output_shape = x.shape[:-1] + (d,)
24
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
25
- ops.silu_and_mul(out, x)
26
- return out
27
-
28
- class Silu(nn.Module):
29
- """An activation function for SiLU.
30
-
31
- The function computes x -> silu(x).
32
-
33
- Shapes:
34
- x: (num_tokens, d) or (batch_size, seq_len, d)
35
- return: (num_tokens, d) or (batch_size, seq_len, d)
36
- """
37
-
38
- can_torch_compile: bool = True
39
-
40
- def forward(self, x: torch.Tensor):
41
- if not x.is_contiguous():
42
- x = x.contiguous()
43
- out = torch.empty_like(x)
44
- ops.silu(out, x)
45
- return out
46
-
47
- class Gelu(nn.Module):
48
- """An activation function for GELU.
49
-
50
- The function computes x -> gelu(x).
51
-
52
- Shapes:
53
- x: (num_tokens, d) or (batch_size, seq_len, d)
54
- return: (num_tokens, d) or (batch_size, seq_len, d)
55
- """
56
-
57
- can_torch_compile: bool = True
58
-
59
- def forward(self, x: torch.Tensor):
60
- if not x.is_contiguous():
61
- x = x.contiguous()
62
- out = torch.empty_like(x)
63
- ops.gelu(out, x)
64
- return out
65
-
66
- class GeluTanh(nn.Module):
67
- """An activation function for GELU with `tanh` approximation.
68
-
69
- The function computes x -> gelu_tanh(x).
70
-
71
- Shapes:
72
- x: (num_tokens, d) or (batch_size, seq_len, d)
73
- return: (num_tokens, d) or (batch_size, seq_len, d)
74
- """
75
-
76
- can_torch_compile: bool = True
77
-
78
- def forward(self, x: torch.Tensor):
79
- if not x.is_contiguous():
80
- x = x.contiguous()
81
- out = torch.empty_like(x)
82
- ops.gelu_tanh(out, x)
83
- return out
84
-
85
-
86
- class MulAndSilu(nn.Module):
87
- """An activation function for SwiGLU.
88
-
89
- The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.
90
-
91
- Shapes:
92
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
93
- return: (num_tokens, d) or (batch_size, seq_len, d)
94
- """
95
-
96
- can_torch_compile: bool = True
97
-
98
- def forward(self, x: torch.Tensor) -> torch.Tensor:
99
- if not x.is_contiguous():
100
- x = x.contiguous()
101
- d = x.shape[-1] // 2
102
- output_shape = x.shape[:-1] + (d,)
103
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
104
- ops.mul_and_silu(out, x)
105
- return out
106
-
107
-
108
- class GeluAndMul(nn.Module):
109
- """An activation function for GeGLU.
110
-
111
- The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
112
-
113
- Shapes:
114
- x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
115
- return: (batch_size, seq_len, d) or (num_tokens, d)
116
- """
117
-
118
- can_torch_compile: bool = True
119
-
120
- def forward(self, x: torch.Tensor):
121
- if not x.is_contiguous():
122
- x = x.contiguous()
123
- d = x.shape[-1] // 2
124
- output_shape = x.shape[:-1] + (d,)
125
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
126
- ops.gelu_and_mul(out, x)
127
- return out
128
-
129
-
130
- class GeluTanhAndMul(nn.Module):
131
- can_torch_compile: bool = True
132
-
133
- def forward(self, x: torch.Tensor):
134
- if not x.is_contiguous():
135
- x = x.contiguous()
136
- d = x.shape[-1] // 2
137
- output_shape = x.shape[:-1] + (d,)
138
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
139
- ops.gelu_tanh_and_mul(out, x)
140
- return out
141
-
142
-
143
- class FatreluAndMul(nn.Module):
144
- """An activation function for FATReLU.
145
-
146
- The function computes x -> FATReLU(x[:d]) * x[d:] where
147
- d = x.shape[-1] // 2.
148
- This is used in openbmb/MiniCPM-S-1B-sft.
149
-
150
- Shapes:
151
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
152
- return: (num_tokens, d) or (batch_size, seq_len, d)
153
- """
154
-
155
- can_torch_compile: bool = True
156
-
157
- def __init__(self, threshold: float = 0.0):
158
- super().__init__()
159
- self.threshold = threshold
160
-
161
- def forward(self, x: torch.Tensor):
162
- if not x.is_contiguous():
163
- x = x.contiguous()
164
- d = x.shape[-1] // 2
165
- output_shape = x.shape[:-1] + (d,)
166
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
167
- ops.fatrelu_and_mul(out, x, self.threshold)
168
- return out
169
-
170
-
171
- class FastGELU(nn.Module):
172
- can_torch_compile: bool = True
173
-
174
- def forward(self, x: torch.Tensor) -> torch.Tensor:
175
- if not x.is_contiguous():
176
- x = x.contiguous()
177
- out = torch.empty_like(x)
178
- ops.gelu_fast(out, x)
179
- return out
180
-
181
-
182
- class NewGELU(nn.Module):
183
- can_torch_compile: bool = True
184
-
185
- def forward(self, x: torch.Tensor) -> torch.Tensor:
186
- if not x.is_contiguous():
187
- x = x.contiguous()
188
- out = torch.empty_like(x)
189
- ops.gelu_new(out, x)
190
- return out
191
-
192
-
193
- class QuickGELU(nn.Module):
194
- can_torch_compile: bool = True
195
-
196
- def forward(self, x: torch.Tensor) -> torch.Tensor:
197
- if not x.is_contiguous():
198
- x = x.contiguous()
199
- out = torch.empty_like(x)
200
- ops.gelu_quick(out, x)
201
- return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu128-aarch64-linux/metadata.json DELETED
@@ -1,21 +0,0 @@
1
- {
2
- "version": 1,
3
- "license": "Apache-2.0",
4
- "python-depends": [],
5
- "backend": {
6
- "type": "cuda",
7
- "archs": [
8
- "10.0",
9
- "10.1",
10
- "12.0+PTX",
11
- "7.0",
12
- "7.2",
13
- "7.5",
14
- "8.0",
15
- "8.6",
16
- "8.7",
17
- "8.9",
18
- "9.0"
19
- ]
20
- }
21
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu128-x86_64-linux/_activation_cuda_ccf9ce9.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2946ede3bdc0bffe6e0c1f68dbcdbbdcfcd9c8f2d53723f17159005aac6b247
3
- size 4406632
 
 
 
 
build/torch210-cxx11-cu128-x86_64-linux/_ops.py DELETED
@@ -1,9 +0,0 @@
1
- import torch
2
- from . import _activation_cuda_ccf9ce9
3
- ops = torch.ops._activation_cuda_ccf9ce9
4
-
5
- def add_op_namespace_prefix(op_name: str):
6
- """
7
- Prefix op by namespace.
8
- """
9
- return f"_activation_cuda_ccf9ce9::{op_name}"
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu128-x86_64-linux/activation/__init__.py DELETED
@@ -1,26 +0,0 @@
1
- import ctypes
2
- import importlib.util
3
- import sys
4
- from pathlib import Path
5
- from types import ModuleType
6
-
7
-
8
- def _import_from_path(file_path: Path) -> ModuleType:
9
- # We cannot use the module name as-is, after adding it to `sys.modules`,
10
- # it would also be used for other imports. So, we make a module name that
11
- # depends on the path for it to be unique using the hex-encoded hash of
12
- # the path.
13
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
- module_name = path_hash
15
- spec = importlib.util.spec_from_file_location(module_name, file_path)
16
- if spec is None:
17
- raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
- module = importlib.util.module_from_spec(spec)
19
- if module is None:
20
- raise ImportError(f"Cannot load module {module_name} from spec")
21
- sys.modules[module_name] = module
22
- spec.loader.exec_module(module) # type: ignore
23
- return module
24
-
25
-
26
- globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu128-x86_64-linux/layers.py DELETED
@@ -1,201 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
-
4
- from ._ops import ops
5
-
6
-
7
- class SiluAndMul(nn.Module):
8
- """An activation function for SwiGLU.
9
-
10
- The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
11
-
12
- Shapes:
13
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
14
- return: (num_tokens, d) or (batch_size, seq_len, d)
15
- """
16
-
17
- can_torch_compile: bool = True
18
-
19
- def forward(self, x: torch.Tensor):
20
- if not x.is_contiguous():
21
- x = x.contiguous()
22
- d = x.shape[-1] // 2
23
- output_shape = x.shape[:-1] + (d,)
24
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
25
- ops.silu_and_mul(out, x)
26
- return out
27
-
28
- class Silu(nn.Module):
29
- """An activation function for SiLU.
30
-
31
- The function computes x -> silu(x).
32
-
33
- Shapes:
34
- x: (num_tokens, d) or (batch_size, seq_len, d)
35
- return: (num_tokens, d) or (batch_size, seq_len, d)
36
- """
37
-
38
- can_torch_compile: bool = True
39
-
40
- def forward(self, x: torch.Tensor):
41
- if not x.is_contiguous():
42
- x = x.contiguous()
43
- out = torch.empty_like(x)
44
- ops.silu(out, x)
45
- return out
46
-
47
- class Gelu(nn.Module):
48
- """An activation function for GELU.
49
-
50
- The function computes x -> gelu(x).
51
-
52
- Shapes:
53
- x: (num_tokens, d) or (batch_size, seq_len, d)
54
- return: (num_tokens, d) or (batch_size, seq_len, d)
55
- """
56
-
57
- can_torch_compile: bool = True
58
-
59
- def forward(self, x: torch.Tensor):
60
- if not x.is_contiguous():
61
- x = x.contiguous()
62
- out = torch.empty_like(x)
63
- ops.gelu(out, x)
64
- return out
65
-
66
- class GeluTanh(nn.Module):
67
- """An activation function for GELU with `tanh` approximation.
68
-
69
- The function computes x -> gelu_tanh(x).
70
-
71
- Shapes:
72
- x: (num_tokens, d) or (batch_size, seq_len, d)
73
- return: (num_tokens, d) or (batch_size, seq_len, d)
74
- """
75
-
76
- can_torch_compile: bool = True
77
-
78
- def forward(self, x: torch.Tensor):
79
- if not x.is_contiguous():
80
- x = x.contiguous()
81
- out = torch.empty_like(x)
82
- ops.gelu_tanh(out, x)
83
- return out
84
-
85
-
86
- class MulAndSilu(nn.Module):
87
- """An activation function for SwiGLU.
88
-
89
- The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.
90
-
91
- Shapes:
92
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
93
- return: (num_tokens, d) or (batch_size, seq_len, d)
94
- """
95
-
96
- can_torch_compile: bool = True
97
-
98
- def forward(self, x: torch.Tensor) -> torch.Tensor:
99
- if not x.is_contiguous():
100
- x = x.contiguous()
101
- d = x.shape[-1] // 2
102
- output_shape = x.shape[:-1] + (d,)
103
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
104
- ops.mul_and_silu(out, x)
105
- return out
106
-
107
-
108
- class GeluAndMul(nn.Module):
109
- """An activation function for GeGLU.
110
-
111
- The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
112
-
113
- Shapes:
114
- x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
115
- return: (batch_size, seq_len, d) or (num_tokens, d)
116
- """
117
-
118
- can_torch_compile: bool = True
119
-
120
- def forward(self, x: torch.Tensor):
121
- if not x.is_contiguous():
122
- x = x.contiguous()
123
- d = x.shape[-1] // 2
124
- output_shape = x.shape[:-1] + (d,)
125
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
126
- ops.gelu_and_mul(out, x)
127
- return out
128
-
129
-
130
- class GeluTanhAndMul(nn.Module):
131
- can_torch_compile: bool = True
132
-
133
- def forward(self, x: torch.Tensor):
134
- if not x.is_contiguous():
135
- x = x.contiguous()
136
- d = x.shape[-1] // 2
137
- output_shape = x.shape[:-1] + (d,)
138
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
139
- ops.gelu_tanh_and_mul(out, x)
140
- return out
141
-
142
-
143
- class FatreluAndMul(nn.Module):
144
- """An activation function for FATReLU.
145
-
146
- The function computes x -> FATReLU(x[:d]) * x[d:] where
147
- d = x.shape[-1] // 2.
148
- This is used in openbmb/MiniCPM-S-1B-sft.
149
-
150
- Shapes:
151
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
152
- return: (num_tokens, d) or (batch_size, seq_len, d)
153
- """
154
-
155
- can_torch_compile: bool = True
156
-
157
- def __init__(self, threshold: float = 0.0):
158
- super().__init__()
159
- self.threshold = threshold
160
-
161
- def forward(self, x: torch.Tensor):
162
- if not x.is_contiguous():
163
- x = x.contiguous()
164
- d = x.shape[-1] // 2
165
- output_shape = x.shape[:-1] + (d,)
166
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
167
- ops.fatrelu_and_mul(out, x, self.threshold)
168
- return out
169
-
170
-
171
- class FastGELU(nn.Module):
172
- can_torch_compile: bool = True
173
-
174
- def forward(self, x: torch.Tensor) -> torch.Tensor:
175
- if not x.is_contiguous():
176
- x = x.contiguous()
177
- out = torch.empty_like(x)
178
- ops.gelu_fast(out, x)
179
- return out
180
-
181
-
182
- class NewGELU(nn.Module):
183
- can_torch_compile: bool = True
184
-
185
- def forward(self, x: torch.Tensor) -> torch.Tensor:
186
- if not x.is_contiguous():
187
- x = x.contiguous()
188
- out = torch.empty_like(x)
189
- ops.gelu_new(out, x)
190
- return out
191
-
192
-
193
- class QuickGELU(nn.Module):
194
- can_torch_compile: bool = True
195
-
196
- def forward(self, x: torch.Tensor) -> torch.Tensor:
197
- if not x.is_contiguous():
198
- x = x.contiguous()
199
- out = torch.empty_like(x)
200
- ops.gelu_quick(out, x)
201
- return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu128-x86_64-linux/metadata.json DELETED
@@ -1,21 +0,0 @@
1
- {
2
- "version": 1,
3
- "license": "Apache-2.0",
4
- "python-depends": [],
5
- "backend": {
6
- "type": "cuda",
7
- "archs": [
8
- "10.0",
9
- "10.1",
10
- "12.0+PTX",
11
- "7.0",
12
- "7.2",
13
- "7.5",
14
- "8.0",
15
- "8.6",
16
- "8.7",
17
- "8.9",
18
- "9.0"
19
- ]
20
- }
21
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu130-aarch64-linux/__init__.py DELETED
@@ -1,75 +0,0 @@
1
- import torch
2
-
3
- from ._ops import ops
4
-
5
- from . import layers
6
-
7
-
8
- def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
9
- ops.silu_and_mul(out, x)
10
- return out
11
-
12
-
13
- def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None:
14
- ops.mul_and_silu(out, x)
15
- return out
16
-
17
-
18
- def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
19
- ops.gelu_and_mul(out, x)
20
- return out
21
-
22
-
23
- def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
24
- ops.gelu_tanh_and_mul(out, x)
25
- return out
26
-
27
-
28
- def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None:
29
- ops.fatrelu_and_mul(out, x, threshold)
30
- return out
31
-
32
-
33
- def gelu(out: torch.Tensor, x: torch.Tensor) -> None:
34
- ops.gelu(out, x)
35
- return out
36
-
37
- def silu(out: torch.Tensor, x: torch.Tensor) -> None:
38
- ops.silu(out, x)
39
- return out
40
-
41
-
42
- def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None:
43
- ops.gelu_tanh(out, x)
44
- return out
45
-
46
-
47
- def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
48
- ops.gelu_fast(out, x)
49
- return out
50
-
51
-
52
- def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
53
- ops.gelu_new(out, x)
54
- return out
55
-
56
-
57
- def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
58
- ops.gelu_quick(out, x)
59
- return out
60
-
61
-
62
- __all__ = [
63
- "silu_and_mul",
64
- "mul_and_silu",
65
- "gelu_and_mul",
66
- "gelu_tanh_and_mul",
67
- "fatrelu_and_mul",
68
- "gelu_fast",
69
- "gelu_new",
70
- "gelu_quick",
71
- "gelu_tanh",
72
- "silu",
73
- "gelu",
74
- "layers",
75
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu130-aarch64-linux/_activation_cuda_ccf9ce9.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c836df90ec0e7853341065857441a9a48891ae86a15d5fd0d044fb03b6320ea
3
- size 4294136
 
 
 
 
build/torch210-cxx11-cu130-aarch64-linux/_ops.py DELETED
@@ -1,9 +0,0 @@
1
- import torch
2
- from . import _activation_cuda_ccf9ce9
3
- ops = torch.ops._activation_cuda_ccf9ce9
4
-
5
- def add_op_namespace_prefix(op_name: str):
6
- """
7
- Prefix op by namespace.
8
- """
9
- return f"_activation_cuda_ccf9ce9::{op_name}"
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu130-aarch64-linux/activation/__init__.py DELETED
@@ -1,26 +0,0 @@
1
- import ctypes
2
- import importlib.util
3
- import sys
4
- from pathlib import Path
5
- from types import ModuleType
6
-
7
-
8
- def _import_from_path(file_path: Path) -> ModuleType:
9
- # We cannot use the module name as-is, after adding it to `sys.modules`,
10
- # it would also be used for other imports. So, we make a module name that
11
- # depends on the path for it to be unique using the hex-encoded hash of
12
- # the path.
13
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
- module_name = path_hash
15
- spec = importlib.util.spec_from_file_location(module_name, file_path)
16
- if spec is None:
17
- raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
- module = importlib.util.module_from_spec(spec)
19
- if module is None:
20
- raise ImportError(f"Cannot load module {module_name} from spec")
21
- sys.modules[module_name] = module
22
- spec.loader.exec_module(module) # type: ignore
23
- return module
24
-
25
-
26
- globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu130-aarch64-linux/layers.py DELETED
@@ -1,201 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
-
4
- from ._ops import ops
5
-
6
-
7
- class SiluAndMul(nn.Module):
8
- """An activation function for SwiGLU.
9
-
10
- The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
11
-
12
- Shapes:
13
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
14
- return: (num_tokens, d) or (batch_size, seq_len, d)
15
- """
16
-
17
- can_torch_compile: bool = True
18
-
19
- def forward(self, x: torch.Tensor):
20
- if not x.is_contiguous():
21
- x = x.contiguous()
22
- d = x.shape[-1] // 2
23
- output_shape = x.shape[:-1] + (d,)
24
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
25
- ops.silu_and_mul(out, x)
26
- return out
27
-
28
- class Silu(nn.Module):
29
- """An activation function for SiLU.
30
-
31
- The function computes x -> silu(x).
32
-
33
- Shapes:
34
- x: (num_tokens, d) or (batch_size, seq_len, d)
35
- return: (num_tokens, d) or (batch_size, seq_len, d)
36
- """
37
-
38
- can_torch_compile: bool = True
39
-
40
- def forward(self, x: torch.Tensor):
41
- if not x.is_contiguous():
42
- x = x.contiguous()
43
- out = torch.empty_like(x)
44
- ops.silu(out, x)
45
- return out
46
-
47
- class Gelu(nn.Module):
48
- """An activation function for GELU.
49
-
50
- The function computes x -> gelu(x).
51
-
52
- Shapes:
53
- x: (num_tokens, d) or (batch_size, seq_len, d)
54
- return: (num_tokens, d) or (batch_size, seq_len, d)
55
- """
56
-
57
- can_torch_compile: bool = True
58
-
59
- def forward(self, x: torch.Tensor):
60
- if not x.is_contiguous():
61
- x = x.contiguous()
62
- out = torch.empty_like(x)
63
- ops.gelu(out, x)
64
- return out
65
-
66
- class GeluTanh(nn.Module):
67
- """An activation function for GELU with `tanh` approximation.
68
-
69
- The function computes x -> gelu_tanh(x).
70
-
71
- Shapes:
72
- x: (num_tokens, d) or (batch_size, seq_len, d)
73
- return: (num_tokens, d) or (batch_size, seq_len, d)
74
- """
75
-
76
- can_torch_compile: bool = True
77
-
78
- def forward(self, x: torch.Tensor):
79
- if not x.is_contiguous():
80
- x = x.contiguous()
81
- out = torch.empty_like(x)
82
- ops.gelu_tanh(out, x)
83
- return out
84
-
85
-
86
- class MulAndSilu(nn.Module):
87
- """An activation function for SwiGLU.
88
-
89
- The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.
90
-
91
- Shapes:
92
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
93
- return: (num_tokens, d) or (batch_size, seq_len, d)
94
- """
95
-
96
- can_torch_compile: bool = True
97
-
98
- def forward(self, x: torch.Tensor) -> torch.Tensor:
99
- if not x.is_contiguous():
100
- x = x.contiguous()
101
- d = x.shape[-1] // 2
102
- output_shape = x.shape[:-1] + (d,)
103
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
104
- ops.mul_and_silu(out, x)
105
- return out
106
-
107
-
108
- class GeluAndMul(nn.Module):
109
- """An activation function for GeGLU.
110
-
111
- The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
112
-
113
- Shapes:
114
- x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
115
- return: (batch_size, seq_len, d) or (num_tokens, d)
116
- """
117
-
118
- can_torch_compile: bool = True
119
-
120
- def forward(self, x: torch.Tensor):
121
- if not x.is_contiguous():
122
- x = x.contiguous()
123
- d = x.shape[-1] // 2
124
- output_shape = x.shape[:-1] + (d,)
125
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
126
- ops.gelu_and_mul(out, x)
127
- return out
128
-
129
-
130
- class GeluTanhAndMul(nn.Module):
131
- can_torch_compile: bool = True
132
-
133
- def forward(self, x: torch.Tensor):
134
- if not x.is_contiguous():
135
- x = x.contiguous()
136
- d = x.shape[-1] // 2
137
- output_shape = x.shape[:-1] + (d,)
138
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
139
- ops.gelu_tanh_and_mul(out, x)
140
- return out
141
-
142
-
143
- class FatreluAndMul(nn.Module):
144
- """An activation function for FATReLU.
145
-
146
- The function computes x -> FATReLU(x[:d]) * x[d:] where
147
- d = x.shape[-1] // 2.
148
- This is used in openbmb/MiniCPM-S-1B-sft.
149
-
150
- Shapes:
151
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
152
- return: (num_tokens, d) or (batch_size, seq_len, d)
153
- """
154
-
155
- can_torch_compile: bool = True
156
-
157
- def __init__(self, threshold: float = 0.0):
158
- super().__init__()
159
- self.threshold = threshold
160
-
161
- def forward(self, x: torch.Tensor):
162
- if not x.is_contiguous():
163
- x = x.contiguous()
164
- d = x.shape[-1] // 2
165
- output_shape = x.shape[:-1] + (d,)
166
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
167
- ops.fatrelu_and_mul(out, x, self.threshold)
168
- return out
169
-
170
-
171
- class FastGELU(nn.Module):
172
- can_torch_compile: bool = True
173
-
174
- def forward(self, x: torch.Tensor) -> torch.Tensor:
175
- if not x.is_contiguous():
176
- x = x.contiguous()
177
- out = torch.empty_like(x)
178
- ops.gelu_fast(out, x)
179
- return out
180
-
181
-
182
- class NewGELU(nn.Module):
183
- can_torch_compile: bool = True
184
-
185
- def forward(self, x: torch.Tensor) -> torch.Tensor:
186
- if not x.is_contiguous():
187
- x = x.contiguous()
188
- out = torch.empty_like(x)
189
- ops.gelu_new(out, x)
190
- return out
191
-
192
-
193
- class QuickGELU(nn.Module):
194
- can_torch_compile: bool = True
195
-
196
- def forward(self, x: torch.Tensor) -> torch.Tensor:
197
- if not x.is_contiguous():
198
- x = x.contiguous()
199
- out = torch.empty_like(x)
200
- ops.gelu_quick(out, x)
201
- return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu130-aarch64-linux/metadata.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "version": 1,
3
- "license": "Apache-2.0",
4
- "python-depends": [],
5
- "backend": {
6
- "type": "cuda",
7
- "archs": [
8
- "10.0",
9
- "11.0",
10
- "12.0+PTX",
11
- "7.5",
12
- "8.0",
13
- "8.6",
14
- "8.7",
15
- "8.9",
16
- "9.0"
17
- ]
18
- }
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu130-x86_64-linux/__init__.py DELETED
@@ -1,75 +0,0 @@
1
- import torch
2
-
3
- from ._ops import ops
4
-
5
- from . import layers
6
-
7
-
8
- def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
9
- ops.silu_and_mul(out, x)
10
- return out
11
-
12
-
13
- def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None:
14
- ops.mul_and_silu(out, x)
15
- return out
16
-
17
-
18
- def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
19
- ops.gelu_and_mul(out, x)
20
- return out
21
-
22
-
23
- def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
24
- ops.gelu_tanh_and_mul(out, x)
25
- return out
26
-
27
-
28
- def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None:
29
- ops.fatrelu_and_mul(out, x, threshold)
30
- return out
31
-
32
-
33
- def gelu(out: torch.Tensor, x: torch.Tensor) -> None:
34
- ops.gelu(out, x)
35
- return out
36
-
37
- def silu(out: torch.Tensor, x: torch.Tensor) -> None:
38
- ops.silu(out, x)
39
- return out
40
-
41
-
42
- def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None:
43
- ops.gelu_tanh(out, x)
44
- return out
45
-
46
-
47
- def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
48
- ops.gelu_fast(out, x)
49
- return out
50
-
51
-
52
- def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
53
- ops.gelu_new(out, x)
54
- return out
55
-
56
-
57
- def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
58
- ops.gelu_quick(out, x)
59
- return out
60
-
61
-
62
- __all__ = [
63
- "silu_and_mul",
64
- "mul_and_silu",
65
- "gelu_and_mul",
66
- "gelu_tanh_and_mul",
67
- "fatrelu_and_mul",
68
- "gelu_fast",
69
- "gelu_new",
70
- "gelu_quick",
71
- "gelu_tanh",
72
- "silu",
73
- "gelu",
74
- "layers",
75
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu130-x86_64-linux/_activation_cuda_ccf9ce9.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:57ee9bcb4b40ac6b5fa006806f222b3961ea49513e425b4b3caaf60c760d604d
3
- size 4186584
 
 
 
 
build/torch210-cxx11-cu130-x86_64-linux/_ops.py DELETED
@@ -1,9 +0,0 @@
1
- import torch
2
- from . import _activation_cuda_ccf9ce9
3
- ops = torch.ops._activation_cuda_ccf9ce9
4
-
5
- def add_op_namespace_prefix(op_name: str):
6
- """
7
- Prefix op by namespace.
8
- """
9
- return f"_activation_cuda_ccf9ce9::{op_name}"
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu130-x86_64-linux/activation/__init__.py DELETED
@@ -1,26 +0,0 @@
1
- import ctypes
2
- import importlib.util
3
- import sys
4
- from pathlib import Path
5
- from types import ModuleType
6
-
7
-
8
- def _import_from_path(file_path: Path) -> ModuleType:
9
- # We cannot use the module name as-is, after adding it to `sys.modules`,
10
- # it would also be used for other imports. So, we make a module name that
11
- # depends on the path for it to be unique using the hex-encoded hash of
12
- # the path.
13
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
- module_name = path_hash
15
- spec = importlib.util.spec_from_file_location(module_name, file_path)
16
- if spec is None:
17
- raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
- module = importlib.util.module_from_spec(spec)
19
- if module is None:
20
- raise ImportError(f"Cannot load module {module_name} from spec")
21
- sys.modules[module_name] = module
22
- spec.loader.exec_module(module) # type: ignore
23
- return module
24
-
25
-
26
- globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu130-x86_64-linux/layers.py DELETED
@@ -1,201 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
-
4
- from ._ops import ops
5
-
6
-
7
- class SiluAndMul(nn.Module):
8
- """An activation function for SwiGLU.
9
-
10
- The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
11
-
12
- Shapes:
13
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
14
- return: (num_tokens, d) or (batch_size, seq_len, d)
15
- """
16
-
17
- can_torch_compile: bool = True
18
-
19
- def forward(self, x: torch.Tensor):
20
- if not x.is_contiguous():
21
- x = x.contiguous()
22
- d = x.shape[-1] // 2
23
- output_shape = x.shape[:-1] + (d,)
24
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
25
- ops.silu_and_mul(out, x)
26
- return out
27
-
28
- class Silu(nn.Module):
29
- """An activation function for SiLU.
30
-
31
- The function computes x -> silu(x).
32
-
33
- Shapes:
34
- x: (num_tokens, d) or (batch_size, seq_len, d)
35
- return: (num_tokens, d) or (batch_size, seq_len, d)
36
- """
37
-
38
- can_torch_compile: bool = True
39
-
40
- def forward(self, x: torch.Tensor):
41
- if not x.is_contiguous():
42
- x = x.contiguous()
43
- out = torch.empty_like(x)
44
- ops.silu(out, x)
45
- return out
46
-
47
- class Gelu(nn.Module):
48
- """An activation function for GELU.
49
-
50
- The function computes x -> gelu(x).
51
-
52
- Shapes:
53
- x: (num_tokens, d) or (batch_size, seq_len, d)
54
- return: (num_tokens, d) or (batch_size, seq_len, d)
55
- """
56
-
57
- can_torch_compile: bool = True
58
-
59
- def forward(self, x: torch.Tensor):
60
- if not x.is_contiguous():
61
- x = x.contiguous()
62
- out = torch.empty_like(x)
63
- ops.gelu(out, x)
64
- return out
65
-
66
- class GeluTanh(nn.Module):
67
- """An activation function for GELU with `tanh` approximation.
68
-
69
- The function computes x -> gelu_tanh(x).
70
-
71
- Shapes:
72
- x: (num_tokens, d) or (batch_size, seq_len, d)
73
- return: (num_tokens, d) or (batch_size, seq_len, d)
74
- """
75
-
76
- can_torch_compile: bool = True
77
-
78
- def forward(self, x: torch.Tensor):
79
- if not x.is_contiguous():
80
- x = x.contiguous()
81
- out = torch.empty_like(x)
82
- ops.gelu_tanh(out, x)
83
- return out
84
-
85
-
86
- class MulAndSilu(nn.Module):
87
- """An activation function for SwiGLU.
88
-
89
- The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.
90
-
91
- Shapes:
92
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
93
- return: (num_tokens, d) or (batch_size, seq_len, d)
94
- """
95
-
96
- can_torch_compile: bool = True
97
-
98
- def forward(self, x: torch.Tensor) -> torch.Tensor:
99
- if not x.is_contiguous():
100
- x = x.contiguous()
101
- d = x.shape[-1] // 2
102
- output_shape = x.shape[:-1] + (d,)
103
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
104
- ops.mul_and_silu(out, x)
105
- return out
106
-
107
-
108
- class GeluAndMul(nn.Module):
109
- """An activation function for GeGLU.
110
-
111
- The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
112
-
113
- Shapes:
114
- x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
115
- return: (batch_size, seq_len, d) or (num_tokens, d)
116
- """
117
-
118
- can_torch_compile: bool = True
119
-
120
- def forward(self, x: torch.Tensor):
121
- if not x.is_contiguous():
122
- x = x.contiguous()
123
- d = x.shape[-1] // 2
124
- output_shape = x.shape[:-1] + (d,)
125
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
126
- ops.gelu_and_mul(out, x)
127
- return out
128
-
129
-
130
- class GeluTanhAndMul(nn.Module):
131
- can_torch_compile: bool = True
132
-
133
- def forward(self, x: torch.Tensor):
134
- if not x.is_contiguous():
135
- x = x.contiguous()
136
- d = x.shape[-1] // 2
137
- output_shape = x.shape[:-1] + (d,)
138
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
139
- ops.gelu_tanh_and_mul(out, x)
140
- return out
141
-
142
-
143
- class FatreluAndMul(nn.Module):
144
- """An activation function for FATReLU.
145
-
146
- The function computes x -> FATReLU(x[:d]) * x[d:] where
147
- d = x.shape[-1] // 2.
148
- This is used in openbmb/MiniCPM-S-1B-sft.
149
-
150
- Shapes:
151
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
152
- return: (num_tokens, d) or (batch_size, seq_len, d)
153
- """
154
-
155
- can_torch_compile: bool = True
156
-
157
- def __init__(self, threshold: float = 0.0):
158
- super().__init__()
159
- self.threshold = threshold
160
-
161
- def forward(self, x: torch.Tensor):
162
- if not x.is_contiguous():
163
- x = x.contiguous()
164
- d = x.shape[-1] // 2
165
- output_shape = x.shape[:-1] + (d,)
166
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
167
- ops.fatrelu_and_mul(out, x, self.threshold)
168
- return out
169
-
170
-
171
- class FastGELU(nn.Module):
172
- can_torch_compile: bool = True
173
-
174
- def forward(self, x: torch.Tensor) -> torch.Tensor:
175
- if not x.is_contiguous():
176
- x = x.contiguous()
177
- out = torch.empty_like(x)
178
- ops.gelu_fast(out, x)
179
- return out
180
-
181
-
182
- class NewGELU(nn.Module):
183
- can_torch_compile: bool = True
184
-
185
- def forward(self, x: torch.Tensor) -> torch.Tensor:
186
- if not x.is_contiguous():
187
- x = x.contiguous()
188
- out = torch.empty_like(x)
189
- ops.gelu_new(out, x)
190
- return out
191
-
192
-
193
- class QuickGELU(nn.Module):
194
- can_torch_compile: bool = True
195
-
196
- def forward(self, x: torch.Tensor) -> torch.Tensor:
197
- if not x.is_contiguous():
198
- x = x.contiguous()
199
- out = torch.empty_like(x)
200
- ops.gelu_quick(out, x)
201
- return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-cxx11-cu130-x86_64-linux/metadata.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "version": 1,
3
- "license": "Apache-2.0",
4
- "python-depends": [],
5
- "backend": {
6
- "type": "cuda",
7
- "archs": [
8
- "10.0",
9
- "11.0",
10
- "12.0+PTX",
11
- "7.5",
12
- "8.0",
13
- "8.6",
14
- "8.7",
15
- "8.9",
16
- "9.0"
17
- ]
18
- }
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-metal-aarch64-darwin/__init__.py DELETED
@@ -1,75 +0,0 @@
1
- import torch
2
-
3
- from ._ops import ops
4
-
5
- from . import layers
6
-
7
-
8
- def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
9
- ops.silu_and_mul(out, x)
10
- return out
11
-
12
-
13
- def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None:
14
- ops.mul_and_silu(out, x)
15
- return out
16
-
17
-
18
- def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
19
- ops.gelu_and_mul(out, x)
20
- return out
21
-
22
-
23
- def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
24
- ops.gelu_tanh_and_mul(out, x)
25
- return out
26
-
27
-
28
- def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None:
29
- ops.fatrelu_and_mul(out, x, threshold)
30
- return out
31
-
32
-
33
- def gelu(out: torch.Tensor, x: torch.Tensor) -> None:
34
- ops.gelu(out, x)
35
- return out
36
-
37
- def silu(out: torch.Tensor, x: torch.Tensor) -> None:
38
- ops.silu(out, x)
39
- return out
40
-
41
-
42
- def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None:
43
- ops.gelu_tanh(out, x)
44
- return out
45
-
46
-
47
- def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
48
- ops.gelu_fast(out, x)
49
- return out
50
-
51
-
52
- def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
53
- ops.gelu_new(out, x)
54
- return out
55
-
56
-
57
- def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
58
- ops.gelu_quick(out, x)
59
- return out
60
-
61
-
62
- __all__ = [
63
- "silu_and_mul",
64
- "mul_and_silu",
65
- "gelu_and_mul",
66
- "gelu_tanh_and_mul",
67
- "fatrelu_and_mul",
68
- "gelu_fast",
69
- "gelu_new",
70
- "gelu_quick",
71
- "gelu_tanh",
72
- "silu",
73
- "gelu",
74
- "layers",
75
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-metal-aarch64-darwin/_activation_metal_ccf9ce9.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:83cc62ee4c6546a6b77f4ad26285861bd36894ddacb4836e38db664e75c317f3
3
- size 205056
 
 
 
 
build/torch210-metal-aarch64-darwin/_ops.py DELETED
@@ -1,9 +0,0 @@
1
- import torch
2
- from . import _activation_metal_ccf9ce9
3
- ops = torch.ops._activation_metal_ccf9ce9
4
-
5
- def add_op_namespace_prefix(op_name: str):
6
- """
7
- Prefix op by namespace.
8
- """
9
- return f"_activation_metal_ccf9ce9::{op_name}"
 
 
 
 
 
 
 
 
 
 
build/torch210-metal-aarch64-darwin/activation/__init__.py DELETED
@@ -1,26 +0,0 @@
1
- import ctypes
2
- import importlib.util
3
- import sys
4
- from pathlib import Path
5
- from types import ModuleType
6
-
7
-
8
- def _import_from_path(file_path: Path) -> ModuleType:
9
- # We cannot use the module name as-is, after adding it to `sys.modules`,
10
- # it would also be used for other imports. So, we make a module name that
11
- # depends on the path for it to be unique using the hex-encoded hash of
12
- # the path.
13
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
- module_name = path_hash
15
- spec = importlib.util.spec_from_file_location(module_name, file_path)
16
- if spec is None:
17
- raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
- module = importlib.util.module_from_spec(spec)
19
- if module is None:
20
- raise ImportError(f"Cannot load module {module_name} from spec")
21
- sys.modules[module_name] = module
22
- spec.loader.exec_module(module) # type: ignore
23
- return module
24
-
25
-
26
- globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-metal-aarch64-darwin/layers.py DELETED
@@ -1,201 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
-
4
- from ._ops import ops
5
-
6
-
7
- class SiluAndMul(nn.Module):
8
- """An activation function for SwiGLU.
9
-
10
- The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
11
-
12
- Shapes:
13
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
14
- return: (num_tokens, d) or (batch_size, seq_len, d)
15
- """
16
-
17
- can_torch_compile: bool = True
18
-
19
- def forward(self, x: torch.Tensor):
20
- if not x.is_contiguous():
21
- x = x.contiguous()
22
- d = x.shape[-1] // 2
23
- output_shape = x.shape[:-1] + (d,)
24
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
25
- ops.silu_and_mul(out, x)
26
- return out
27
-
28
- class Silu(nn.Module):
29
- """An activation function for SiLU.
30
-
31
- The function computes x -> silu(x).
32
-
33
- Shapes:
34
- x: (num_tokens, d) or (batch_size, seq_len, d)
35
- return: (num_tokens, d) or (batch_size, seq_len, d)
36
- """
37
-
38
- can_torch_compile: bool = True
39
-
40
- def forward(self, x: torch.Tensor):
41
- if not x.is_contiguous():
42
- x = x.contiguous()
43
- out = torch.empty_like(x)
44
- ops.silu(out, x)
45
- return out
46
-
47
- class Gelu(nn.Module):
48
- """An activation function for GELU.
49
-
50
- The function computes x -> gelu(x).
51
-
52
- Shapes:
53
- x: (num_tokens, d) or (batch_size, seq_len, d)
54
- return: (num_tokens, d) or (batch_size, seq_len, d)
55
- """
56
-
57
- can_torch_compile: bool = True
58
-
59
- def forward(self, x: torch.Tensor):
60
- if not x.is_contiguous():
61
- x = x.contiguous()
62
- out = torch.empty_like(x)
63
- ops.gelu(out, x)
64
- return out
65
-
66
- class GeluTanh(nn.Module):
67
- """An activation function for GELU with `tanh` approximation.
68
-
69
- The function computes x -> gelu_tanh(x).
70
-
71
- Shapes:
72
- x: (num_tokens, d) or (batch_size, seq_len, d)
73
- return: (num_tokens, d) or (batch_size, seq_len, d)
74
- """
75
-
76
- can_torch_compile: bool = True
77
-
78
- def forward(self, x: torch.Tensor):
79
- if not x.is_contiguous():
80
- x = x.contiguous()
81
- out = torch.empty_like(x)
82
- ops.gelu_tanh(out, x)
83
- return out
84
-
85
-
86
- class MulAndSilu(nn.Module):
87
- """An activation function for SwiGLU.
88
-
89
- The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.
90
-
91
- Shapes:
92
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
93
- return: (num_tokens, d) or (batch_size, seq_len, d)
94
- """
95
-
96
- can_torch_compile: bool = True
97
-
98
- def forward(self, x: torch.Tensor) -> torch.Tensor:
99
- if not x.is_contiguous():
100
- x = x.contiguous()
101
- d = x.shape[-1] // 2
102
- output_shape = x.shape[:-1] + (d,)
103
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
104
- ops.mul_and_silu(out, x)
105
- return out
106
-
107
-
108
- class GeluAndMul(nn.Module):
109
- """An activation function for GeGLU.
110
-
111
- The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
112
-
113
- Shapes:
114
- x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
115
- return: (batch_size, seq_len, d) or (num_tokens, d)
116
- """
117
-
118
- can_torch_compile: bool = True
119
-
120
- def forward(self, x: torch.Tensor):
121
- if not x.is_contiguous():
122
- x = x.contiguous()
123
- d = x.shape[-1] // 2
124
- output_shape = x.shape[:-1] + (d,)
125
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
126
- ops.gelu_and_mul(out, x)
127
- return out
128
-
129
-
130
- class GeluTanhAndMul(nn.Module):
131
- can_torch_compile: bool = True
132
-
133
- def forward(self, x: torch.Tensor):
134
- if not x.is_contiguous():
135
- x = x.contiguous()
136
- d = x.shape[-1] // 2
137
- output_shape = x.shape[:-1] + (d,)
138
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
139
- ops.gelu_tanh_and_mul(out, x)
140
- return out
141
-
142
-
143
- class FatreluAndMul(nn.Module):
144
- """An activation function for FATReLU.
145
-
146
- The function computes x -> FATReLU(x[:d]) * x[d:] where
147
- d = x.shape[-1] // 2.
148
- This is used in openbmb/MiniCPM-S-1B-sft.
149
-
150
- Shapes:
151
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
152
- return: (num_tokens, d) or (batch_size, seq_len, d)
153
- """
154
-
155
- can_torch_compile: bool = True
156
-
157
- def __init__(self, threshold: float = 0.0):
158
- super().__init__()
159
- self.threshold = threshold
160
-
161
- def forward(self, x: torch.Tensor):
162
- if not x.is_contiguous():
163
- x = x.contiguous()
164
- d = x.shape[-1] // 2
165
- output_shape = x.shape[:-1] + (d,)
166
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
167
- ops.fatrelu_and_mul(out, x, self.threshold)
168
- return out
169
-
170
-
171
- class FastGELU(nn.Module):
172
- can_torch_compile: bool = True
173
-
174
- def forward(self, x: torch.Tensor) -> torch.Tensor:
175
- if not x.is_contiguous():
176
- x = x.contiguous()
177
- out = torch.empty_like(x)
178
- ops.gelu_fast(out, x)
179
- return out
180
-
181
-
182
- class NewGELU(nn.Module):
183
- can_torch_compile: bool = True
184
-
185
- def forward(self, x: torch.Tensor) -> torch.Tensor:
186
- if not x.is_contiguous():
187
- x = x.contiguous()
188
- out = torch.empty_like(x)
189
- ops.gelu_new(out, x)
190
- return out
191
-
192
-
193
- class QuickGELU(nn.Module):
194
- can_torch_compile: bool = True
195
-
196
- def forward(self, x: torch.Tensor) -> torch.Tensor:
197
- if not x.is_contiguous():
198
- x = x.contiguous()
199
- out = torch.empty_like(x)
200
- ops.gelu_quick(out, x)
201
- return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch210-metal-aarch64-darwin/metadata.json DELETED
@@ -1,8 +0,0 @@
1
- {
2
- "version": 1,
3
- "license": "Apache-2.0",
4
- "python-depends": [],
5
- "backend": {
6
- "type": "metal"
7
- }
8
- }
 
 
 
 
 
 
 
 
 
build/torch211-cu128-x86_64-windows/__init__.py DELETED
@@ -1,75 +0,0 @@
1
- import torch
2
-
3
- from ._ops import ops
4
-
5
- from . import layers
6
-
7
-
8
- def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
9
- ops.silu_and_mul(out, x)
10
- return out
11
-
12
-
13
- def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None:
14
- ops.mul_and_silu(out, x)
15
- return out
16
-
17
-
18
- def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
19
- ops.gelu_and_mul(out, x)
20
- return out
21
-
22
-
23
- def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
24
- ops.gelu_tanh_and_mul(out, x)
25
- return out
26
-
27
-
28
- def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None:
29
- ops.fatrelu_and_mul(out, x, threshold)
30
- return out
31
-
32
-
33
- def gelu(out: torch.Tensor, x: torch.Tensor) -> None:
34
- ops.gelu(out, x)
35
- return out
36
-
37
- def silu(out: torch.Tensor, x: torch.Tensor) -> None:
38
- ops.silu(out, x)
39
- return out
40
-
41
-
42
- def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None:
43
- ops.gelu_tanh(out, x)
44
- return out
45
-
46
-
47
- def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
48
- ops.gelu_fast(out, x)
49
- return out
50
-
51
-
52
- def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
53
- ops.gelu_new(out, x)
54
- return out
55
-
56
-
57
- def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
58
- ops.gelu_quick(out, x)
59
- return out
60
-
61
-
62
- __all__ = [
63
- "silu_and_mul",
64
- "mul_and_silu",
65
- "gelu_and_mul",
66
- "gelu_tanh_and_mul",
67
- "fatrelu_and_mul",
68
- "gelu_fast",
69
- "gelu_new",
70
- "gelu_quick",
71
- "gelu_tanh",
72
- "silu",
73
- "gelu",
74
- "layers",
75
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch211-cu128-x86_64-windows/_activation_cuda_ce29f4e.pyd DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3b882f56a5eaffdb32e94f924b611a4df6ee97400021b2dc34be9a062b941a7
3
- size 2464256
 
 
 
 
build/torch211-cu128-x86_64-windows/_ops.py DELETED
@@ -1,9 +0,0 @@
1
- import torch
2
- from . import _activation_cuda_ce29f4e
3
- ops = torch.ops._activation_cuda_ce29f4e
4
-
5
- def add_op_namespace_prefix(op_name: str):
6
- """
7
- Prefix op by namespace.
8
- """
9
- return f"_activation_cuda_ce29f4e::{op_name}"
 
 
 
 
 
 
 
 
 
 
build/torch211-cu128-x86_64-windows/activation/__init__.py DELETED
@@ -1,26 +0,0 @@
1
- import ctypes
2
- import importlib.util
3
- import sys
4
- from pathlib import Path
5
- from types import ModuleType
6
-
7
-
8
- def _import_from_path(file_path: Path) -> ModuleType:
9
- # We cannot use the module name as-is, after adding it to `sys.modules`,
10
- # it would also be used for other imports. So, we make a module name that
11
- # depends on the path for it to be unique using the hex-encoded hash of
12
- # the path.
13
- path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
- module_name = path_hash
15
- spec = importlib.util.spec_from_file_location(module_name, file_path)
16
- if spec is None:
17
- raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
- module = importlib.util.module_from_spec(spec)
19
- if module is None:
20
- raise ImportError(f"Cannot load module {module_name} from spec")
21
- sys.modules[module_name] = module
22
- spec.loader.exec_module(module) # type: ignore
23
- return module
24
-
25
-
26
- globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch211-cu128-x86_64-windows/layers.py DELETED
@@ -1,201 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
-
4
- from ._ops import ops
5
-
6
-
7
- class SiluAndMul(nn.Module):
8
- """An activation function for SwiGLU.
9
-
10
- The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
11
-
12
- Shapes:
13
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
14
- return: (num_tokens, d) or (batch_size, seq_len, d)
15
- """
16
-
17
- can_torch_compile: bool = True
18
-
19
- def forward(self, x: torch.Tensor):
20
- if not x.is_contiguous():
21
- x = x.contiguous()
22
- d = x.shape[-1] // 2
23
- output_shape = x.shape[:-1] + (d,)
24
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
25
- ops.silu_and_mul(out, x)
26
- return out
27
-
28
- class Silu(nn.Module):
29
- """An activation function for SiLU.
30
-
31
- The function computes x -> silu(x).
32
-
33
- Shapes:
34
- x: (num_tokens, d) or (batch_size, seq_len, d)
35
- return: (num_tokens, d) or (batch_size, seq_len, d)
36
- """
37
-
38
- can_torch_compile: bool = True
39
-
40
- def forward(self, x: torch.Tensor):
41
- if not x.is_contiguous():
42
- x = x.contiguous()
43
- out = torch.empty_like(x)
44
- ops.silu(out, x)
45
- return out
46
-
47
- class Gelu(nn.Module):
48
- """An activation function for GELU.
49
-
50
- The function computes x -> gelu(x).
51
-
52
- Shapes:
53
- x: (num_tokens, d) or (batch_size, seq_len, d)
54
- return: (num_tokens, d) or (batch_size, seq_len, d)
55
- """
56
-
57
- can_torch_compile: bool = True
58
-
59
- def forward(self, x: torch.Tensor):
60
- if not x.is_contiguous():
61
- x = x.contiguous()
62
- out = torch.empty_like(x)
63
- ops.gelu(out, x)
64
- return out
65
-
66
- class GeluTanh(nn.Module):
67
- """An activation function for GELU with `tanh` approximation.
68
-
69
- The function computes x -> gelu_tanh(x).
70
-
71
- Shapes:
72
- x: (num_tokens, d) or (batch_size, seq_len, d)
73
- return: (num_tokens, d) or (batch_size, seq_len, d)
74
- """
75
-
76
- can_torch_compile: bool = True
77
-
78
- def forward(self, x: torch.Tensor):
79
- if not x.is_contiguous():
80
- x = x.contiguous()
81
- out = torch.empty_like(x)
82
- ops.gelu_tanh(out, x)
83
- return out
84
-
85
-
86
- class MulAndSilu(nn.Module):
87
- """An activation function for SwiGLU.
88
-
89
- The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.
90
-
91
- Shapes:
92
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
93
- return: (num_tokens, d) or (batch_size, seq_len, d)
94
- """
95
-
96
- can_torch_compile: bool = True
97
-
98
- def forward(self, x: torch.Tensor) -> torch.Tensor:
99
- if not x.is_contiguous():
100
- x = x.contiguous()
101
- d = x.shape[-1] // 2
102
- output_shape = x.shape[:-1] + (d,)
103
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
104
- ops.mul_and_silu(out, x)
105
- return out
106
-
107
-
108
- class GeluAndMul(nn.Module):
109
- """An activation function for GeGLU.
110
-
111
- The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
112
-
113
- Shapes:
114
- x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
115
- return: (batch_size, seq_len, d) or (num_tokens, d)
116
- """
117
-
118
- can_torch_compile: bool = True
119
-
120
- def forward(self, x: torch.Tensor):
121
- if not x.is_contiguous():
122
- x = x.contiguous()
123
- d = x.shape[-1] // 2
124
- output_shape = x.shape[:-1] + (d,)
125
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
126
- ops.gelu_and_mul(out, x)
127
- return out
128
-
129
-
130
- class GeluTanhAndMul(nn.Module):
131
- can_torch_compile: bool = True
132
-
133
- def forward(self, x: torch.Tensor):
134
- if not x.is_contiguous():
135
- x = x.contiguous()
136
- d = x.shape[-1] // 2
137
- output_shape = x.shape[:-1] + (d,)
138
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
139
- ops.gelu_tanh_and_mul(out, x)
140
- return out
141
-
142
-
143
- class FatreluAndMul(nn.Module):
144
- """An activation function for FATReLU.
145
-
146
- The function computes x -> FATReLU(x[:d]) * x[d:] where
147
- d = x.shape[-1] // 2.
148
- This is used in openbmb/MiniCPM-S-1B-sft.
149
-
150
- Shapes:
151
- x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
152
- return: (num_tokens, d) or (batch_size, seq_len, d)
153
- """
154
-
155
- can_torch_compile: bool = True
156
-
157
- def __init__(self, threshold: float = 0.0):
158
- super().__init__()
159
- self.threshold = threshold
160
-
161
- def forward(self, x: torch.Tensor):
162
- if not x.is_contiguous():
163
- x = x.contiguous()
164
- d = x.shape[-1] // 2
165
- output_shape = x.shape[:-1] + (d,)
166
- out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
167
- ops.fatrelu_and_mul(out, x, self.threshold)
168
- return out
169
-
170
-
171
- class FastGELU(nn.Module):
172
- can_torch_compile: bool = True
173
-
174
- def forward(self, x: torch.Tensor) -> torch.Tensor:
175
- if not x.is_contiguous():
176
- x = x.contiguous()
177
- out = torch.empty_like(x)
178
- ops.gelu_fast(out, x)
179
- return out
180
-
181
-
182
- class NewGELU(nn.Module):
183
- can_torch_compile: bool = True
184
-
185
- def forward(self, x: torch.Tensor) -> torch.Tensor:
186
- if not x.is_contiguous():
187
- x = x.contiguous()
188
- out = torch.empty_like(x)
189
- ops.gelu_new(out, x)
190
- return out
191
-
192
-
193
- class QuickGELU(nn.Module):
194
- can_torch_compile: bool = True
195
-
196
- def forward(self, x: torch.Tensor) -> torch.Tensor:
197
- if not x.is_contiguous():
198
- x = x.contiguous()
199
- out = torch.empty_like(x)
200
- ops.gelu_quick(out, x)
201
- return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch211-cu128-x86_64-windows/metadata.json DELETED
@@ -1,21 +0,0 @@
1
- {
2
- "version": 1,
3
- "license": "Apache-2.0",
4
- "python-depends": [],
5
- "backend": {
6
- "type": "cuda",
7
- "archs": [
8
- "10.0",
9
- "10.1",
10
- "12.0+PTX",
11
- "7.0",
12
- "7.2",
13
- "7.5",
14
- "8.0",
15
- "8.6",
16
- "8.7",
17
- "8.9",
18
- "9.0"
19
- ]
20
- }
21
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build/torch211-cxx11-cu126-aarch64-linux/__init__.py DELETED
@@ -1,75 +0,0 @@
1
- import torch
2
-
3
- from ._ops import ops
4
-
5
- from . import layers
6
-
7
-
8
- def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
9
- ops.silu_and_mul(out, x)
10
- return out
11
-
12
-
13
- def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None:
14
- ops.mul_and_silu(out, x)
15
- return out
16
-
17
-
18
- def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
19
- ops.gelu_and_mul(out, x)
20
- return out
21
-
22
-
23
- def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
24
- ops.gelu_tanh_and_mul(out, x)
25
- return out
26
-
27
-
28
- def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None:
29
- ops.fatrelu_and_mul(out, x, threshold)
30
- return out
31
-
32
-
33
- def gelu(out: torch.Tensor, x: torch.Tensor) -> None:
34
- ops.gelu(out, x)
35
- return out
36
-
37
- def silu(out: torch.Tensor, x: torch.Tensor) -> None:
38
- ops.silu(out, x)
39
- return out
40
-
41
-
42
- def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None:
43
- ops.gelu_tanh(out, x)
44
- return out
45
-
46
-
47
- def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
48
- ops.gelu_fast(out, x)
49
- return out
50
-
51
-
52
- def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
53
- ops.gelu_new(out, x)
54
- return out
55
-
56
-
57
- def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
58
- ops.gelu_quick(out, x)
59
- return out
60
-
61
-
62
- __all__ = [
63
- "silu_and_mul",
64
- "mul_and_silu",
65
- "gelu_and_mul",
66
- "gelu_tanh_and_mul",
67
- "fatrelu_and_mul",
68
- "gelu_fast",
69
- "gelu_new",
70
- "gelu_quick",
71
- "gelu_tanh",
72
- "silu",
73
- "gelu",
74
- "layers",
75
- ]