diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..d53cd9b27bef454d0290ae7bde6d6a1470b246b6 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,37 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +*.so filter=lfs diff=lfs merge=lfs -text +build/torch210-cu128-x86_64-windows/activation/_activation_e1b4b08.pyd filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..06cb917ab1eb4e645c04cdb314aa41bce05391d6 --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +--- +tags: + - kernels +--- + +![Status](https://hubwebhook.dholtz.com/shield?repo=kernels-community/activation) + +## Activation + +Activation kernels from [vLLM](https://github.com/vllm-project/vllm/blob/main/csrc/activation_kernels.cu). + +Kernel source: https://github.com/huggingface/kernels-community/tree/main/activation + diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..aff9934dce70f0ba644482bc1408fcf27dbe5f01 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,66 @@ +import torch +from kernels.benchmark import Benchmark + + +def setup_silu_tensors(self, num_tokens: int, hidden_dim: int, dtype=torch.float16): + self.x = torch.randn(num_tokens, 2 * hidden_dim, device="cuda", dtype=dtype) + self.out = torch.empty(num_tokens, hidden_dim, device="cuda", dtype=dtype) + + +def verify_silu(self): + d = self.x.shape[-1] // 2 + ref = torch.nn.functional.silu(self.x[..., :d]) * self.x[..., d:] + return torch.allclose(self.out, ref, atol=1e-3, rtol=1e-3) + + +class SiluWorkloads(Benchmark): + kernel_id = "kernels-community/activation" + seed = 42 + x: torch.Tensor # kernel specific input var + out: torch.Tensor # kernel specific output var + + # Workload 1 + def setup_small(self): + setup_silu_tensors(self, num_tokens=32, hidden_dim=256) + + def benchmark_small(self): + self.kernel.silu_and_mul(self.out, self.x) # type: ignore + + def verify_small(self): + return verify_silu(self) + + # Workload 2 + def setup_medium(self): + setup_silu_tensors(self, num_tokens=1024, hidden_dim=2048) + + def benchmark_medium(self): + self.kernel.silu_and_mul(self.out, self.x) # type: ignore + + def verify_medium(self): + return verify_silu(self) + + +class SiluWorkloads2(Benchmark): + kernel_id = "kernels-community/activation" + seed = 42 + x: torch.Tensor # kernel specific input var + out: torch.Tensor # 
kernel specific output var + + # Workload 1 + def setup_small(self): + setup_silu_tensors(self, num_tokens=32, hidden_dim=256) + + def benchmark_small(self): + self.kernel.silu_and_mul(self.out, self.x) # type: ignore + + def verify_small(self): + return verify_silu(self) + + # Workload 2 + def setup_medium(self): + setup_silu_tensors(self, num_tokens=1024, hidden_dim=2048) + + def benchmark_medium(self): + self.kernel.silu_and_mul(self.out, self.x) # type: ignore + + # Note: show case without a verify diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..0522bbfdbf8749d671faa8fe91b169dbd2cafa8d --- /dev/null +++ b/benchmarks/benchmark.py @@ -0,0 +1,5 @@ +from kernels.benchmarks import SiluAndMulBenchmark + + +class SiluWorkloads(SiluAndMulBenchmark): + pass diff --git a/build/torch210-cu128-x86_64-windows/activation/__init__.py b/build/torch210-cu128-x86_64-windows/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..379e245ef7515d04bfe4e680e2549fcf8790cc15 --- /dev/null +++ b/build/torch210-cu128-x86_64-windows/activation/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch210-cu128-x86_64-windows/activation/_activation_e1b4b08.pyd b/build/torch210-cu128-x86_64-windows/activation/_activation_e1b4b08.pyd new file mode 100644 index 0000000000000000000000000000000000000000..ca0f54ee392befa7fa8a084bcc730e416a912f23 --- /dev/null +++ b/build/torch210-cu128-x86_64-windows/activation/_activation_e1b4b08.pyd @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d741006dd4fe8a85ed461fa3727d4d9f1b438083d2f1075ae54650bbdd2dc179 +size 2463744 diff --git a/build/torch210-cu128-x86_64-windows/activation/_ops.py 
b/build/torch210-cu128-x86_64-windows/activation/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..110a36d47839efd80d8d58e5cce311e50d684990 --- /dev/null +++ b/build/torch210-cu128-x86_64-windows/activation/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_e1b4b08 +ops = torch.ops._activation_e1b4b08 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_e1b4b08::{op_name}" \ No newline at end of file diff --git a/build/torch210-cu128-x86_64-windows/activation/layers.py b/build/torch210-cu128-x86_64-windows/activation/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..3dbfa19f89f2514b94e7b35d528a1e76ec4da7a3 --- /dev/null +++ b/build/torch210-cu128-x86_64-windows/activation/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch210-cu128-x86_64-windows/metadata.json b/build/torch210-cu128-x86_64-windows/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8 --- /dev/null +++ b/build/torch210-cu128-x86_64-windows/metadata.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/build/torch210-cxx11-cu126-aarch64-linux/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch210-cxx11-cu126-aarch64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> 
None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_activation_cuda_5e1630d.abi3.so b/build/torch210-cxx11-cu126-aarch64-linux/_activation_cuda_5e1630d.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..5eb55ae25541a66de1833033597fe66562223b3b --- /dev/null +++ b/build/torch210-cxx11-cu126-aarch64-linux/_activation_cuda_5e1630d.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53cb1adecf7bb00650edb28c861b149c48729739ca1c2a6bae39fe52e22657bb +size 3228128 diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_ops.py b/build/torch210-cxx11-cu126-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c8caf619763d118e067bb91d329c09e99f4a54a4 --- /dev/null +++ b/build/torch210-cxx11-cu126-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_cuda_5e1630d +ops = torch.ops._activation_cuda_5e1630d + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. 
+ """ + return f"_activation_cuda_5e1630d::{op_name}" diff --git a/build/torch210-cxx11-cu126-aarch64-linux/activation/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch210-cxx11-cu126-aarch64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. + path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-cxx11-cu126-aarch64-linux/layers.py b/build/torch210-cxx11-cu126-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch210-cxx11-cu126-aarch64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch210-cxx11-cu126-aarch64-linux/metadata.json b/build/torch210-cxx11-cu126-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..f5902b55ab0b2b561c0cf97567c9806c60839c7f --- /dev/null +++ b/build/torch210-cxx11-cu126-aarch64-linux/metadata.json @@ -0,0 +1,18 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "7.0", + "7.2", + "7.5", + "8.0", + "8.6", + "8.7", + "8.9", + "9.0+PTX" + ] + } +} diff --git a/build/torch210-cxx11-cu126-x86_64-linux/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 
--- /dev/null +++ b/build/torch210-cxx11-cu126-x86_64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_activation_cuda_5e1630d.abi3.so b/build/torch210-cxx11-cu126-x86_64-linux/_activation_cuda_5e1630d.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..cb33ce25de90d3d8af2e226331edccc94e8090c4 --- /dev/null +++ b/build/torch210-cxx11-cu126-x86_64-linux/_activation_cuda_5e1630d.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f8a0be3112850b924da942b3913629a7ab0681277b29b23e34bfd79e24d16b2f +size 3126848 diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_ops.py b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c8caf619763d118e067bb91d329c09e99f4a54a4 --- /dev/null +++ b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_cuda_5e1630d +ops = torch.ops._activation_cuda_5e1630d + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_cuda_5e1630d::{op_name}" diff --git a/build/torch210-cxx11-cu126-x86_64-linux/activation/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch210-cxx11-cu126-x86_64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-cxx11-cu126-x86_64-linux/layers.py b/build/torch210-cxx11-cu126-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch210-cxx11-cu126-x86_64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch210-cxx11-cu126-x86_64-linux/metadata.json b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..f5902b55ab0b2b561c0cf97567c9806c60839c7f --- /dev/null +++ b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json @@ -0,0 +1,18 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "7.0", + "7.2", + "7.5", + "8.0", + "8.6", + "8.7", + "8.9", + "9.0+PTX" + ] + } +} diff --git a/build/torch210-cxx11-cu128-aarch64-linux/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch210-cxx11-cu128-aarch64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_activation_cuda_5e1630d.abi3.so b/build/torch210-cxx11-cu128-aarch64-linux/_activation_cuda_5e1630d.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..447c7cb31abe735ab16a86c75f3ac6f10115e4e0 --- /dev/null +++ b/build/torch210-cxx11-cu128-aarch64-linux/_activation_cuda_5e1630d.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eac4ae53a85546234a483c48c9bfcb717d16bb434b1f9723909fb838d366cb3 +size 4538960 diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_ops.py 
b/build/torch210-cxx11-cu128-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c8caf619763d118e067bb91d329c09e99f4a54a4 --- /dev/null +++ b/build/torch210-cxx11-cu128-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_cuda_5e1630d +ops = torch.ops._activation_cuda_5e1630d + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_cuda_5e1630d::{op_name}" diff --git a/build/torch210-cxx11-cu128-aarch64-linux/activation/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch210-cxx11-cu128-aarch64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-cxx11-cu128-aarch64-linux/layers.py b/build/torch210-cxx11-cu128-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch210-cxx11-cu128-aarch64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch210-cxx11-cu128-aarch64-linux/metadata.json b/build/torch210-cxx11-cu128-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8b796af185fbbd8594fcd846949aa5fadc0ccdda --- /dev/null +++ b/build/torch210-cxx11-cu128-aarch64-linux/metadata.json @@ -0,0 +1,21 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "10.1", + "12.0+PTX", + "7.0", + "7.2", + "7.5", + "8.0", + "8.6", + "8.7", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch210-cxx11-cu128-x86_64-linux/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_activation_cuda_5e1630d.abi3.so b/build/torch210-cxx11-cu128-x86_64-linux/_activation_cuda_5e1630d.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..b2727c565b0e820e7e6151ee7e74fe9a3e84f6d8 --- /dev/null +++ b/build/torch210-cxx11-cu128-x86_64-linux/_activation_cuda_5e1630d.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5d89134ddac2eee668ec060a853c99a6a3099a05b01e6a372cfa89b25c9a4d5 +size 4406632 diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_ops.py 
b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c8caf619763d118e067bb91d329c09e99f4a54a4 --- /dev/null +++ b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_cuda_5e1630d +ops = torch.ops._activation_cuda_5e1630d + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_cuda_5e1630d::{op_name}" diff --git a/build/torch210-cxx11-cu128-x86_64-linux/activation/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch210-cxx11-cu128-x86_64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-cxx11-cu128-x86_64-linux/layers.py b/build/torch210-cxx11-cu128-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch210-cxx11-cu128-x86_64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch210-cxx11-cu128-x86_64-linux/metadata.json b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8b796af185fbbd8594fcd846949aa5fadc0ccdda --- /dev/null +++ b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json @@ -0,0 +1,21 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "10.1", + "12.0+PTX", + "7.0", + "7.2", + "7.5", + "8.0", + "8.6", + "8.7", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch210-cxx11-cu130-aarch64-linux/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch210-cxx11-cu130-aarch64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_activation_cuda_5e1630d.abi3.so b/build/torch210-cxx11-cu130-aarch64-linux/_activation_cuda_5e1630d.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..a7eb49e08987ab8f58ec64430a07df7b50784c73 --- /dev/null +++ b/build/torch210-cxx11-cu130-aarch64-linux/_activation_cuda_5e1630d.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1be21db6774214a4e1b58290f71e2e5cd0146af7d0646220ed4c0873d959b7e2 +size 4293520 diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_ops.py 
b/build/torch210-cxx11-cu130-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c8caf619763d118e067bb91d329c09e99f4a54a4 --- /dev/null +++ b/build/torch210-cxx11-cu130-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_cuda_5e1630d +ops = torch.ops._activation_cuda_5e1630d + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_cuda_5e1630d::{op_name}" diff --git a/build/torch210-cxx11-cu130-aarch64-linux/activation/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch210-cxx11-cu130-aarch64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-cxx11-cu130-aarch64-linux/layers.py b/build/torch210-cxx11-cu130-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch210-cxx11-cu130-aarch64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch210-cxx11-cu130-aarch64-linux/metadata.json b/build/torch210-cxx11-cu130-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..66651b7d3f95ac9e5ce5fc2a641b6f0f50788f87 --- /dev/null +++ b/build/torch210-cxx11-cu130-aarch64-linux/metadata.json @@ -0,0 +1,19 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "11.0", + "12.0+PTX", + "7.5", + "8.0", + "8.6", + "8.7", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch210-cxx11-cu130-x86_64-linux/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_activation_cuda_5e1630d.abi3.so b/build/torch210-cxx11-cu130-x86_64-linux/_activation_cuda_5e1630d.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..ac170933452130bbc40b403e8cc476811ccdc62d --- /dev/null +++ b/build/torch210-cxx11-cu130-x86_64-linux/_activation_cuda_5e1630d.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b7f7097fa67bb40a27a26e2a6bdeec262eb878307336e9fb350388899e09a89 +size 4190176 diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_ops.py 
b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c8caf619763d118e067bb91d329c09e99f4a54a4 --- /dev/null +++ b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_cuda_5e1630d +ops = torch.ops._activation_cuda_5e1630d + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_cuda_5e1630d::{op_name}" diff --git a/build/torch210-cxx11-cu130-x86_64-linux/activation/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch210-cxx11-cu130-x86_64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-cxx11-cu130-x86_64-linux/layers.py b/build/torch210-cxx11-cu130-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch210-cxx11-cu130-x86_64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch210-cxx11-cu130-x86_64-linux/metadata.json b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..66651b7d3f95ac9e5ce5fc2a641b6f0f50788f87 --- /dev/null +++ b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json @@ -0,0 +1,19 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "11.0", + "12.0+PTX", + "7.5", + "8.0", + "8.6", + "8.7", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch210-metal-aarch64-darwin/__init__.py b/build/torch210-metal-aarch64-darwin/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch210-metal-aarch64-darwin/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch210-metal-aarch64-darwin/_activation_63b875f.abi3.so b/build/torch210-metal-aarch64-darwin/_activation_63b875f.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..ba3b331d8ec8fbebaa26c880f2be4824ae26de15 --- /dev/null +++ b/build/torch210-metal-aarch64-darwin/_activation_63b875f.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40b08339eb57c5db3a676d69eafc6d1be7cf14e71e57a544289e8922ab7c118c +size 221272 diff --git a/build/torch210-metal-aarch64-darwin/_ops.py b/build/torch210-metal-aarch64-darwin/_ops.py new file mode 
100644 index 0000000000000000000000000000000000000000..602229319b5ec8bd38c2cd107da58e1e9e968b8d --- /dev/null +++ b/build/torch210-metal-aarch64-darwin/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_63b875f +ops = torch.ops._activation_63b875f + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_63b875f::{op_name}" \ No newline at end of file diff --git a/build/torch210-metal-aarch64-darwin/activation/__init__.py b/build/torch210-metal-aarch64-darwin/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309 --- /dev/null +++ b/build/torch210-metal-aarch64-darwin/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import sys + +import importlib +from pathlib import Path +from types import ModuleType + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch210-metal-aarch64-darwin/layers.py b/build/torch210-metal-aarch64-darwin/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch210-metal-aarch64-darwin/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch210-metal-aarch64-darwin/metadata.json b/build/torch210-metal-aarch64-darwin/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8 --- /dev/null +++ b/build/torch210-metal-aarch64-darwin/metadata.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/build/torch211-cxx11-cu126-aarch64-linux/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch211-cxx11-cu126-aarch64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> 
None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_activation_cuda_5e1630d.abi3.so b/build/torch211-cxx11-cu126-aarch64-linux/_activation_cuda_5e1630d.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..b327bf7b9437c45d66e36e22e870c43bd975c0ef --- /dev/null +++ b/build/torch211-cxx11-cu126-aarch64-linux/_activation_cuda_5e1630d.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7015eb787a6fbbc2142ae85a8e169de810f27e09650870845d317305fa668eda +size 3224336 diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_ops.py b/build/torch211-cxx11-cu126-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c8caf619763d118e067bb91d329c09e99f4a54a4 --- /dev/null +++ b/build/torch211-cxx11-cu126-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_cuda_5e1630d +ops = torch.ops._activation_cuda_5e1630d + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. 
+ """ + return f"_activation_cuda_5e1630d::{op_name}" diff --git a/build/torch211-cxx11-cu126-aarch64-linux/activation/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-cu126-aarch64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. + path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-cu126-aarch64-linux/layers.py b/build/torch211-cxx11-cu126-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch211-cxx11-cu126-aarch64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch211-cxx11-cu126-aarch64-linux/metadata.json b/build/torch211-cxx11-cu126-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..f5902b55ab0b2b561c0cf97567c9806c60839c7f --- /dev/null +++ b/build/torch211-cxx11-cu126-aarch64-linux/metadata.json @@ -0,0 +1,18 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "7.0", + "7.2", + "7.5", + "8.0", + "8.6", + "8.7", + "8.9", + "9.0+PTX" + ] + } +} diff --git a/build/torch211-cxx11-cu126-x86_64-linux/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 
--- /dev/null +++ b/build/torch211-cxx11-cu126-x86_64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_activation_cuda_5e1630d.abi3.so b/build/torch211-cxx11-cu126-x86_64-linux/_activation_cuda_5e1630d.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..134342e0f66f86e950091ca47942e03d57012e47 --- /dev/null +++ b/build/torch211-cxx11-cu126-x86_64-linux/_activation_cuda_5e1630d.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c56d80bd2bbfb93ad648f3ef81e414d62a41c7b28d5221f51c5659ba1dd316b0 +size 3119768 diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_ops.py b/build/torch211-cxx11-cu126-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c8caf619763d118e067bb91d329c09e99f4a54a4 --- /dev/null +++ b/build/torch211-cxx11-cu126-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_cuda_5e1630d +ops = torch.ops._activation_cuda_5e1630d + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_cuda_5e1630d::{op_name}" diff --git a/build/torch211-cxx11-cu126-x86_64-linux/activation/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-cu126-x86_64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-cu126-x86_64-linux/layers.py b/build/torch211-cxx11-cu126-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch211-cxx11-cu126-x86_64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch211-cxx11-cu126-x86_64-linux/metadata.json b/build/torch211-cxx11-cu126-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..f5902b55ab0b2b561c0cf97567c9806c60839c7f --- /dev/null +++ b/build/torch211-cxx11-cu126-x86_64-linux/metadata.json @@ -0,0 +1,18 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "7.0", + "7.2", + "7.5", + "8.0", + "8.6", + "8.7", + "8.9", + "9.0+PTX" + ] + } +} diff --git a/build/torch211-cxx11-cu128-aarch64-linux/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch211-cxx11-cu128-aarch64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_activation_cuda_5e1630d.abi3.so b/build/torch211-cxx11-cu128-aarch64-linux/_activation_cuda_5e1630d.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..2745fbfab07edaa414c31c9253d18471b5379a88 --- /dev/null +++ b/build/torch211-cxx11-cu128-aarch64-linux/_activation_cuda_5e1630d.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:074f2ccbabf93779cef6f1c81167dbc6076a6787e059354128f4c84993d17b6b +size 4535168 diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_ops.py 
b/build/torch211-cxx11-cu128-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c8caf619763d118e067bb91d329c09e99f4a54a4 --- /dev/null +++ b/build/torch211-cxx11-cu128-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_cuda_5e1630d +ops = torch.ops._activation_cuda_5e1630d + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_cuda_5e1630d::{op_name}" diff --git a/build/torch211-cxx11-cu128-aarch64-linux/activation/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-cu128-aarch64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-cu128-aarch64-linux/layers.py b/build/torch211-cxx11-cu128-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch211-cxx11-cu128-aarch64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch211-cxx11-cu128-aarch64-linux/metadata.json b/build/torch211-cxx11-cu128-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8b796af185fbbd8594fcd846949aa5fadc0ccdda --- /dev/null +++ b/build/torch211-cxx11-cu128-aarch64-linux/metadata.json @@ -0,0 +1,21 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "10.1", + "12.0+PTX", + "7.0", + "7.2", + "7.5", + "8.0", + "8.6", + "8.7", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch211-cxx11-cu128-x86_64-linux/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_activation_cuda_5e1630d.abi3.so b/build/torch211-cxx11-cu128-x86_64-linux/_activation_cuda_5e1630d.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..ac0456c8ad3c21d35e256011415655bdb7598cb5 --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/_activation_cuda_5e1630d.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf04bec1ab934e3f8f0f60e94968067c85c4539daa4d2ffb345446debddf437a +size 4395464 diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_ops.py 
b/build/torch211-cxx11-cu128-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c8caf619763d118e067bb91d329c09e99f4a54a4 --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_cuda_5e1630d +ops = torch.ops._activation_cuda_5e1630d + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_cuda_5e1630d::{op_name}" diff --git a/build/torch211-cxx11-cu128-x86_64-linux/activation/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-cu128-x86_64-linux/layers.py b/build/torch211-cxx11-cu128-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch211-cxx11-cu128-x86_64-linux/metadata.json b/build/torch211-cxx11-cu128-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8b796af185fbbd8594fcd846949aa5fadc0ccdda --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/metadata.json @@ -0,0 +1,21 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "10.1", + "12.0+PTX", + "7.0", + "7.2", + "7.5", + "8.0", + "8.6", + "8.7", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch211-cxx11-cu130-aarch64-linux/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_activation_cuda_5e1630d.abi3.so b/build/torch211-cxx11-cu130-aarch64-linux/_activation_cuda_5e1630d.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..b68f2b6f7accc327eea552330930caf3abaa0fa6 --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/_activation_cuda_5e1630d.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e9bedacf7611cdde0c229d46ec84b159c62154e4053cd21db492118d8ccddf8 +size 4289720 diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_ops.py 
b/build/torch211-cxx11-cu130-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c8caf619763d118e067bb91d329c09e99f4a54a4 --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_cuda_5e1630d +ops = torch.ops._activation_cuda_5e1630d + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_cuda_5e1630d::{op_name}" diff --git a/build/torch211-cxx11-cu130-aarch64-linux/activation/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-cu130-aarch64-linux/layers.py b/build/torch211-cxx11-cu130-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch211-cxx11-cu130-aarch64-linux/metadata.json b/build/torch211-cxx11-cu130-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..66651b7d3f95ac9e5ce5fc2a641b6f0f50788f87 --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/metadata.json @@ -0,0 +1,19 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "11.0", + "12.0+PTX", + "7.5", + "8.0", + "8.6", + "8.7", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch211-cxx11-cu130-x86_64-linux/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_activation_cuda_5e1630d.abi3.so b/build/torch211-cxx11-cu130-x86_64-linux/_activation_cuda_5e1630d.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..c54cc49b26506d24d635da5fe1d54c7abc833401 --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/_activation_cuda_5e1630d.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fd7d9b1a41ac1bb3a64c392f13251390570a94f2021c0fcf8168ebd32e64099 +size 4183096 diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_ops.py 
b/build/torch211-cxx11-cu130-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c8caf619763d118e067bb91d329c09e99f4a54a4 --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_cuda_5e1630d +ops = torch.ops._activation_cuda_5e1630d + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_cuda_5e1630d::{op_name}" diff --git a/build/torch211-cxx11-cu130-x86_64-linux/activation/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-cu130-x86_64-linux/layers.py b/build/torch211-cxx11-cu130-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch211-cxx11-cu130-x86_64-linux/metadata.json b/build/torch211-cxx11-cu130-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..66651b7d3f95ac9e5ce5fc2a641b6f0f50788f87 --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/metadata.json @@ -0,0 +1,19 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "11.0", + "12.0+PTX", + "7.5", + "8.0", + "8.6", + "8.7", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch27-cxx11-cu118-x86_64-linux/activation/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bbf3ad846a76e365312ad965559a177976801396 Binary files /dev/null and b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc 
new file mode 100644 index 0000000000000000000000000000000000000000..47765ef8e985a500bbb3e25990387a1f1f15c767 Binary files /dev/null and b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de62862184381714910c79ecdf8db3ca14f8a753 Binary files /dev/null and b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_beeaae6.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_beeaae6.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..c6c9665f880b574481be0f6464ac7637e732df84 --- /dev/null +++ b/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_beeaae6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce06ec284ecd4ac5423d3822a60cd9eeb686d0054b38d66567de73e1137b0567 +size 2773632 diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py b/build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..4d722bffa37106dd2bfdb75db14408c7eecefcb0 --- /dev/null +++ b/build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_beeaae6 +ops = torch.ops._activation_beeaae6 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. 
+ """ + return f"_activation_beeaae6::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/layers.py b/build/torch27-cxx11-cu118-x86_64-linux/activation/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..0aec9c95fa75e4d3ff699ce69fc6618798b179c1 --- /dev/null +++ b/build/torch27-cxx11-cu118-x86_64-linux/activation/layers.py @@ -0,0 +1,179 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/activation/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29e76b5c619af9b19c5650edcfd4f63c4725d35f Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc 
new file mode 100644 index 0000000000000000000000000000000000000000..f54053b63e8c2b7598967b6ca9739ecc85d6142a Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d4a3c1172a3a2b4c954199c9762b3251d1c468c Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_beeaae6.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_beeaae6.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..e9e9102689a8ddf42f881abedcd19e137f22d5e4 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_beeaae6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a529bd105aca5081398d63329e829b6b159570424cd654d3a9f275ca9a720e82 +size 2852200 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..4d722bffa37106dd2bfdb75db14408c7eecefcb0 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_beeaae6 +ops = torch.ops._activation_beeaae6 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. 
+ """ + return f"_activation_beeaae6::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/layers.py b/build/torch27-cxx11-cu126-x86_64-linux/activation/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..0aec9c95fa75e4d3ff699ce69fc6618798b179c1 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/activation/layers.py @@ -0,0 +1,179 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch27-cxx11-cu128-aarch64-linux/activation/__init__.py b/build/torch27-cxx11-cu128-aarch64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch27-cxx11-cu128-aarch64-linux/activation/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch27-cxx11-cu128-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu128-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..390f279894bed7ce9346ede4953b9ffc9e1b1808 Binary files /dev/null and b/build/torch27-cxx11-cu128-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc 
b/build/torch27-cxx11-cu128-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..86ca448fc1e6e7e119172b94f978b4a88aeda3e1 Binary files /dev/null and b/build/torch27-cxx11-cu128-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch27-cxx11-cu128-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cbcd1da77da3529c73226d8ed8decfae8b9e5436 Binary files /dev/null and b/build/torch27-cxx11-cu128-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-aarch64-linux/activation/_activation_320b408.abi3.so b/build/torch27-cxx11-cu128-aarch64-linux/activation/_activation_320b408.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..4df8f1606a76b66c06d538cd25db8e894d282405 --- /dev/null +++ b/build/torch27-cxx11-cu128-aarch64-linux/activation/_activation_320b408.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34bdeb9ab72686850aef0a16b225b1b956162edb2cf46cba65c5e5b92ae267ae +size 4207000 diff --git a/build/torch27-cxx11-cu128-aarch64-linux/activation/_ops.py b/build/torch27-cxx11-cu128-aarch64-linux/activation/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..0fe83704e6d8850cb94dd0434fb763bff8e7e953 --- /dev/null +++ b/build/torch27-cxx11-cu128-aarch64-linux/activation/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_320b408 +ops = torch.ops._activation_320b408 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. 
+ """ + return f"_activation_320b408::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu128-aarch64-linux/activation/layers.py b/build/torch27-cxx11-cu128-aarch64-linux/activation/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..0aec9c95fa75e4d3ff699ce69fc6618798b179c1 --- /dev/null +++ b/build/torch27-cxx11-cu128-aarch64-linux/activation/layers.py @@ -0,0 +1,179 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/activation/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..364976ff5017b183a827c0dfcda90becfbab0e7c Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc 
new file mode 100644 index 0000000000000000000000000000000000000000..008e1b91db1ae539587989af1a212f9cd38a1ae2 Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d00f03a5b9a4944132d13ac0986acc2c54e0ca3c Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_beeaae6.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_beeaae6.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..6d8adc0f26f3b10cbc1b441b74bc7f49c0ebdaae --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_beeaae6.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f2cffcb6b5b9a49f03a2df46fc2ad36765676edecb468c233e78e1f5e21e206 +size 4127872 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..4d722bffa37106dd2bfdb75db14408c7eecefcb0 --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_beeaae6 +ops = torch.ops._activation_beeaae6 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. 
+ """ + return f"_activation_beeaae6::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/layers.py b/build/torch27-cxx11-cu128-x86_64-linux/activation/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..0aec9c95fa75e4d3ff699ce69fc6618798b179c1 --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/activation/layers.py @@ -0,0 +1,179 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch28-cxx11-cu126-aarch64-linux/activation/__init__.py b/build/torch28-cxx11-cu126-aarch64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1c4f207354093c6ef83eb5d7f3a5a3b22b95d357 --- /dev/null +++ b/build/torch28-cxx11-cu126-aarch64-linux/activation/__init__.py @@ -0,0 +1,57 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "layers", +] diff --git a/build/torch28-cxx11-cu126-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu126-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7aaa364368efe0e765de132c08296d189a969ede Binary files /dev/null and b/build/torch28-cxx11-cu126-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu126-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc7b128cfd05527bc856b66cdaf7d33691835eae Binary files /dev/null and b/build/torch28-cxx11-cu126-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc 
b/build/torch28-cxx11-cu126-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e547c241f319a637fa590b09ad35c1592aacce40 Binary files /dev/null and b/build/torch28-cxx11-cu126-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-aarch64-linux/activation/_activation_0c3eb4e_dirty.abi3.so b/build/torch28-cxx11-cu126-aarch64-linux/activation/_activation_0c3eb4e_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..8121d3da5057e1d53e4dee4b60de1e13285bd3e0 --- /dev/null +++ b/build/torch28-cxx11-cu126-aarch64-linux/activation/_activation_0c3eb4e_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02b62f5d045f370c3fb7c0e7ef458165feb987fba186b8cb9aee55c735a82e93 +size 2699928 diff --git a/build/torch28-cxx11-cu126-aarch64-linux/activation/_ops.py b/build/torch28-cxx11-cu126-aarch64-linux/activation/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..0f883290f823dd4b9ad1432d6644d25bcd3a4acf --- /dev/null +++ b/build/torch28-cxx11-cu126-aarch64-linux/activation/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_0c3eb4e_dirty +ops = torch.ops._activation_0c3eb4e_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_0c3eb4e_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu126-aarch64-linux/activation/layers.py b/build/torch28-cxx11-cu126-aarch64-linux/activation/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..45b31181ffb80509a85d729a7f7ee86fc2cf014a --- /dev/null +++ b/build/torch28-cxx11-cu126-aarch64-linux/activation/layers.py @@ -0,0 +1,128 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. 
+ + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch28-cxx11-cu126-x86_64-linux/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_activation_f8d6759.abi3.so b/build/torch28-cxx11-cu126-x86_64-linux/_activation_f8d6759.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..cabcacd16040aad8134b2892ea8f1f9781a9a78b --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/_activation_f8d6759.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf76431ff46ef5bc002ce8813eeed3ae9618a15094d98ef4b164f7a10a54f0bc +size 3121056 diff --git a/build/torch28-cxx11-cu126-x86_64-linux/_ops.py b/build/torch28-cxx11-cu126-x86_64-linux/_ops.py 
new file mode 100644 index 0000000000000000000000000000000000000000..140c6e96b3f93ce5b359648edac4dcb2913b8324 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_f8d6759 +ops = torch.ops._activation_f8d6759 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_f8d6759::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu126-x86_64-linux/activation/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import sys + +import importlib +from pathlib import Path +from types import ModuleType + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch28-cxx11-cu126-x86_64-linux/layers.py b/build/torch28-cxx11-cu126-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch28-cxx11-cu126-x86_64-linux/metadata.json b/build/torch28-cxx11-cu126-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/metadata.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-aarch64-linux/activation/__init__.py b/build/torch28-cxx11-cu128-aarch64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1c4f207354093c6ef83eb5d7f3a5a3b22b95d357 --- /dev/null +++ b/build/torch28-cxx11-cu128-aarch64-linux/activation/__init__.py @@ -0,0 +1,57 @@ +import torch + +from ._ops import ops + +from . import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + 
"gelu_fast", + "gelu_new", + "gelu_quick", + "layers", +] diff --git a/build/torch28-cxx11-cu128-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu128-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bdb5a121a09f628a672c404f5207f691347f83c5 Binary files /dev/null and b/build/torch28-cxx11-cu128-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu128-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d8c166048d114380e068ca6448ab46ef96da034 Binary files /dev/null and b/build/torch28-cxx11-cu128-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch28-cxx11-cu128-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea0551b7b1c5e408b9875b62598f6f5f0b489a30 Binary files /dev/null and b/build/torch28-cxx11-cu128-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-aarch64-linux/activation/_activation_0c3eb4e_dirty.abi3.so b/build/torch28-cxx11-cu128-aarch64-linux/activation/_activation_0c3eb4e_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..f1d23623c037de97ee0207fe5f750d8ba9863d3c --- /dev/null +++ b/build/torch28-cxx11-cu128-aarch64-linux/activation/_activation_0c3eb4e_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f7fe0a00eaf2e228f237ee3058ac9eb2c6fbc4927b1276d0f566bb05bb043b9 +size 3683080 diff --git a/build/torch28-cxx11-cu128-aarch64-linux/activation/_ops.py b/build/torch28-cxx11-cu128-aarch64-linux/activation/_ops.py 
new file mode 100644 index 0000000000000000000000000000000000000000..0f883290f823dd4b9ad1432d6644d25bcd3a4acf --- /dev/null +++ b/build/torch28-cxx11-cu128-aarch64-linux/activation/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_0c3eb4e_dirty +ops = torch.ops._activation_0c3eb4e_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_0c3eb4e_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-aarch64-linux/activation/layers.py b/build/torch28-cxx11-cu128-aarch64-linux/activation/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..45b31181ffb80509a85d729a7f7ee86fc2cf014a --- /dev/null +++ b/build/torch28-cxx11-cu128-aarch64-linux/activation/layers.py @@ -0,0 +1,128 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch28-cxx11-cu128-x86_64-linux/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_activation_f8d6759.abi3.so b/build/torch28-cxx11-cu128-x86_64-linux/_activation_f8d6759.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..564e3aa415dbcea5a132bfb14301b4900373fb58 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/_activation_f8d6759.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dc0a42d5ebcae09615265a3635bb90d33c76d9179fcfcec17fb2fc5cb16b7f5 +size 4400792 diff --git a/build/torch28-cxx11-cu128-x86_64-linux/_ops.py b/build/torch28-cxx11-cu128-x86_64-linux/_ops.py 
new file mode 100644 index 0000000000000000000000000000000000000000..140c6e96b3f93ce5b359648edac4dcb2913b8324 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_f8d6759 +ops = torch.ops._activation_f8d6759 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_f8d6759::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-x86_64-linux/activation/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import sys + +import importlib +from pathlib import Path +from types import ModuleType + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch28-cxx11-cu128-x86_64-linux/layers.py b/build/torch28-cxx11-cu128-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch28-cxx11-cu128-x86_64-linux/metadata.json b/build/torch28-cxx11-cu128-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/metadata.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-aarch64-linux/activation/__init__.py b/build/torch28-cxx11-cu129-aarch64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch28-cxx11-cu129-aarch64-linux/activation/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: 
torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch28-cxx11-cu129-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu129-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e53c600baf751d47e3c75f0ea262aaa74cbaa2a0 Binary files /dev/null and b/build/torch28-cxx11-cu129-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu129-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cfe526dc3c92a5c7b1a46084e58d4448fc74b15b Binary files /dev/null and b/build/torch28-cxx11-cu129-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch28-cxx11-cu129-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..878be1d140d35a1a92eb1b870cd3ccc0bbb65128 Binary files /dev/null and b/build/torch28-cxx11-cu129-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-aarch64-linux/activation/_activation_320b408.abi3.so b/build/torch28-cxx11-cu129-aarch64-linux/activation/_activation_320b408.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..485825618d1d0c2e93123fe5197999883b59b748 --- /dev/null +++ 
b/build/torch28-cxx11-cu129-aarch64-linux/activation/_activation_320b408.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3af83bae80c8641200010ba586e5a2cac271fa4fcd344e3532ea7d5094fd7c17 +size 4275744 diff --git a/build/torch28-cxx11-cu129-aarch64-linux/activation/_ops.py b/build/torch28-cxx11-cu129-aarch64-linux/activation/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..0fe83704e6d8850cb94dd0434fb763bff8e7e953 --- /dev/null +++ b/build/torch28-cxx11-cu129-aarch64-linux/activation/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_320b408 +ops = torch.ops._activation_320b408 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_320b408::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-aarch64-linux/activation/layers.py b/build/torch28-cxx11-cu129-aarch64-linux/activation/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..0aec9c95fa75e4d3ff699ce69fc6618798b179c1 --- /dev/null +++ b/build/torch28-cxx11-cu129-aarch64-linux/activation/layers.py @@ -0,0 +1,179 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch28-cxx11-cu129-x86_64-linux/__init__.py 
b/build/torch28-cxx11-cu129-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_activation_f8d6759.abi3.so b/build/torch28-cxx11-cu129-x86_64-linux/_activation_f8d6759.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..ff0d1df159bdd317b6293331073a9aab2d4bd06c --- /dev/null +++ 
b/build/torch28-cxx11-cu129-x86_64-linux/_activation_f8d6759.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48d0f9b82abd2e6d7154889814140b789e2d4452aac1296d921c9a2d4ab19e91 +size 4438672 diff --git a/build/torch28-cxx11-cu129-x86_64-linux/_ops.py b/build/torch28-cxx11-cu129-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..140c6e96b3f93ce5b359648edac4dcb2913b8324 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_f8d6759 +ops = torch.ops._activation_f8d6759 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_f8d6759::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-x86_64-linux/activation/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import sys + +import importlib +from pathlib import Path +from types import ModuleType + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch28-cxx11-cu129-x86_64-linux/layers.py b/build/torch28-cxx11-cu129-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch28-cxx11-cu129-x86_64-linux/metadata.json b/build/torch28-cxx11-cu129-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/metadata.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/build/torch29-cxx11-cu126-aarch64-linux/activation/__init__.py b/build/torch29-cxx11-cu126-aarch64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch29-cxx11-cu126-aarch64-linux/activation/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: 
torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch29-cxx11-cu126-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch29-cxx11-cu126-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60dc82724c779cfa41bd9b8dcf39c036e2a50109 Binary files /dev/null and b/build/torch29-cxx11-cu126-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu126-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch29-cxx11-cu126-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48cda67561066b31e84ee5ecebcf0ef61e1ad322 Binary files /dev/null and b/build/torch29-cxx11-cu126-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu126-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch29-cxx11-cu126-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0082ca0b0e28577622a3e430602fabe010369318 Binary files /dev/null and b/build/torch29-cxx11-cu126-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu126-aarch64-linux/activation/_activation_320b408.abi3.so b/build/torch29-cxx11-cu126-aarch64-linux/activation/_activation_320b408.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..41c75640cfdc7eeff3d57f4a6d403f7e7f10b8d8 --- /dev/null +++ 
b/build/torch29-cxx11-cu126-aarch64-linux/activation/_activation_320b408.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9c24e0eb75a09a9fc19e7096276d560226f198617291681c1a18e94002a629e +size 2963480 diff --git a/build/torch29-cxx11-cu126-aarch64-linux/activation/_ops.py b/build/torch29-cxx11-cu126-aarch64-linux/activation/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..0fe83704e6d8850cb94dd0434fb763bff8e7e953 --- /dev/null +++ b/build/torch29-cxx11-cu126-aarch64-linux/activation/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_320b408 +ops = torch.ops._activation_320b408 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_320b408::{op_name}" \ No newline at end of file diff --git a/build/torch29-cxx11-cu126-aarch64-linux/activation/layers.py b/build/torch29-cxx11-cu126-aarch64-linux/activation/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..0aec9c95fa75e4d3ff699ce69fc6618798b179c1 --- /dev/null +++ b/build/torch29-cxx11-cu126-aarch64-linux/activation/layers.py @@ -0,0 +1,179 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch29-cxx11-cu126-x86_64-linux/__init__.py 
b/build/torch29-cxx11-cu126-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch29-cxx11-cu126-x86_64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_activation_63b875f.abi3.so b/build/torch29-cxx11-cu126-x86_64-linux/_activation_63b875f.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..3bb70b2a77f8c7dd8f0125e896cfca9359138ff9 --- /dev/null +++ 
b/build/torch29-cxx11-cu126-x86_64-linux/_activation_63b875f.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c84b682f2dd4437835661f57f031d96865871f6f4ab25f5651d4f577acee1326 +size 3121128 diff --git a/build/torch29-cxx11-cu126-x86_64-linux/_ops.py b/build/torch29-cxx11-cu126-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..602229319b5ec8bd38c2cd107da58e1e9e968b8d --- /dev/null +++ b/build/torch29-cxx11-cu126-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_63b875f +ops = torch.ops._activation_63b875f + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_63b875f::{op_name}" \ No newline at end of file diff --git a/build/torch29-cxx11-cu126-x86_64-linux/activation/__init__.py b/build/torch29-cxx11-cu126-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309 --- /dev/null +++ b/build/torch29-cxx11-cu126-x86_64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import sys + +import importlib +from pathlib import Path +from types import ModuleType + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch29-cxx11-cu126-x86_64-linux/layers.py b/build/torch29-cxx11-cu126-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch29-cxx11-cu126-x86_64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch29-cxx11-cu126-x86_64-linux/metadata.json b/build/torch29-cxx11-cu126-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8 --- /dev/null +++ b/build/torch29-cxx11-cu126-x86_64-linux/metadata.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/build/torch29-cxx11-cu128-aarch64-linux/activation/__init__.py b/build/torch29-cxx11-cu128-aarch64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/activation/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: 
torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch29-cxx11-cu128-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch29-cxx11-cu128-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d338b4d5170fa0130189f67e65562998f8f42be Binary files /dev/null and b/build/torch29-cxx11-cu128-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu128-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch29-cxx11-cu128-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be7ffd679d4afbc36ea076dbc57e3162a60bd409 Binary files /dev/null and b/build/torch29-cxx11-cu128-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu128-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch29-cxx11-cu128-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e50041e74611417f4e4037e568a9e041780a5e32 Binary files /dev/null and b/build/torch29-cxx11-cu128-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu128-aarch64-linux/activation/_activation_320b408.abi3.so b/build/torch29-cxx11-cu128-aarch64-linux/activation/_activation_320b408.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..dc83e4989904884309410757826ec095ea0fdfe4 --- /dev/null +++ 
b/build/torch29-cxx11-cu128-aarch64-linux/activation/_activation_320b408.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08ee3dfa4d481eaf44ac3c11a0843598c05950f779dba66abd468fecb7839b32 +size 4208760 diff --git a/build/torch29-cxx11-cu128-aarch64-linux/activation/_ops.py b/build/torch29-cxx11-cu128-aarch64-linux/activation/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..0fe83704e6d8850cb94dd0434fb763bff8e7e953 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/activation/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_320b408 +ops = torch.ops._activation_320b408 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_320b408::{op_name}" \ No newline at end of file diff --git a/build/torch29-cxx11-cu128-aarch64-linux/activation/layers.py b/build/torch29-cxx11-cu128-aarch64-linux/activation/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..0aec9c95fa75e4d3ff699ce69fc6618798b179c1 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/activation/layers.py @@ -0,0 +1,179 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch29-cxx11-cu128-x86_64-linux/__init__.py 
b/build/torch29-cxx11-cu128-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch29-cxx11-cu128-x86_64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_activation_63b875f.abi3.so b/build/torch29-cxx11-cu128-x86_64-linux/_activation_63b875f.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..e2e49fb0c5b136351663cc36a368639afff8a47c --- /dev/null +++ 
b/build/torch29-cxx11-cu128-x86_64-linux/_activation_63b875f.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3afed8f50b04121c408e2b7fc8f4920015ba696b97e54be8e165cbbdd7039d6b +size 4400864 diff --git a/build/torch29-cxx11-cu128-x86_64-linux/_ops.py b/build/torch29-cxx11-cu128-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..602229319b5ec8bd38c2cd107da58e1e9e968b8d --- /dev/null +++ b/build/torch29-cxx11-cu128-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_63b875f +ops = torch.ops._activation_63b875f + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_63b875f::{op_name}" \ No newline at end of file diff --git a/build/torch29-cxx11-cu128-x86_64-linux/activation/__init__.py b/build/torch29-cxx11-cu128-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309 --- /dev/null +++ b/build/torch29-cxx11-cu128-x86_64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import sys + +import importlib +from pathlib import Path +from types import ModuleType + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch29-cxx11-cu128-x86_64-linux/layers.py b/build/torch29-cxx11-cu128-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch29-cxx11-cu128-x86_64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch29-cxx11-cu128-x86_64-linux/metadata.json b/build/torch29-cxx11-cu128-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8 --- /dev/null +++ b/build/torch29-cxx11-cu128-x86_64-linux/metadata.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/build/torch29-cxx11-cu129-aarch64-linux/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) 
-> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_activation_cuda_5e1630d.abi3.so b/build/torch29-cxx11-cu129-aarch64-linux/_activation_cuda_5e1630d.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..be95d14353d33115dffd14ed26748e19227084ff --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/_activation_cuda_5e1630d.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62c3d96bdf09677af2537ac4a9f4cf67a241c6bd4a2888771faaa9e16c0973f4 +size 4538112 diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_ops.py b/build/torch29-cxx11-cu129-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c8caf619763d118e067bb91d329c09e99f4a54a4 --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_cuda_5e1630d +ops = torch.ops._activation_cuda_5e1630d + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. 
+ """ + return f"_activation_cuda_5e1630d::{op_name}" diff --git a/build/torch29-cxx11-cu129-aarch64-linux/activation/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. + path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch29-cxx11-cu129-aarch64-linux/layers.py b/build/torch29-cxx11-cu129-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch29-cxx11-cu129-aarch64-linux/metadata.json b/build/torch29-cxx11-cu129-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8b796af185fbbd8594fcd846949aa5fadc0ccdda --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/metadata.json @@ -0,0 +1,21 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "10.1", + "12.0+PTX", + "7.0", + "7.2", + "7.5", + "8.0", + "8.6", + "8.7", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch29-cxx11-cu129-x86_64-linux/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_activation_cuda_5e1630d.abi3.so b/build/torch29-cxx11-cu129-x86_64-linux/_activation_cuda_5e1630d.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..388c9cd5d4f2977fc150126c3672e629aba36da2 --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/_activation_cuda_5e1630d.abi3.so @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:99e5f0df9d07f3bc16feeaffb9863d669f677695c856045954c266c45246dc43 +size 4438768 diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_ops.py b/build/torch29-cxx11-cu129-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c8caf619763d118e067bb91d329c09e99f4a54a4 --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_cuda_5e1630d +ops = torch.ops._activation_cuda_5e1630d + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_cuda_5e1630d::{op_name}" diff --git a/build/torch29-cxx11-cu129-x86_64-linux/activation/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch29-cxx11-cu129-x86_64-linux/layers.py b/build/torch29-cxx11-cu129-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch29-cxx11-cu129-x86_64-linux/metadata.json b/build/torch29-cxx11-cu129-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8b796af185fbbd8594fcd846949aa5fadc0ccdda --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/metadata.json @@ -0,0 +1,21 @@ +{ + "version": 1, + "license": "Apache-2.0", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "10.1", + "12.0+PTX", + "7.0", + "7.2", + "7.5", + "8.0", + "8.6", + "8.7", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch29-cxx11-cu130-aarch64-linux/activation/__init__.py b/build/torch29-cxx11-cu130-aarch64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch29-cxx11-cu130-aarch64-linux/activation/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu(out, x) + return out + +def silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu(out, x) + return out + + +def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh(out, x) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch29-cxx11-cu130-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch29-cxx11-cu130-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21696c8710d6b717d92ebd34545a9ac97cc44942 Binary files /dev/null and b/build/torch29-cxx11-cu130-aarch64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu130-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc 
b/build/torch29-cxx11-cu130-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1856969205a3825653d4be5e4c267a9585ff6594 Binary files /dev/null and b/build/torch29-cxx11-cu130-aarch64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu130-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch29-cxx11-cu130-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8e0f48d49bb34730201d17d0795310d829e20cb Binary files /dev/null and b/build/torch29-cxx11-cu130-aarch64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch29-cxx11-cu130-aarch64-linux/activation/_activation_320b408.abi3.so b/build/torch29-cxx11-cu130-aarch64-linux/activation/_activation_320b408.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..02267d619c1ad4c0bb7f84b243e5456c6bf7c798 --- /dev/null +++ b/build/torch29-cxx11-cu130-aarch64-linux/activation/_activation_320b408.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73748b54059552f5983322f7dedc36ed349b38ad6fb9318301bb4965b1fe49aa +size 4094968 diff --git a/build/torch29-cxx11-cu130-aarch64-linux/activation/_ops.py b/build/torch29-cxx11-cu130-aarch64-linux/activation/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..0fe83704e6d8850cb94dd0434fb763bff8e7e953 --- /dev/null +++ b/build/torch29-cxx11-cu130-aarch64-linux/activation/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_320b408 +ops = torch.ops._activation_320b408 + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. 
+ """ + return f"_activation_320b408::{op_name}" \ No newline at end of file diff --git a/build/torch29-cxx11-cu130-aarch64-linux/activation/layers.py b/build/torch29-cxx11-cu130-aarch64-linux/activation/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..0aec9c95fa75e4d3ff699ce69fc6618798b179c1 --- /dev/null +++ b/build/torch29-cxx11-cu130-aarch64-linux/activation/layers.py @@ -0,0 +1,179 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch29-cxx11-cu130-x86_64-linux/__init__.py b/build/torch29-cxx11-cu130-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9cd15a0a75f95c5ab956fb05c2a9860f218156 --- /dev/null +++ b/build/torch29-cxx11-cu130-x86_64-linux/__init__.py @@ -0,0 +1,75 @@ +import torch + +from ._ops import ops + +from . 
def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    """SwiGLU: out = silu(x[..., :d]) * x[..., d:], d = x.shape[-1] // 2.

    Writes into ``out`` in place and returns it for convenience.
    """
    ops.silu_and_mul(out, x)
    return out


def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    """SwiGLU variant: out = x[..., :d] * silu(x[..., d:]). Returns ``out``."""
    ops.mul_and_silu(out, x)
    return out


def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    """GeGLU: out = gelu(x[..., :d]) * x[..., d:]. Returns ``out``."""
    ops.gelu_and_mul(out, x)
    return out


def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    """GeGLU with tanh-approximated GELU, written into ``out``. Returns ``out``."""
    ops.gelu_tanh_and_mul(out, x)
    return out


def fatrelu_and_mul(
    out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0
) -> torch.Tensor:
    """FATReLU-gated product of the two halves of ``x``, written into ``out``.

    ``threshold`` is forwarded verbatim to the kernel. Returns ``out``.
    """
    ops.fatrelu_and_mul(out, x, threshold)
    return out


def gelu(out: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    """Elementwise GELU of ``x`` into ``out``. Returns ``out``."""
    ops.gelu(out, x)
    return out


def silu(out: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    """Elementwise SiLU of ``x`` into ``out``. Returns ``out``."""
    ops.silu(out, x)
    return out


def gelu_tanh(out: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    """Elementwise tanh-approximated GELU of ``x`` into ``out``. Returns ``out``."""
    ops.gelu_tanh(out, x)
    return out


def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    """Kernel-backed "fast" GELU variant of ``x`` into ``out``. Returns ``out``."""
    ops.gelu_fast(out, x)
    return out


def gelu_new(out: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    """Kernel-backed "new" GELU variant of ``x`` into ``out``. Returns ``out``."""
    ops.gelu_new(out, x)
    return out


def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    """Kernel-backed quick-GELU variant of ``x`` into ``out``. Returns ``out``."""
    ops.gelu_quick(out, x)
    return out


__all__ = [
    "silu_and_mul",
    "mul_and_silu",
    "gelu_and_mul",
    "gelu_tanh_and_mul",
    "fatrelu_and_mul",
    "gelu_fast",
    "gelu_new",
    "gelu_quick",
    "gelu_tanh",
    "silu",
    "gelu",
    "layers",
]
# Lazy op namespace; the ops themselves are registered when the bundled
# extension (_activation_63b875f) is imported by the package.
ops = torch.ops._activation_63b875f


def add_op_namespace_prefix(op_name: str) -> str:
    """
    Prefix op by namespace.

    Ops are registered under the build-specific namespace
    ``_activation_63b875f``, so schema lookups must carry that prefix.
    """
    return f"_activation_63b875f::{op_name}"


def _import_from_path(file_path: Path) -> ModuleType:
    """Import ``file_path`` as a module registered under a unique name.

    We cannot use the module name as-is: after adding it to ``sys.modules``
    it would also be used for other imports.  So the registered name is the
    hex-encoded hash of the absolute path, which is unique per location.
    """
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    if module is None:
        raise ImportError(f"Cannot load module {module_name} from spec")
    sys.modules[module_name] = module
    spec.loader.exec_module(module)  # type: ignore
    return module


class SiluAndMul(nn.Module):
    """An activation function for SwiGLU.

    The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    can_torch_compile: bool = True

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Kernel expects a dense layout; normalize non-contiguous inputs.
        if not x.is_contiguous():
            x = x.contiguous()
        d = x.shape[-1] // 2
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        ops.silu_and_mul(out, x)
        return out


class Silu(nn.Module):
    """Elementwise SiLU: x -> silu(x); output shape == input shape."""

    can_torch_compile: bool = True

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not x.is_contiguous():
            x = x.contiguous()
        out = torch.empty_like(x)
        ops.silu(out, x)
        return out


class Gelu(nn.Module):
    """Elementwise GELU: x -> gelu(x); output shape == input shape."""

    can_torch_compile: bool = True

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not x.is_contiguous():
            x = x.contiguous()
        out = torch.empty_like(x)
        ops.gelu(out, x)
        return out


class GeluTanh(nn.Module):
    """Elementwise GELU with tanh approximation; output shape == input shape."""

    can_torch_compile: bool = True

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not x.is_contiguous():
            x = x.contiguous()
        out = torch.empty_like(x)
        ops.gelu_tanh(out, x)
        return out


class MulAndSilu(nn.Module):
    """An activation function for SwiGLU.

    The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    can_torch_compile: bool = True

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not x.is_contiguous():
            x = x.contiguous()
        d = x.shape[-1] // 2
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        ops.mul_and_silu(out, x)
        return out


class GeluAndMul(nn.Module):
    """An activation function for GeGLU.

    The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.

    Shapes:
        x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
        return: (batch_size, seq_len, d) or (num_tokens, d)
    """

    can_torch_compile: bool = True

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not x.is_contiguous():
            x = x.contiguous()
        d = x.shape[-1] // 2
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        ops.gelu_and_mul(out, x)
        return out


class GeluTanhAndMul(nn.Module):
    """GeGLU with tanh-approximated GELU: x -> gelu_tanh(x[:d]) * x[d:]."""

    can_torch_compile: bool = True

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not x.is_contiguous():
            x = x.contiguous()
        d = x.shape[-1] // 2
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        ops.gelu_tanh_and_mul(out, x)
        return out


class FatreluAndMul(nn.Module):
    """An activation function for FATReLU.

    The function computes x -> FATReLU(x[:d]) * x[d:] where
    d = x.shape[-1] // 2.
    This is used in openbmb/MiniCPM-S-1B-sft.

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    can_torch_compile: bool = True

    def __init__(self, threshold: float = 0.0):
        super().__init__()
        # Cut-off value forwarded verbatim to the kernel.
        self.threshold = threshold

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not x.is_contiguous():
            x = x.contiguous()
        d = x.shape[-1] // 2
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        ops.fatrelu_and_mul(out, x, self.threshold)
        return out


class FastGELU(nn.Module):
    """Elementwise kernel-backed "fast" GELU variant; output shape == input shape."""

    can_torch_compile: bool = True

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not x.is_contiguous():
            x = x.contiguous()
        out = torch.empty_like(x)
        ops.gelu_fast(out, x)
        return out


class NewGELU(nn.Module):
    """Elementwise kernel-backed "new" GELU variant; output shape == input shape."""

    can_torch_compile: bool = True

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not x.is_contiguous():
            x = x.contiguous()
        out = torch.empty_like(x)
        ops.gelu_new(out, x)
        return out


class QuickGELU(nn.Module):
    """Elementwise kernel-backed quick-GELU variant; output shape == input shape."""

    can_torch_compile: bool = True

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not x.is_contiguous():
            x = x.contiguous()
        out = torch.empty_like(x)
        ops.gelu_quick(out, x)
        return out
ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "mul_and_silu", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "gelu_tanh", + "silu", + "gelu", + "layers", +] diff --git a/build/torch29-metal-aarch64-darwin/_activation_63b875f.abi3.so b/build/torch29-metal-aarch64-darwin/_activation_63b875f.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..986b7947b413077b8d8acf3967a52ee556212268 --- /dev/null +++ b/build/torch29-metal-aarch64-darwin/_activation_63b875f.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:884e887217a67931f5a59b3c39487acb754ff51282adb6b13b5db669e39cb12e +size 220504 diff --git a/build/torch29-metal-aarch64-darwin/_ops.py b/build/torch29-metal-aarch64-darwin/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..602229319b5ec8bd38c2cd107da58e1e9e968b8d --- /dev/null +++ b/build/torch29-metal-aarch64-darwin/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_63b875f +ops = torch.ops._activation_63b875f + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_63b875f::{op_name}" \ No newline at end of file diff --git a/build/torch29-metal-aarch64-darwin/activation/__init__.py b/build/torch29-metal-aarch64-darwin/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03dbc1afe1cf156661a2b1b22003cd5f599a0309 --- /dev/null +++ b/build/torch29-metal-aarch64-darwin/activation/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import sys + +import importlib +from pathlib import Path +from types import ModuleType + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. 
So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. + path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch29-metal-aarch64-darwin/layers.py b/build/torch29-metal-aarch64-darwin/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2f66f39d58561e0ff9d43eb943fac9e92e6a8259 --- /dev/null +++ b/build/torch29-metal-aarch64-darwin/layers.py @@ -0,0 +1,201 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + +class Silu(nn.Module): + """An activation function for SiLU. + + The function computes x -> silu(x). 
+ + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.silu(out, x) + return out + +class Gelu(nn.Module): + """An activation function for GELU. + + The function computes x -> gelu(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu(out, x) + return out + +class GeluTanh(nn.Module): + """An activation function for GELU with `tanh` approximation. + + The function computes x -> gelu_tanh(x). + + Shapes: + x: (num_tokens, d) or (batch_size, seq_len, d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_tanh(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + if not x.is_contiguous(): + x = x.contiguous() + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + 
can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not x.is_contiguous(): + x = x.contiguous() + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch29-metal-aarch64-darwin/metadata.json b/build/torch29-metal-aarch64-darwin/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9cf5deed9898dce769f4cc73913d3530b92a0bd8 --- /dev/null +++ b/build/torch29-metal-aarch64-darwin/metadata.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file