drbh committed on
Commit
69b4990
·
unverified ·
0 Parent(s):

Migrated from kernels-community/finegrained-fp8

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
build/torch-cuda/__init__.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .act_quant import fp8_act_quant
2
+ from .batched import (
3
+ w8a8_fp8_matmul_batched,
4
+ w8a8_block_fp8_matmul_batched,
5
+ w8a8_tensor_fp8_matmul_batched,
6
+ )
7
+ from .grouped import (
8
+ w8a8_fp8_matmul_grouped,
9
+ w8a8_block_fp8_matmul_grouped,
10
+ w8a8_tensor_fp8_matmul_grouped,
11
+ )
12
+ from .matmul import (
13
+ w8a8_fp8_matmul,
14
+ w8a8_block_fp8_matmul,
15
+ w8a8_tensor_fp8_matmul,
16
+ )
17
+
18
+ __all__ = [
19
+ "fp8_act_quant",
20
+ # Single matmul
21
+ "w8a8_fp8_matmul",
22
+ "w8a8_block_fp8_matmul",
23
+ "w8a8_tensor_fp8_matmul",
24
+ # Batched matmul
25
+ "w8a8_fp8_matmul_batched",
26
+ "w8a8_block_fp8_matmul_batched",
27
+ "w8a8_tensor_fp8_matmul_batched",
28
+ # Grouped matmul
29
+ "w8a8_fp8_matmul_grouped",
30
+ "w8a8_block_fp8_matmul_grouped",
31
+ "w8a8_tensor_fp8_matmul_grouped",
32
+ ]
build/torch-cuda/_ops.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
import torch

# Handle to this build's registered op namespace on torch.ops.
ops = torch.ops._finegrained_fp8_75cbe1b


def add_op_namespace_prefix(op_name: str):
    """Return *op_name* qualified with this build's op namespace."""
    return "_finegrained_fp8_75cbe1b::" + op_name
build/torch-cuda/act_quant.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import triton
17
+ import triton.language as tl
18
+ from torch.library import triton_op, wrap_triton
19
+
20
+ from .utils import device_context
21
+
22
+
23
+ _FP8_DTYPE = torch.float8_e4m3fn
24
+
25
+
26
# Copied from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py
@triton.jit
def _fp8_act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
    """Quantize one contiguous BLOCK_SIZE-element block of x with a dynamic scale.

    Each program computes ``s = max(|x_block|) / 448`` (448 is the
    float8_e4m3fn max), stores the scaled block in the output dtype, and
    stores ``s`` for the block.
    """
    pid = tl.program_id(axis=0)
    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    x = tl.load(x_ptr + offs).to(tl.float32)
    s = tl.max(tl.abs(x)) / 448.0  # float8_e4m3fn max
    # Floor the divisor at 1e-12 so an all-zero block does not produce NaNs
    # (0 / 0). This matches the guard used by the fused matmul kernels; the
    # stored scale stays exact, so dequantizing an all-zero block still
    # yields zeros.
    y = (x / tl.maximum(s, 1e-12)).to(y_ptr.dtype.element_ty)
    tl.store(y_ptr + offs, y)
    tl.store(s_ptr + pid, s)
36
+
37
+
38
@triton_op("finegrained_fp8::fp8_act_quant", mutates_args=())
def _fp8_act_quant(
    x: torch.Tensor, block_size: int = 128
) -> tuple[torch.Tensor, torch.Tensor]:
    """Registered op backing :func:`fp8_act_quant`: per-block FP8 quantization."""
    assert x.is_contiguous()
    assert x.shape[-1] % block_size == 0
    # One scale per block along the last dimension.
    scale_shape = (*x.size()[:-1], x.size(-1) // block_size)
    scales = x.new_empty(scale_shape, dtype=torch.float32)
    quantized = torch.empty_like(x, dtype=_FP8_DTYPE)
    # One program per block over the flattened tensor (valid because x is
    # contiguous and its last dim divides evenly into blocks).
    launch_grid = (triton.cdiv(x.numel(), block_size),)

    with device_context(x.device):
        wrap_triton(_fp8_act_quant_kernel)[launch_grid](
            x, quantized, scales, BLOCK_SIZE=block_size
        )

    return quantized, scales
52
+
53
+
54
def fp8_act_quant(
    x: torch.Tensor, block_size: int = 128
) -> tuple[torch.Tensor, torch.Tensor]:
    """Dynamically quantize activations to ``float8_e4m3fn``, block by block.

    The last dimension of ``x`` is split into contiguous blocks of
    ``block_size`` elements; each block is scaled by its own
    ``max(|block|) / 448`` (the float8_e4m3fn maximum) before the cast.

    Args:
        x: Contiguous bf16/fp16/fp32 tensor whose last dimension is a
            multiple of ``block_size``.
        block_size: Elements per quantization block (default: 128).

    Returns:
        A ``(quantized, scales)`` pair: ``quantized`` keeps ``x``'s shape
        with dtype ``float8_e4m3fn``; ``scales`` is float32 of shape
        ``(*x.shape[:-1], x.shape[-1] // block_size)``.
    """
    return torch.ops.finegrained_fp8.fp8_act_quant(x, block_size)
build/torch-cuda/batched.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import triton
17
+ import triton.language as tl
18
+ from .act_quant import fp8_act_quant
19
+ from torch.library import triton_op, wrap_triton
20
+
21
+ from .utils import device_context
22
+
23
+
24
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=w, num_stages=s)
        for w in [2, 4, 8, 16]
        for s in [2, 3, 4, 5]
    ],
    key=["N", "K"],
)
@triton.jit
def w8a8_block_fp8_matmul_batched_kernel(
    A,  # (S, K) raw BF16/FP16 activations
    B,  # (E, N, K) FP8 weight matrices
    C,  # (S, N) output
    Bs,  # (E, N // BLOCK_SIZE_N, K // BLOCK_SIZE_K) weight scales
    ExpertIds,  # (S,) — which expert each batch element routes to
    # Shape
    S,
    N,
    K,
    # Strides
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_bs_e,
    stride_bs_k,
    stride_bs_n,
    # Meta-parameters
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
):
    """Block-scale batched FP8 expert matmul kernel.

    Each program handles one routed token row and one N-tile, looks up the
    owning expert from ``ExpertIds``, and applies fused activation quantization.
    """
    batch_id = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)

    # Cast expert_id to int64 to prevent int32 overflow when computing
    # expert_id * stride_be (e.g. 255 * 9_437_184 > 2^31 for 256 experts of
    # 3072×3072 FP8 weights).
    expert_id = tl.load(ExpertIds + batch_id).to(tl.int64)

    # Rebase pointers onto this token's row and its expert's weight/scale slabs.
    A = A + batch_id * stride_am
    B = B + expert_id * stride_be
    C = C + batch_id * stride_cm
    Bs = Bs + expert_id * stride_bs_e

    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # The `* 0` row stride makes all BLOCK_SIZE_M rows alias the same single
    # token row: tl.dot needs an M dimension, so the one row is broadcast and
    # every M lane computes the same result.
    a_ptrs = A + tl.arange(0, BLOCK_SIZE_M)[:, None] * 0 + offs_k[None, :] * stride_ak
    b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)

    bs_ptrs = Bs + pid_n * stride_bs_n

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # ---- fused fp8_act_quant ----
        a_raw = tl.load(a_ptrs).to(tl.float32)
        # Scalar max over the tile equals the per-token max because all M
        # rows alias the same row; 448 is the float8_e4m3fn max.
        a_s = tl.max(tl.abs(a_raw)) / 448.0
        # 1e-12 floor avoids NaNs when an entire K-block is zero.
        a = (a_raw / tl.maximum(a_s, 1e-12)).to(tl.float8e4nv)
        # ---- matmul ----
        b = tl.load(b_ptrs)
        b_s = tl.load(bs_ptrs + k * stride_bs_k)
        accumulator += tl.dot(a, b) * a_s * b_s[None, :]
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    # Emit in the output dtype; accumulation stays in fp32 throughout.
    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    offs_cm = tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    # Row stride 0 again: all M rows write identical values to the one output
    # row, which is a benign duplicate store.
    c_ptrs = C + offs_cm[:, None] * 0 + stride_cn * offs_cn[None, :]
    tl.store(c_ptrs, c)
107
+
108
+
109
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=w, num_stages=s)
        for w in [2, 4, 8, 16]
        for s in [2, 3, 4, 5]
    ],
    key=["N", "K"],
)
@triton.jit
def w8a8_tensor_fp8_matmul_batched_kernel(
    A,  # (S, K) pre-quantized FP8 activations
    B,  # (E, N, K) FP8 weight matrices
    C,  # (S, N) output
    As,  # (S, 1) per-tensor activation scales
    Bs,  # (E, 1, 1) per-tensor weight scales
    ExpertIds,  # (S,) — which expert each batch element routes to
    # Shape
    S,
    N,
    K,
    # Strides
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_as_m,
    stride_bs_e,
    # Meta-parameters
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
):
    """Tensor-scale batched FP8 expert matmul kernel.

    Activations are already quantized; the kernel applies per-token activation
    scales and per-expert tensor weight scales.
    """
    batch_id = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)

    # int64 cast prevents int32 overflow in expert_id * stride_be for large
    # expert counts / weight slabs.
    expert_id = tl.load(ExpertIds + batch_id).to(tl.int64)

    # Rebase pointers onto this token's row and its expert's weight slab.
    A = A + batch_id * stride_am
    B = B + expert_id * stride_be
    C = C + batch_id * stride_cm
    Bs = Bs + expert_id * stride_bs_e

    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # Row stride 0 broadcasts the single token row across the M tile so
    # tl.dot has an M dimension to work with.
    a_ptrs = A + tl.arange(0, BLOCK_SIZE_M)[:, None] * 0 + offs_k[None, :] * stride_ak
    b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)

    # Scales are constant over the whole K loop; load them once up front.
    b_s = tl.load(Bs)
    a_s = tl.load(As + batch_id * stride_as_m)

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for _ in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        a = tl.load(a_ptrs)
        b = tl.load(b_ptrs)
        accumulator += tl.dot(a, b)
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    # Single rescale after the loop — valid because both scales are
    # per-tensor (constant across all K blocks).
    accumulator = accumulator * a_s * b_s

    # Emit in the output dtype; accumulation stays in fp32 throughout.
    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    offs_cm = tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    # All M rows write the same values to the one output row (stride 0) —
    # benign duplicate store.
    c_ptrs = C + offs_cm[:, None] * 0 + stride_cn * offs_cn[None, :]
    tl.store(c_ptrs, c)
+
186
+
187
+ @triton_op("finegrained_fp8::w8a8_block_fp8_matmul_batched", mutates_args=())
188
+ def _w8a8_block_fp8_matmul_batched(
189
+ A: torch.Tensor,
190
+ B: torch.Tensor,
191
+ Bs: torch.Tensor,
192
+ expert_ids: torch.Tensor,
193
+ block_size: list[int],
194
+ ) -> torch.Tensor:
195
+ """Block-scale batched FP8 matmul: C[s] = A[s] @ B[expert_ids[s]].T, with fused act quant.
196
+
197
+ A: (S, K) raw bf16/fp16 activations
198
+ B: (E, N, K) FP8 expert weights
199
+ Bs: (E, N // block_n, K // block_k) per-block weight scales
200
+ """
201
+ assert A.ndim == 2, f"A must be 2D (S, K), got ndim={A.ndim}"
202
+ assert A.is_contiguous(), "A must be contiguous"
203
+ assert B.ndim == 3, f"B must be 3D (E, N, K), got ndim={B.ndim}"
204
+ assert B.is_contiguous(), "B must be contiguous"
205
+ assert A.shape[1] == B.shape[2], (
206
+ f"K mismatch: A has K={A.shape[1]}, B has K={B.shape[2]}"
207
+ )
208
+
209
+ S, K = A.shape
210
+ E, N, _ = B.shape
211
+
212
+ assert len(block_size) == 2, (
213
+ f"block_size must be [block_n, block_k], got {block_size}"
214
+ )
215
+ block_n, block_k = block_size[0], block_size[1]
216
+ # MoE expert dimensions must be block-aligned; non-aligned N/K is not supported.
217
+ assert N % block_n == 0, f"N ({N}) must be divisible by block_n ({block_n})"
218
+ assert K % block_k == 0, f"K ({K}) must be divisible by block_k ({block_k})"
219
+ assert Bs.ndim == 3, (
220
+ f"Bs must be 3D (E, N//block_n, K//block_k), got ndim={Bs.ndim}"
221
+ )
222
+ assert Bs.shape == (E, N // block_n, K // block_k), (
223
+ f"Bs shape {tuple(Bs.shape)} != expected ({E}, {N // block_n}, {K // block_k})"
224
+ )
225
+
226
+ C = A.new_empty(S, N)
227
+ # Adaptive BLOCK_SIZE_M: smallest power-of-2 >= M, floored at 16, capped at 128.
228
+ # Matches the WGMMA tile to the actual row count — smaller tiles use less
229
+ # register pressure and a better-matched FP8 WGMMA instruction, improving
230
+ # both accuracy and performance for small M (decode).
231
+ BLOCK_SIZE_M = min(max(triton.next_power_of_2((S + E - 1) // E), 16), 128)
232
+ grid = (S, triton.cdiv(N, block_n))
233
+ with device_context(A.device):
234
+ wrap_triton(w8a8_block_fp8_matmul_batched_kernel)[grid](
235
+ A,
236
+ B,
237
+ C,
238
+ Bs,
239
+ expert_ids,
240
+ S,
241
+ N,
242
+ K,
243
+ A.stride(0),
244
+ A.stride(1),
245
+ B.stride(0),
246
+ B.stride(2),
247
+ B.stride(1),
248
+ C.stride(0),
249
+ C.stride(1),
250
+ Bs.stride(0),
251
+ Bs.stride(2),
252
+ Bs.stride(1),
253
+ BLOCK_SIZE_N=block_n,
254
+ BLOCK_SIZE_K=block_k,
255
+ BLOCK_SIZE_M=BLOCK_SIZE_M,
256
+ )
257
+
258
+ return C
259
+
260
+
261
+ @triton_op("finegrained_fp8::w8a8_tensor_fp8_matmul_batched", mutates_args=())
262
+ def _w8a8_tensor_fp8_matmul_batched(
263
+ A: torch.Tensor,
264
+ B: torch.Tensor,
265
+ Bs: torch.Tensor,
266
+ expert_ids: torch.Tensor,
267
+ ) -> torch.Tensor:
268
+ """Tensor-scale batched FP8 matmul: C[s] = A[s] @ B[expert_ids[s]].T, with fused act quant.
269
+
270
+ A: (S, K) raw bf16/fp16 activations
271
+ B: (E, N, K) FP8 expert weights
272
+ Bs: (E,) or (E, 1, 1) per-expert weight scales
273
+ """
274
+ assert A.ndim == 2, f"A must be 2D (S, K), got ndim={A.ndim}"
275
+ assert A.is_contiguous(), "A must be contiguous"
276
+ assert B.ndim == 3, f"B must be 3D (E, N, K), got ndim={B.ndim}"
277
+ assert B.is_contiguous(), "B must be contiguous"
278
+ assert A.shape[1] == B.shape[2], (
279
+ f"K mismatch: A has K={A.shape[1]}, B has K={B.shape[2]}"
280
+ )
281
+
282
+ S, K = A.shape
283
+ E, N, _ = B.shape
284
+
285
+ # Normalize Bs to (E, 1, 1)
286
+ if Bs.ndim == 1:
287
+ assert Bs.shape[0] == E, f"Bs shape {tuple(Bs.shape)} != expected ({E},)"
288
+ Bs = Bs.reshape(E, 1, 1)
289
+ else:
290
+ assert Bs.shape == (E, 1, 1), (
291
+ f"Bs shape {tuple(Bs.shape)} != expected ({E}, 1, 1)"
292
+ )
293
+
294
+ BLOCK_SIZE_N = 128
295
+ BLOCK_SIZE_K = 128
296
+ C = A.new_empty(S, N)
297
+ qA, As = fp8_act_quant(A, K)
298
+ grid = (S, triton.cdiv(N, BLOCK_SIZE_N))
299
+ # Adaptive BLOCK_SIZE_M: smallest power-of-2 >= M, floored at 16, capped at 128.
300
+ # Matches the WGMMA tile to the actual row count — smaller tiles use less
301
+ # register pressure and a better-matched FP8 WGMMA instruction, improving
302
+ # both accuracy and performance for small M (decode).
303
+ BLOCK_SIZE_M = min(max(triton.next_power_of_2((S + E - 1) // E), 16), 128)
304
+ grid = (S, triton.cdiv(N, BLOCK_SIZE_N))
305
+ with device_context(A.device):
306
+ wrap_triton(w8a8_tensor_fp8_matmul_batched_kernel)[grid](
307
+ qA,
308
+ B,
309
+ C,
310
+ As,
311
+ Bs,
312
+ expert_ids,
313
+ S,
314
+ N,
315
+ K,
316
+ qA.stride(0),
317
+ qA.stride(1),
318
+ B.stride(0),
319
+ B.stride(2),
320
+ B.stride(1),
321
+ C.stride(0),
322
+ C.stride(1),
323
+ As.stride(0),
324
+ Bs.stride(0),
325
+ BLOCK_SIZE_N=BLOCK_SIZE_N,
326
+ BLOCK_SIZE_K=BLOCK_SIZE_K,
327
+ BLOCK_SIZE_M=BLOCK_SIZE_M,
328
+ )
329
+
330
+ return C
331
+
332
+
333
def w8a8_block_fp8_matmul_batched(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    expert_ids: torch.Tensor,
    block_size: list[int],
) -> torch.Tensor:
    """Batched FP8 matmul with per-block weight scales and fused act quant.

    Args:
        A: (S, K) raw activations, bf16/fp16/fp32.
        B: (E, N, K) FP8 expert weights.
        Bs: (E, N // block_n, K // block_k) per-block weight scales.
        expert_ids: (S,) expert index for each row of ``A``.
        block_size: ``[block_n, block_k]`` weight-quantization block shape.

    Returns:
        (S, N) output tensor in ``A``'s dtype.
    """
    op = torch.ops.finegrained_fp8.w8a8_block_fp8_matmul_batched
    return op(A, B, Bs, expert_ids, block_size)
349
+
350
+
351
def w8a8_tensor_fp8_matmul_batched(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    expert_ids: torch.Tensor,
) -> torch.Tensor:
    """Batched FP8 matmul with per-expert tensor scales; activations are
    quantized per token on the fly.

    Args:
        A: (S, K) raw activations, bf16/fp16/fp32.
        B: (E, N, K) FP8 expert weights.
        Bs: (E,) or (E, 1, 1) per-expert weight scales.
        expert_ids: (S,) expert index for each row of ``A``.

    Returns:
        (S, N) output tensor in ``A``'s dtype.
    """
    op = torch.ops.finegrained_fp8.w8a8_tensor_fp8_matmul_batched
    return op(A, B, Bs, expert_ids)
366
+
367
+
368
def w8a8_fp8_matmul_batched(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    expert_ids: torch.Tensor,
    block_size: list[int] | None,
) -> torch.Tensor:
    """Unified batched W8A8 FP8 matmul dispatcher.

    Dispatch rules:
      - tensor mode when ``block_size is None``
      - tensor mode when ``block_size == [N, K]`` (one block = whole weight)
      - otherwise block mode

    Returns:
        Output tensor ``[S, N]`` in the same dtype as ``A``.
    """
    if block_size is None:
        return w8a8_tensor_fp8_matmul_batched(A, B, Bs, expert_ids)

    covers_whole_weight = (
        block_size[0] == B.size(1) and block_size[1] == B.size(2)
    )
    if covers_whole_weight:
        return w8a8_tensor_fp8_matmul_batched(A, B, Bs, expert_ids)

    return w8a8_block_fp8_matmul_batched(A, B, Bs, expert_ids, block_size)
build/torch-cuda/finegrained_fp8/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
+ def _import_from_path(file_path: Path) -> ModuleType:
9
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
10
+ # it would also be used for other imports. So, we make a module name that
11
+ # depends on the path for it to be unique using the hex-encoded hash of
12
+ # the path.
13
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
+ module_name = path_hash
15
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
16
+ if spec is None:
17
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
+ module = importlib.util.module_from_spec(spec)
19
+ if module is None:
20
+ raise ImportError(f"Cannot load module {module_name} from spec")
21
+ sys.modules[module_name] = module
22
+ spec.loader.exec_module(module) # type: ignore
23
+ return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch-cuda/grouped.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import triton
17
+ import triton.language as tl
18
+ from .act_quant import fp8_act_quant
19
+ from torch.library import triton_op, wrap_triton
20
+
21
+ from .utils import device_context
22
+
23
+
24
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=w, num_stages=s)
        for w in [2, 4, 8, 16]
        for s in [2, 3, 4, 5]
    ],
    key=["N", "K", "BLOCK_SIZE_M"],
)
@triton.jit
def w8a8_block_fp8_matmul_grouped_kernel(
    A,  # (S, K) raw BF16/FP16 activations, sorted/grouped by expert id
    B,  # (E, N, K) FP8 weight matrices
    C,  # (S, N) output
    Bs,  # (E, N // BLOCK_SIZE_N, K // BLOCK_SIZE_K) weight scales
    Offsets,  # (E,) int32 — cumulative row-end per expert
    TileOffsets,  # (E,) int32 — cumulative tile-end per expert
    # Shape
    S,
    N,
    K,
    # Strides
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_bs_e,
    stride_bs_k,
    stride_bs_n,
    # Meta-parameters
    NUM_EXPERTS: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
    NUM_EXPERTS_BIT_LENGTH: tl.constexpr,
):
    """Block-scale grouped FP8 expert matmul kernel.

    Tokens are assumed sorted by expert. The kernel maps each M-tile to its
    owning expert via ``TileOffsets`` and applies fused activation quantization.
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)

    # Exit early for programs beyond the actual tile count.
    total_tiles = tl.load(TileOffsets + NUM_EXPERTS - 1)
    if pid_m >= total_tiles:
        return

    # Binary search in TileOffsets to find the owning expert.
    # Finds the smallest e such that TileOffsets[e] > pid_m (upper_bound semantics),
    # which is the expert whose tile range contains pid_m.
    # O(log2(NUM_EXPERTS)) loads instead of the O(NUM_EXPERTS) linear scan.
    # NUM_EXPERTS_BIT_LENGTH is ceil(log2(E))+1 for powers-of-two, giving one
    # harmless extra iteration when lo==hi; it's a compile-time constant so the
    # loop is fully unrolled by the compiler.
    lo = 0
    hi = NUM_EXPERTS
    for _ in tl.static_range(NUM_EXPERTS_BIT_LENGTH):
        mid = (lo + hi) >> 1
        mid_val = tl.load(TileOffsets + mid)
        is_left = mid_val <= pid_m
        lo = tl.where(is_left, mid + 1, lo)
        hi = tl.where(is_left, hi, mid)

    # Cast expert_id to int64 to prevent int32 overflow when computing
    # expert_id * stride_be (e.g. 255 * 9_437_184 > 2^31 for 256 experts of
    # 3072×3072 FP8 weights).
    expert_id = lo.to(tl.int64)

    # Row range [expert_start, expert_end) owned by this expert; Offsets
    # holds cumulative row ends, so expert 0 starts at row 0.
    prev_eid = tl.maximum(expert_id - 1, 0)
    expert_start = tl.where(expert_id == 0, 0, tl.load(Offsets + prev_eid))
    expert_end = tl.load(Offsets + expert_id)
    M_expert = expert_end - expert_start

    # Position of this program's M-tile within the expert's own rows.
    expert_tile_start = tl.where(expert_id == 0, 0, tl.load(TileOffsets + prev_eid))
    local_tile = pid_m - expert_tile_start
    m_off = local_tile * BLOCK_SIZE_M

    offs_am = m_off + tl.arange(0, BLOCK_SIZE_M)
    row_mask = offs_am < M_expert  # an expert's last tile may be ragged
    offs_global_m = expert_start + offs_am

    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    a_ptrs = A + offs_global_m[:, None] * stride_am + offs_k[None, :] * stride_ak
    b_ptrs = (
        B
        + expert_id * stride_be
        + offs_k[:, None] * stride_bk
        + offs_bn[None, :] * stride_bn
    )
    bs_ptrs = Bs + expert_id * stride_bs_e + pid_n * stride_bs_n

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # ---- fused fp8_act_quant ----
        a_raw = tl.load(a_ptrs, mask=row_mask[:, None], other=0.0).to(tl.float32)
        # Per-row dynamic scale; 448 is the float8_e4m3fn max. The 1e-12
        # floor avoids NaNs for all-zero (including masked-out) rows.
        a_s = tl.max(tl.abs(a_raw), axis=1) / 448.0
        a = (a_raw / tl.maximum(a_s[:, None], 1e-12)).to(tl.float8e4nv)
        # ---- matmul ----
        # b is loaded unmasked: the host asserts N and K are multiples of
        # the block sizes.
        b = tl.load(b_ptrs)
        b_s = tl.load(bs_ptrs + k * stride_bs_k)
        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    # Emit in the output dtype; accumulation stays in fp32 throughout.
    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    c_ptrs = C + stride_cm * offs_global_m[:, None] + stride_cn * offs_bn[None, :]
    c_mask = row_mask[:, None]
    tl.store(c_ptrs, c, mask=c_mask)
+
145
+
146
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=w, num_stages=s)
        for w in [2, 4, 8, 16]
        for s in [2, 3, 4, 5]
    ],
    key=["N", "K", "BLOCK_SIZE_M"],
)
@triton.jit
def w8a8_tensor_fp8_matmul_grouped_kernel(
    A,  # (S, K) pre-quantized FP8 activations, sorted/grouped by expert id
    B,  # (E, N, K) FP8 weight matrices
    C,  # (S, N) output
    As,  # (S, 1) activation scales
    Bs,  # (E, 1, 1) per-tensor weight scales
    Offsets,  # (E,) cumulative row-end per expert
    TileOffsets,  # (E,) cumulative tile-end per expert
    # Shape
    S,
    N,
    K,
    # Strides
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_as_m,
    stride_bs_e,
    # Meta-parameters
    NUM_EXPERTS: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
    NUM_EXPERTS_BIT_LENGTH: tl.constexpr,
):
    """Tensor-scale grouped FP8 expert matmul kernel.

    Uses grouped expert scheduling with pre-quantized activations plus
    per-token activation scales and per-expert tensor weight scales.
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)

    # Exit early for programs beyond the actual tile count.
    total_tiles = tl.load(TileOffsets + NUM_EXPERTS - 1)
    if pid_m >= total_tiles:
        return

    # Binary search (upper bound) in TileOffsets for the smallest expert e
    # with TileOffsets[e] > pid_m — the expert whose tile range holds pid_m.
    # NUM_EXPERTS_BIT_LENGTH iterations are a compile-time constant, so the
    # loop unrolls fully.
    lo = 0
    hi = NUM_EXPERTS
    for _ in tl.static_range(NUM_EXPERTS_BIT_LENGTH):
        mid = (lo + hi) >> 1
        mid_val = tl.load(TileOffsets + mid)
        is_left = mid_val <= pid_m
        lo = tl.where(is_left, mid + 1, lo)
        hi = tl.where(is_left, hi, mid)
    # int64 cast prevents int32 overflow in expert_id * stride_be.
    expert_id = lo.to(tl.int64)

    # Row range [expert_start, expert_end) owned by this expert.
    prev_eid = tl.maximum(expert_id - 1, 0)
    expert_start = tl.where(expert_id == 0, 0, tl.load(Offsets + prev_eid))
    expert_end = tl.load(Offsets + expert_id)
    M_expert = expert_end - expert_start

    # Position of this program's M-tile within the expert's rows.
    expert_tile_start = tl.where(expert_id == 0, 0, tl.load(TileOffsets + prev_eid))
    local_tile = pid_m - expert_tile_start
    m_off = local_tile * BLOCK_SIZE_M

    offs_am = m_off + tl.arange(0, BLOCK_SIZE_M)
    row_mask = offs_am < M_expert  # an expert's last tile may be ragged
    offs_global_m = expert_start + offs_am

    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    a_ptrs = A + offs_global_m[:, None] * stride_am + offs_k[None, :] * stride_ak
    b_ptrs = (
        B
        + expert_id * stride_be
        + offs_k[:, None] * stride_bk
        + offs_bn[None, :] * stride_bn
    )

    # Scales are constant across the K loop; load once up front.
    a_s = tl.load(As + offs_global_m * stride_as_m, mask=row_mask, other=0.0)
    b_s = tl.load(Bs + expert_id * stride_bs_e)

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for _ in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        a = tl.load(a_ptrs, mask=row_mask[:, None], other=0.0)
        b = tl.load(b_ptrs)

        accumulator += tl.dot(a, b)
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    # Single rescale after the loop — valid because both scales are constant
    # for a given row/expert.
    accumulator = accumulator * a_s[:, None] * b_s

    # Emit in the output dtype; accumulation stays in fp32 throughout.
    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    c_ptrs = C + stride_cm * offs_global_m[:, None] + stride_cn * offs_bn[None, :]
    c_mask = row_mask[:, None]
    tl.store(c_ptrs, c, mask=c_mask)
251
+
252
+
253
@triton_op("finegrained_fp8::w8a8_block_fp8_matmul_grouped", mutates_args=())
def _w8a8_block_fp8_matmul_grouped(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    offsets: torch.Tensor,
    tokens_per_expert: torch.Tensor,
    block_size: list[int],
) -> torch.Tensor:
    """Block-scale grouped FP8 matmul: C = A @ B.T per expert, with fused act quant.

    A: (S, K) raw bf16/fp16 activations, sorted by expert
    B: (E, N, K) FP8 expert weights
    Bs: (E, N // block_n, K // block_k) per-block weight scales
    offsets: cumulative row-end per expert (consumed by the kernel as (E,))
    tokens_per_expert: integer row count per expert — presumably shape (E,)
        on the same device; TODO confirm against callers
    """
    assert A.ndim == 2, f"A must be 2D (S, K), got ndim={A.ndim}"
    assert A.is_contiguous(), "A must be contiguous"
    assert B.ndim == 3, f"B must be 3D (E, N, K), got ndim={B.ndim}"
    assert B.is_contiguous(), "B must be contiguous"
    assert A.shape[1] == B.shape[2], (
        f"K mismatch: A has K={A.shape[1]}, B has K={B.shape[2]}"
    )

    S, K = A.shape
    E, N, _ = B.shape

    assert len(block_size) == 2, (
        f"block_size must be [block_n, block_k], got {block_size}"
    )
    block_n, block_k = block_size[0], block_size[1]
    # MoE expert dimensions must be block-aligned; non-aligned N/K is not supported.
    assert N % block_n == 0, f"N ({N}) must be divisible by block_n ({block_n})"
    assert K % block_k == 0, f"K ({K}) must be divisible by block_k ({block_k})"
    assert Bs.ndim == 3, (
        f"Bs must be 3D (E, N//block_n, K//block_k), got ndim={Bs.ndim}"
    )
    assert Bs.shape == (E, N // block_n, K // block_k), (
        f"Bs shape {tuple(Bs.shape)} != expected ({E}, {N // block_n}, {K // block_k})"
    )

    C = A.new_empty(S, N)
    # Adaptive BLOCK_SIZE_M: match tile to average tokens per expert.
    BLOCK_SIZE_M = min(max(triton.next_power_of_2((S + E - 1) // E), 16), 128)
    # Per-expert tile counts and their cumulative ends, computed on device
    # (no host synchronization).
    tiles_per_expert = (tokens_per_expert + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
    tile_offsets = torch.cumsum(tiles_per_expert, dim=0).to(torch.int32)
    # Upper bound on M-tiles: sum_e ceil(M_e / BLOCK_M) <= ceil(S / BLOCK_M) + E.
    # Programs beyond the real tile count exit immediately via the early-return
    # guard inside the kernel. This is faster than syncing for the exact count
    # and keeps the grid size data-independent (cuda-graph / torch.compile safe).
    max_M_tiles = triton.cdiv(S, BLOCK_SIZE_M) + E
    grid = (max_M_tiles, triton.cdiv(N, block_n))
    with device_context(A.device):
        wrap_triton(w8a8_block_fp8_matmul_grouped_kernel)[grid](
            A,
            B,
            C,
            Bs,
            offsets,
            tile_offsets,
            S,
            N,
            K,
            A.stride(0),
            A.stride(1),
            B.stride(0),
            # B is (E, N, K) but the kernel indexes (K, N): pass strides in
            # (k, n) order so B is consumed transposed.
            B.stride(2),
            B.stride(1),
            C.stride(0),
            C.stride(1),
            Bs.stride(0),
            # Same (k, n) swap for the per-block weight scales.
            Bs.stride(2),
            Bs.stride(1),
            # Meta-parameters
            NUM_EXPERTS=E,
            BLOCK_SIZE_N=block_n,
            BLOCK_SIZE_K=block_k,
            BLOCK_SIZE_M=BLOCK_SIZE_M,
            NUM_EXPERTS_BIT_LENGTH=E.bit_length(),
        )

    return C
334
+
335
+
336
+ @triton_op("finegrained_fp8::w8a8_tensor_fp8_matmul_grouped", mutates_args=())
337
+ def _w8a8_tensor_fp8_matmul_grouped(
338
+ A: torch.Tensor,
339
+ B: torch.Tensor,
340
+ Bs: torch.Tensor,
341
+ offsets: torch.Tensor,
342
+ tokens_per_expert: torch.Tensor,
343
+ ) -> torch.Tensor:
344
+ """Tensor-scale grouped FP8 matmul: C = A @ B.T per expert, with fused act quant.
345
+
346
+ A: (S, K) raw bf16/fp16 activations, sorted by expert
347
+ B: (E, N, K) FP8 expert weights
348
+ Bs: (E,) or (E, 1, 1) per-expert weight scales
349
+ """
350
+ assert A.ndim == 2, f"A must be 2D (S, K), got ndim={A.ndim}"
351
+ assert A.is_contiguous(), "A must be contiguous"
352
+ assert B.ndim == 3, f"B must be 3D (E, N, K), got ndim={B.ndim}"
353
+ assert B.is_contiguous(), "B must be contiguous"
354
+ assert A.shape[1] == B.shape[2], (
355
+ f"K mismatch: A has K={A.shape[1]}, B has K={B.shape[2]}"
356
+ )
357
+
358
+ S, K = A.shape
359
+ E, N, _ = B.shape
360
+
361
+ # Normalize Bs to (E, 1, 1)
362
+ if Bs.ndim == 1:
363
+ assert Bs.shape[0] == E, f"Bs shape {tuple(Bs.shape)} != expected ({E},)"
364
+ Bs = Bs.reshape(E, 1, 1)
365
+ else:
366
+ assert Bs.shape == (E, 1, 1), (
367
+ f"Bs shape {tuple(Bs.shape)} != expected ({E}, 1, 1)"
368
+ )
369
+
370
+ BLOCK_SIZE_N = 128
371
+ BLOCK_SIZE_K = 128
372
+ C = A.new_empty(S, N)
373
+ qA, As = fp8_act_quant(A, K)
374
+ BLOCK_SIZE_M = min(max(triton.next_power_of_2((S + E - 1) // E), 16), 128)
375
+ tiles_per_expert = (tokens_per_expert + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
376
+ tile_offsets = torch.cumsum(tiles_per_expert, dim=0).to(torch.int32)
377
+ # Upper bound on M-tiles: sum_e ceil(M_e / BLOCK_M) <= ceil(S / BLOCK_M) + E.
378
+ # Programs beyond the real tile count exit immediately via the early-return
379
+ # guard inside the kernel. This is faster than syncing for the exact count
380
+ # and keeps the grid size data-independent (cuda-graph / torch.compile safe).
381
+ max_M_tiles = triton.cdiv(S, BLOCK_SIZE_M) + E
382
+ grid = (max_M_tiles, triton.cdiv(N, BLOCK_SIZE_N))
383
+ with device_context(A.device):
384
+ wrap_triton(w8a8_tensor_fp8_matmul_grouped_kernel)[grid](
385
+ qA,
386
+ B,
387
+ C,
388
+ As,
389
+ Bs,
390
+ offsets,
391
+ tile_offsets,
392
+ S,
393
+ N,
394
+ K,
395
+ qA.stride(0),
396
+ qA.stride(1),
397
+ B.stride(0),
398
+ B.stride(2),
399
+ B.stride(1),
400
+ C.stride(0),
401
+ C.stride(1),
402
+ As.stride(0),
403
+ Bs.stride(0),
404
+ # Meta-parameters
405
+ NUM_EXPERTS=E,
406
+ BLOCK_SIZE_N=BLOCK_SIZE_N,
407
+ BLOCK_SIZE_K=BLOCK_SIZE_K,
408
+ BLOCK_SIZE_M=BLOCK_SIZE_M,
409
+ NUM_EXPERTS_BIT_LENGTH=E.bit_length(),
410
+ )
411
+
412
+ return C
413
+
414
+
415
+ def w8a8_block_fp8_matmul_grouped(
416
+ A: torch.Tensor,
417
+ B: torch.Tensor,
418
+ Bs: torch.Tensor,
419
+ offsets: torch.Tensor,
420
+ tokens_per_expert: torch.Tensor,
421
+ block_size: list[int],
422
+ ) -> torch.Tensor:
423
+ """Block-scale grouped FP8 matmul with fused activation quantization.
424
+
425
+ A: (S, K) raw activations sorted by expert, bf16/fp16/fp32
426
+ B: (E, N, K) FP8 expert weights
427
+ Bs: (E, N // block_n, K // block_k) per-block weight scales
428
+ """
429
+ return torch.ops.finegrained_fp8.w8a8_block_fp8_matmul_grouped(
430
+ A, B, Bs, offsets, tokens_per_expert, block_size
431
+ )
432
+
433
+
434
+ def w8a8_tensor_fp8_matmul_grouped(
435
+ A: torch.Tensor,
436
+ B: torch.Tensor,
437
+ Bs: torch.Tensor,
438
+ offsets: torch.Tensor,
439
+ tokens_per_expert: torch.Tensor,
440
+ ) -> torch.Tensor:
441
+ """Tensor-scale grouped FP8 matmul with fused activation quantization.
442
+
443
+ A: (S, K) raw activations sorted by expert, bf16/fp16/fp32
444
+ B: (E, N, K) FP8 expert weights
445
+ Bs: (E,) or (E, 1, 1) per-expert weight scales
446
+ """
447
+ return torch.ops.finegrained_fp8.w8a8_tensor_fp8_matmul_grouped(
448
+ A, B, Bs, offsets, tokens_per_expert
449
+ )
450
+
451
+
452
+ def w8a8_fp8_matmul_grouped(
453
+ A: torch.Tensor,
454
+ B: torch.Tensor,
455
+ Bs: torch.Tensor,
456
+ offsets: torch.Tensor,
457
+ tokens_per_expert: torch.Tensor,
458
+ block_size: list[int] | None,
459
+ ) -> torch.Tensor:
460
+ """Unified grouped W8A8 FP8 matmul dispatcher.
461
+
462
+ Dispatch rules:
463
+ - tensor mode when ``block_size is None``
464
+ - tensor mode when ``block_size == [N, K]``
465
+ - otherwise block mode
466
+
467
+ Returns:
468
+ Output tensor ``[S, N]`` in the same dtype as ``A``, in expert-sorted order.
469
+ """
470
+ if block_size is None or (
471
+ block_size[0] == B.size(1) and block_size[1] == B.size(2)
472
+ ):
473
+ return w8a8_tensor_fp8_matmul_grouped(A, B, Bs, offsets, tokens_per_expert)
474
+
475
+ return w8a8_block_fp8_matmul_grouped(
476
+ A, B, Bs, offsets, tokens_per_expert, block_size
477
+ )
build/torch-cuda/matmul.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import triton
17
+ import triton.language as tl
18
+ from torch.library import triton_op, wrap_triton
19
+
20
+ from .utils import device_context
21
+
22
+
23
+ # Adapted from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/layers/quantization/fp8_kernel.py
24
+ @triton.autotune(
25
+ configs=[
26
+ triton.Config({}, num_warps=w, num_stages=s)
27
+ for w in [2, 4, 8, 16]
28
+ for s in [2, 3, 4]
29
+ ],
30
+ key=["N", "K", "BLOCK_SIZE_M"],
31
+ )
32
+ @triton.jit
33
+ def w8a8_block_fp8_matmul_kernel(
34
+ # Pointers to inputs and output
35
+ A,
36
+ B,
37
+ C,
38
+ As,
39
+ Bs,
40
+ # Shape for matmul
41
+ M,
42
+ N,
43
+ K,
44
+ stride_am,
45
+ stride_ak,
46
+ stride_bk,
47
+ stride_bn,
48
+ stride_cm,
49
+ stride_cn,
50
+ stride_as_m,
51
+ stride_as_k,
52
+ stride_bs_k,
53
+ stride_bs_n,
54
+ # Meta-parameters
55
+ BLOCK_SIZE_M: tl.constexpr,
56
+ BLOCK_SIZE_N: tl.constexpr,
57
+ BLOCK_SIZE_K: tl.constexpr,
58
+ GROUP_SIZE_M: tl.constexpr,
59
+ ):
60
+ """Block-scale FP8 GEMM kernel.
61
+
62
+ Computes ``C = A @ B.T`` with block-wise activation/weight scales.
63
+ Uses a 2D grid with swizzle for L2 cache locality on B tiles.
64
+ """
65
+ pid_m = tl.program_id(axis=0)
66
+ pid_n = tl.program_id(axis=1)
67
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
68
+ num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
69
+ pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M)
70
+
71
+ offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
72
+ offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
73
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
74
+ a_ptrs = A + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
75
+ b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
76
+
77
+ as_ptrs = As + offs_am * stride_as_m
78
+ offs_bsn = offs_bn // BLOCK_SIZE_N
79
+ bs_ptrs = Bs + offs_bsn * stride_bs_n
80
+
81
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
82
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
83
+ k_remaining = K - k * BLOCK_SIZE_K
84
+ a = tl.load(a_ptrs, mask=offs_k[None, :] < k_remaining, other=0.0)
85
+ b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0)
86
+
87
+ a_s = tl.load(as_ptrs + k * stride_as_k)
88
+ b_s = tl.load(bs_ptrs + k * stride_bs_k)
89
+
90
+ accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
91
+ a_ptrs += BLOCK_SIZE_K * stride_ak
92
+ b_ptrs += BLOCK_SIZE_K * stride_bk
93
+
94
+ if C.dtype.element_ty == tl.bfloat16:
95
+ c = accumulator.to(tl.bfloat16)
96
+ elif C.dtype.element_ty == tl.float16:
97
+ c = accumulator.to(tl.float16)
98
+ else:
99
+ c = accumulator.to(tl.float32)
100
+
101
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
102
+ offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
103
+ c_ptrs = C + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
104
+ c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
105
+ tl.store(c_ptrs, c, mask=c_mask)
106
+
107
+
108
+ @triton.autotune(
109
+ configs=[
110
+ triton.Config({}, num_warps=w, num_stages=s)
111
+ for w in [2, 4, 8, 16]
112
+ for s in [2, 3, 4]
113
+ ],
114
+ key=["N", "K", "BLOCK_SIZE_M"],
115
+ )
116
+ @triton.jit
117
+ def w8a8_tensor_fp8_matmul_kernel(
118
+ A,
119
+ B,
120
+ C,
121
+ As,
122
+ Bs,
123
+ M,
124
+ N,
125
+ K,
126
+ stride_am,
127
+ stride_ak,
128
+ stride_bk,
129
+ stride_bn,
130
+ stride_cm,
131
+ stride_cn,
132
+ stride_as_m,
133
+ BLOCK_SIZE_M: tl.constexpr,
134
+ BLOCK_SIZE_N: tl.constexpr,
135
+ BLOCK_SIZE_K: tl.constexpr,
136
+ GROUP_SIZE_M: tl.constexpr,
137
+ ):
138
+ """Tensor-scale FP8 GEMM kernel.
139
+
140
+ Computes ``C = A @ B.T`` with one activation scale per row and one
141
+ weight scale for the full matrix.
142
+ Uses a 2D grid with swizzle for L2 cache locality on B tiles.
143
+ """
144
+ pid_m = tl.program_id(axis=0)
145
+ pid_n = tl.program_id(axis=1)
146
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
147
+ num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
148
+ pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M)
149
+
150
+ offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
151
+ offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
152
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
153
+
154
+ a_ptrs = A + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
155
+ b_ptrs = B + offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn
156
+
157
+ a_s = tl.load(As + offs_am * stride_as_m)
158
+ b_s = tl.load(Bs)
159
+
160
+ # Accumulate raw dot products, apply scales once after the loop.
161
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
162
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
163
+ k_remaining = K - k * BLOCK_SIZE_K
164
+ a = tl.load(a_ptrs, mask=offs_k[None, :] < k_remaining, other=0.0)
165
+ b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0)
166
+ accumulator += tl.dot(a, b)
167
+ a_ptrs += BLOCK_SIZE_K * stride_ak
168
+ b_ptrs += BLOCK_SIZE_K * stride_bk
169
+
170
+ accumulator = accumulator * a_s[:, None] * b_s
171
+
172
+ if C.dtype.element_ty == tl.bfloat16:
173
+ c = accumulator.to(tl.bfloat16)
174
+ elif C.dtype.element_ty == tl.float16:
175
+ c = accumulator.to(tl.float16)
176
+ else:
177
+ c = accumulator.to(tl.float32)
178
+
179
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
180
+ offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
181
+ c_ptrs = C + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
182
+ c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
183
+ tl.store(c_ptrs, c, mask=c_mask)
184
+
185
+
186
+ @triton_op("finegrained_fp8::w8a8_block_fp8_matmul", mutates_args=())
187
+ def _w8a8_block_fp8_matmul(
188
+ A: torch.Tensor,
189
+ B: torch.Tensor,
190
+ As: torch.Tensor,
191
+ Bs: torch.Tensor,
192
+ block_size: list[int],
193
+ output_dtype: torch.dtype = torch.float32,
194
+ ) -> torch.Tensor:
195
+ """Block-scale FP8 matmul: C = A @ B.T with per-block scales.
196
+
197
+ As: (M, K // block_k) — per-token-group activation scales
198
+ Bs: (N // block_n, K // block_k) — per-block weight scales
199
+ """
200
+ assert len(block_size) == 2, (
201
+ f"block_size must be [block_n, block_k], got {block_size}"
202
+ )
203
+ block_n, block_k = block_size[0], block_size[1]
204
+
205
+ assert A.shape[-1] == B.shape[-1], (
206
+ f"K mismatch: A has K={A.shape[-1]}, B has K={B.shape[-1]}"
207
+ )
208
+ assert A.is_contiguous(), "A must be contiguous"
209
+ assert B.ndim == 2, f"B must be 2D (N, K), got ndim={B.ndim}"
210
+ assert B.is_contiguous(), "B must be contiguous"
211
+
212
+ N, K = B.shape
213
+ M = A.numel() // A.shape[-1]
214
+
215
+ assert As.ndim >= 2, f"As must be at least 2D, got ndim={As.ndim}"
216
+ assert As.shape[-1] == triton.cdiv(K, block_k), (
217
+ f"As last dim {As.shape[-1]} != expected {triton.cdiv(K, block_k)} (cdiv(K={K}, block_k={block_k}))"
218
+ )
219
+ assert Bs.ndim == 2, f"Bs must be 2D (N//block_n, K//block_k), got ndim={Bs.ndim}"
220
+ assert Bs.shape == (triton.cdiv(N, block_n), triton.cdiv(K, block_k)), (
221
+ f"Bs shape {tuple(Bs.shape)} != expected ({triton.cdiv(N, block_n)}, {triton.cdiv(K, block_k)})"
222
+ )
223
+
224
+ BLOCK_SIZE_K = block_k
225
+ BLOCK_SIZE_N = block_n
226
+ C_shape = A.shape[:-1] + (N,)
227
+ C = A.new_empty(C_shape, dtype=output_dtype)
228
+ # Adaptive BLOCK_SIZE_M: smallest power-of-2 >= M, floored at 16, capped at 128.
229
+ # Matches the WGMMA tile to the actual row count — smaller tiles use less
230
+ # register pressure and a better-matched FP8 WGMMA instruction, improving
231
+ # both accuracy and performance for small M (decode).
232
+ BLOCK_SIZE_M = min(max(triton.next_power_of_2(M), 16), 128)
233
+ grid = (triton.cdiv(M, BLOCK_SIZE_M), triton.cdiv(N, BLOCK_SIZE_N))
234
+ with device_context(A.device):
235
+ wrap_triton(w8a8_block_fp8_matmul_kernel)[grid](
236
+ A,
237
+ B,
238
+ C,
239
+ As,
240
+ Bs,
241
+ M,
242
+ N,
243
+ K,
244
+ A.stride(-2),
245
+ A.stride(-1),
246
+ B.stride(1),
247
+ B.stride(0),
248
+ C.stride(-2),
249
+ C.stride(-1),
250
+ As.stride(-2),
251
+ As.stride(-1),
252
+ Bs.stride(1),
253
+ Bs.stride(0),
254
+ # Meta-parameters
255
+ BLOCK_SIZE_M=BLOCK_SIZE_M,
256
+ BLOCK_SIZE_N=BLOCK_SIZE_N,
257
+ BLOCK_SIZE_K=BLOCK_SIZE_K,
258
+ GROUP_SIZE_M=8,
259
+ )
260
+
261
+ return C
262
+
263
+
264
+ @triton_op("finegrained_fp8::w8a8_tensor_fp8_matmul", mutates_args=())
265
+ def _w8a8_tensor_fp8_matmul(
266
+ A: torch.Tensor,
267
+ B: torch.Tensor,
268
+ As: torch.Tensor,
269
+ Bs: torch.Tensor,
270
+ output_dtype: torch.dtype = torch.float32,
271
+ ) -> torch.Tensor:
272
+ """Tensor-scale FP8 matmul: C = A @ B.T with per-row / per-tensor scales.
273
+
274
+ As: scalar, (M,), or (M, 1) — per-row activation scales
275
+ Bs: scalar, (1,), or (1, 1) — single weight scale
276
+ """
277
+ assert A.shape[-1] == B.shape[-1], (
278
+ f"K mismatch: A has K={A.shape[-1]}, B has K={B.shape[-1]}"
279
+ )
280
+ assert A.is_contiguous(), "A must be contiguous"
281
+ assert B.ndim == 2, f"B must be 2D (N, K), got ndim={B.ndim}"
282
+ assert B.is_contiguous(), "B must be contiguous"
283
+
284
+ N, K = B.shape
285
+ M = A.numel() // A.shape[-1]
286
+
287
+ # Normalize As to (M,)
288
+ if As.numel() == 1:
289
+ As = As.reshape(1).expand(M).contiguous()
290
+ elif As.ndim == 2:
291
+ As = As.reshape(M)
292
+ assert As.ndim == 1 and As.shape[0] == M, (
293
+ f"As must be scalar, (M,), or (M,1) with M={M}, got {tuple(As.shape)}"
294
+ )
295
+
296
+ # Normalize Bs to (1,)
297
+ assert Bs.numel() == 1, f"Bs must be scalar or (1,), got {tuple(Bs.shape)}"
298
+ Bs = Bs.reshape(1)
299
+
300
+ BLOCK_SIZE_N = 128
301
+ BLOCK_SIZE_K = 128
302
+ C_shape = A.shape[:-1] + (N,)
303
+ C = A.new_empty(C_shape, dtype=output_dtype)
304
+ BLOCK_SIZE_M = min(max(triton.next_power_of_2(M), 16), 128)
305
+ grid = (triton.cdiv(M, BLOCK_SIZE_M), triton.cdiv(N, BLOCK_SIZE_N))
306
+ with device_context(A.device):
307
+ wrap_triton(w8a8_tensor_fp8_matmul_kernel)[grid](
308
+ A,
309
+ B,
310
+ C,
311
+ As,
312
+ Bs,
313
+ M,
314
+ N,
315
+ K,
316
+ A.stride(-2),
317
+ A.stride(-1),
318
+ B.stride(1),
319
+ B.stride(0),
320
+ C.stride(-2),
321
+ C.stride(-1),
322
+ As.stride(0),
323
+ # Meta-parameters
324
+ BLOCK_SIZE_M=BLOCK_SIZE_M,
325
+ BLOCK_SIZE_N=BLOCK_SIZE_N,
326
+ BLOCK_SIZE_K=BLOCK_SIZE_K,
327
+ GROUP_SIZE_M=8,
328
+ )
329
+
330
+ return C
331
+
332
+
333
+ def w8a8_block_fp8_matmul(
334
+ A: torch.Tensor,
335
+ B: torch.Tensor,
336
+ As: torch.Tensor,
337
+ Bs: torch.Tensor,
338
+ block_size: list[int],
339
+ output_dtype: torch.dtype = torch.float32,
340
+ ) -> torch.Tensor:
341
+ """Block-wise W8A8 FP8 matrix multiplication.
342
+
343
+ Computes ``C = A @ B.T`` where both operands are pre-quantized to
344
+ ``float8_e4m3fn`` with per-block scales, and accumulates in float32
345
+ before casting to ``output_dtype``.
346
+
347
+ Args:
348
+ A: Quantized activation tensor ``[M, K]`` in ``float8_e4m3fn``.
349
+ B: Quantized weight tensor ``[N, K]`` in ``float8_e4m3fn``.
350
+ As: Per-token-group activation scales ``[M, K // block_size[1]]``.
351
+ Bs: Per-block weight scales ``[N // block_size[0], K // block_size[1]]``.
352
+ block_size: ``[block_n, block_k]`` quantization block dimensions, e.g. ``[128, 128]``.
353
+ output_dtype: dtype of the returned tensor (default: ``torch.float32``).
354
+
355
+ Returns:
356
+ Output tensor ``[M, N]`` in ``output_dtype``.
357
+ """
358
+ return torch.ops.finegrained_fp8.w8a8_block_fp8_matmul(
359
+ A, B, As, Bs, block_size, output_dtype
360
+ )
361
+
362
+
363
+ def w8a8_tensor_fp8_matmul(
364
+ A: torch.Tensor,
365
+ B: torch.Tensor,
366
+ As: torch.Tensor,
367
+ Bs: torch.Tensor,
368
+ output_dtype: torch.dtype = torch.float32,
369
+ ) -> torch.Tensor:
370
+ """Tensor-scale W8A8 FP8 matrix multiplication.
371
+
372
+ Computes ``C = A @ B.T`` in tensor-scale mode using pre-quantized FP8
373
+ activations/weights and tensor scales.
374
+
375
+ Args:
376
+ A: Quantized activation tensor ``[M, K]`` in ``float8_e4m3fn``.
377
+ B: Quantized weight tensor ``[N, K]`` in ``float8_e4m3fn``.
378
+ As: Per-row activation scales ``[M]``.
379
+ Bs: Single weight scale, scalar or ``[1]``.
380
+ output_dtype: dtype of the returned tensor.
381
+
382
+ Returns:
383
+ Output tensor ``[M, N]`` in ``output_dtype``.
384
+ """
385
+ return torch.ops.finegrained_fp8.w8a8_tensor_fp8_matmul(A, B, As, Bs, output_dtype)
386
+
387
+
388
+ def w8a8_fp8_matmul(
389
+ A: torch.Tensor,
390
+ B: torch.Tensor,
391
+ As: torch.Tensor,
392
+ Bs: torch.Tensor,
393
+ block_size: list[int] | None,
394
+ output_dtype: torch.dtype = torch.float32,
395
+ ) -> torch.Tensor:
396
+ """Unified W8A8 FP8 matmul dispatcher.
397
+
398
+ Dispatch rules:
399
+ - tensor mode when ``block_size is None``
400
+ - tensor mode when ``block_size == [N, K]``
401
+ - otherwise block mode
402
+
403
+ Returns:
404
+ Output tensor ``[M, N]`` in ``output_dtype``.
405
+ """
406
+ if block_size is None or (
407
+ block_size[0] == B.size(0) and block_size[1] == B.size(1)
408
+ ):
409
+ return w8a8_tensor_fp8_matmul(A, B, As, Bs, output_dtype)
410
+
411
+ return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype)
build/torch-cuda/metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 1,
3
+ "license": "Apache-2.0",
4
+ "python-depends": [],
5
+ "backend": {
6
+ "type": "cuda"
7
+ }
8
+ }
build/torch-cuda/utils.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from contextlib import contextmanager
3
+
4
+
5
+ @contextmanager
6
+ def device_context(device: torch.device):
7
+ """Context manager that sets the active device for any backend (cuda, xpu, etc.)."""
8
+ backend = getattr(torch, device.type, None)
9
+ if backend is not None and hasattr(backend, "device"):
10
+ with backend.device(device):
11
+ yield
12
+ else:
13
+ yield
build/torch-rocm/__init__.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .act_quant import fp8_act_quant
2
+ from .batched import (
3
+ w8a8_fp8_matmul_batched,
4
+ w8a8_block_fp8_matmul_batched,
5
+ w8a8_tensor_fp8_matmul_batched,
6
+ )
7
+ from .grouped import (
8
+ w8a8_fp8_matmul_grouped,
9
+ w8a8_block_fp8_matmul_grouped,
10
+ w8a8_tensor_fp8_matmul_grouped,
11
+ )
12
+ from .matmul import (
13
+ w8a8_fp8_matmul,
14
+ w8a8_block_fp8_matmul,
15
+ w8a8_tensor_fp8_matmul,
16
+ )
17
+
18
+ __all__ = [
19
+ "fp8_act_quant",
20
+ # Single matmul
21
+ "w8a8_fp8_matmul",
22
+ "w8a8_block_fp8_matmul",
23
+ "w8a8_tensor_fp8_matmul",
24
+ # Batched matmul
25
+ "w8a8_fp8_matmul_batched",
26
+ "w8a8_block_fp8_matmul_batched",
27
+ "w8a8_tensor_fp8_matmul_batched",
28
+ # Grouped matmul
29
+ "w8a8_fp8_matmul_grouped",
30
+ "w8a8_block_fp8_matmul_grouped",
31
+ "w8a8_tensor_fp8_matmul_grouped",
32
+ ]
build/torch-rocm/_ops.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ ops = torch.ops._finegrained_fp8_75cbe1b
3
+
4
+ def add_op_namespace_prefix(op_name: str):
5
+ """
6
+ Prefix op by namespace.
7
+ """
8
+ return f"_finegrained_fp8_75cbe1b::{op_name}"
build/torch-rocm/act_quant.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import triton
17
+ import triton.language as tl
18
+ from torch.library import triton_op, wrap_triton
19
+
20
+ from .utils import device_context
21
+
22
+
23
+ _FP8_DTYPE = torch.float8_e4m3fn
24
+
25
+
26
+ # Copied from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py
27
+ @triton.jit
28
+ def _fp8_act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
29
+ pid = tl.program_id(axis=0)
30
+ offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
31
+ x = tl.load(x_ptr + offs).to(tl.float32)
32
+ s = tl.max(tl.abs(x)) / 448.0 # float8_e4m3fn max
33
+ y = (x / s).to(y_ptr.dtype.element_ty)
34
+ tl.store(y_ptr + offs, y)
35
+ tl.store(s_ptr + pid, s)
36
+
37
+
38
+ @triton_op("finegrained_fp8::fp8_act_quant", mutates_args=())
39
+ def _fp8_act_quant(
40
+ x: torch.Tensor, block_size: int = 128
41
+ ) -> tuple[torch.Tensor, torch.Tensor]:
42
+ assert x.is_contiguous()
43
+ assert x.shape[-1] % block_size == 0
44
+ y = torch.empty_like(x, dtype=_FP8_DTYPE)
45
+ grid = (triton.cdiv(x.numel(), block_size),)
46
+ s = x.new_empty(*x.size()[:-1], x.size(-1) // block_size, dtype=torch.float32)
47
+
48
+ with device_context(x.device):
49
+ wrap_triton(_fp8_act_quant_kernel)[grid](x, y, s, BLOCK_SIZE=block_size)
50
+
51
+ return y, s
52
+
53
+
54
+ def fp8_act_quant(
55
+ x: torch.Tensor, block_size: int = 128
56
+ ) -> tuple[torch.Tensor, torch.Tensor]:
57
+ """Quantize activations to FP8 with per-block dynamic scaling.
58
+
59
+ Splits the last dimension of ``x`` into blocks of ``block_size`` elements,
60
+ computes ``scale = max(|x_block|) / 448`` per block, and quantizes to
61
+ ``float8_e4m3fn``.
62
+
63
+ Args:
64
+ x: Input tensor in bf16/fp16/fp32. Last dimension must be divisible by
65
+ ``block_size`` and the tensor must be contiguous.
66
+ block_size: Number of elements per quantization block (default: 128).
67
+
68
+ Returns:
69
+ A tuple ``(quantized, scales)`` where ``quantized`` has dtype
70
+ ``float8_e4m3fn`` with the same shape as ``x``, and ``scales`` has
71
+ shape ``(*x.shape[:-1], x.shape[-1] // block_size)`` in float32.
72
+ """
73
+ return torch.ops.finegrained_fp8.fp8_act_quant(x, block_size)
build/torch-rocm/batched.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import triton
17
+ import triton.language as tl
18
+ from .act_quant import fp8_act_quant
19
+ from torch.library import triton_op, wrap_triton
20
+
21
+ from .utils import device_context
22
+
23
+
24
+ @triton.autotune(
25
+ configs=[
26
+ triton.Config({}, num_warps=w, num_stages=s)
27
+ for w in [2, 4, 8, 16]
28
+ for s in [2, 3, 4, 5]
29
+ ],
30
+ key=["N", "K"],
31
+ )
32
+ @triton.jit
33
+ def w8a8_block_fp8_matmul_batched_kernel(
34
+ A, # (S, K) raw BF16/FP16 activations
35
+ B, # (E, N, K) FP8 weight matrices
36
+ C, # (S, N) output
37
+ Bs, # (E, N // BLOCK_SIZE_N, K // BLOCK_SIZE_K) weight scales
38
+ ExpertIds, # (S,) — which expert each batch element routes to
39
+ # Shape
40
+ S,
41
+ N,
42
+ K,
43
+ stride_am,
44
+ stride_ak,
45
+ stride_be,
46
+ stride_bk,
47
+ stride_bn,
48
+ stride_cm,
49
+ stride_cn,
50
+ stride_bs_e,
51
+ stride_bs_k,
52
+ stride_bs_n,
53
+ # Meta-parameters
54
+ BLOCK_SIZE_N: tl.constexpr,
55
+ BLOCK_SIZE_K: tl.constexpr,
56
+ BLOCK_SIZE_M: tl.constexpr,
57
+ ):
58
+ """Block-scale batched FP8 expert matmul kernel.
59
+
60
+ Each program handles one routed token row and one N-tile, looks up the
61
+ owning expert from ``ExpertIds``, and applies fused activation quantization.
62
+ """
63
+ batch_id = tl.program_id(axis=0)
64
+ pid_n = tl.program_id(axis=1)
65
+
66
+ # Cast expert_id to int64 to prevent int32 overflow when computing
67
+ # expert_id * stride_Eb (e.g. 255 * 9_437_184 > 2^31 for 256 experts of
68
+ # 3072×3072 FP8 weights).
69
+ expert_id = tl.load(ExpertIds + batch_id).to(tl.int64)
70
+
71
+ A = A + batch_id * stride_am
72
+ B = B + expert_id * stride_be
73
+ C = C + batch_id * stride_cm
74
+ Bs = Bs + expert_id * stride_bs_e
75
+
76
+ offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
77
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
78
+ a_ptrs = A + tl.arange(0, BLOCK_SIZE_M)[:, None] * 0 + offs_k[None, :] * stride_ak
79
+ b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
80
+
81
+ bs_ptrs = Bs + pid_n * stride_bs_n
82
+
83
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
84
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
85
+ # ---- fused fp8_act_quant ----
86
+ a_raw = tl.load(a_ptrs).to(tl.float32)
87
+ a_s = tl.max(tl.abs(a_raw)) / 448.0
88
+ a = (a_raw / tl.maximum(a_s, 1e-12)).to(tl.float8e4nv)
89
+ # ---- matmul ----
90
+ b = tl.load(b_ptrs)
91
+ b_s = tl.load(bs_ptrs + k * stride_bs_k)
92
+ accumulator += tl.dot(a, b) * a_s * b_s[None, :]
93
+ a_ptrs += BLOCK_SIZE_K * stride_ak
94
+ b_ptrs += BLOCK_SIZE_K * stride_bk
95
+
96
+ if C.dtype.element_ty == tl.bfloat16:
97
+ c = accumulator.to(tl.bfloat16)
98
+ elif C.dtype.element_ty == tl.float16:
99
+ c = accumulator.to(tl.float16)
100
+ else:
101
+ c = accumulator.to(tl.float32)
102
+
103
+ offs_cm = tl.arange(0, BLOCK_SIZE_M)
104
+ offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
105
+ c_ptrs = C + offs_cm[:, None] * 0 + stride_cn * offs_cn[None, :]
106
+ tl.store(c_ptrs, c)
107
+
108
+
109
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=w, num_stages=s)
        for w in [2, 4, 8, 16]
        for s in [2, 3, 4, 5]
    ],
    key=["N", "K"],
)
@triton.jit
def w8a8_tensor_fp8_matmul_batched_kernel(
    A,  # (S, K) pre-quantized FP8 activations
    B,  # (E, N, K) FP8 weight matrices
    C,  # (S, N) output
    As,  # (S, 1) per-tensor activation scales
    Bs,  # (E, 1, 1) per-tensor weight scales
    ExpertIds,  # (S,) expert index selecting which B matrix each row uses
    S,
    N,
    K,
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_as_m,
    stride_bs_e,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
):
    """Tensor-scale batched FP8 expert matmul kernel.

    Activations are already quantized; the kernel applies per-token activation
    scales and per-expert tensor weight scales.
    """
    # One program per (row, N-tile): axis 0 walks rows of A, axis 1 walks
    # BLOCK_SIZE_N-wide column tiles of the selected expert's weights.
    batch_id = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)

    # int64 keeps expert_id * stride_be from overflowing int32 for large B.
    expert_id = tl.load(ExpertIds + batch_id).to(tl.int64)

    # Rebase all pointers onto this row / this expert.
    A = A + batch_id * stride_am
    B = B + expert_id * stride_be
    C = C + batch_id * stride_cm
    Bs = Bs + expert_id * stride_bs_e

    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # The `* 0` row offset replicates the single activation row across all
    # BLOCK_SIZE_M lanes so tl.dot sees a full (M, K) tile.
    a_ptrs = A + tl.arange(0, BLOCK_SIZE_M)[:, None] * 0 + offs_k[None, :] * stride_ak
    b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)

    # Scalar scales: one per expert (weights), one per token (activations).
    b_s = tl.load(Bs)
    a_s = tl.load(As + batch_id * stride_as_m)

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for _ in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        a = tl.load(a_ptrs)
        b = tl.load(b_ptrs)
        accumulator += tl.dot(a, b)
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    # Both scales are loop-invariant scalars, so apply them once at the end.
    accumulator = accumulator * a_s * b_s

    # Cast the fp32 accumulator to the output element type.
    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    # Mirror of the `* 0` trick on the load side: every M lane holds the same
    # row, so all lanes store the same values to the same C row.
    offs_cm = tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = C + offs_cm[:, None] * 0 + stride_cn * offs_cn[None, :]
    tl.store(c_ptrs, c)
185
+
186
+
187
+ @triton_op("finegrained_fp8::w8a8_block_fp8_matmul_batched", mutates_args=())
188
+ def _w8a8_block_fp8_matmul_batched(
189
+ A: torch.Tensor,
190
+ B: torch.Tensor,
191
+ Bs: torch.Tensor,
192
+ expert_ids: torch.Tensor,
193
+ block_size: list[int],
194
+ ) -> torch.Tensor:
195
+ """Block-scale batched FP8 matmul: C[s] = A[s] @ B[expert_ids[s]].T, with fused act quant.
196
+
197
+ A: (S, K) raw bf16/fp16 activations
198
+ B: (E, N, K) FP8 expert weights
199
+ Bs: (E, N // block_n, K // block_k) per-block weight scales
200
+ """
201
+ assert A.ndim == 2, f"A must be 2D (S, K), got ndim={A.ndim}"
202
+ assert A.is_contiguous(), "A must be contiguous"
203
+ assert B.ndim == 3, f"B must be 3D (E, N, K), got ndim={B.ndim}"
204
+ assert B.is_contiguous(), "B must be contiguous"
205
+ assert A.shape[1] == B.shape[2], (
206
+ f"K mismatch: A has K={A.shape[1]}, B has K={B.shape[2]}"
207
+ )
208
+
209
+ S, K = A.shape
210
+ E, N, _ = B.shape
211
+
212
+ assert len(block_size) == 2, (
213
+ f"block_size must be [block_n, block_k], got {block_size}"
214
+ )
215
+ block_n, block_k = block_size[0], block_size[1]
216
+ # MoE expert dimensions must be block-aligned; non-aligned N/K is not supported.
217
+ assert N % block_n == 0, f"N ({N}) must be divisible by block_n ({block_n})"
218
+ assert K % block_k == 0, f"K ({K}) must be divisible by block_k ({block_k})"
219
+ assert Bs.ndim == 3, (
220
+ f"Bs must be 3D (E, N//block_n, K//block_k), got ndim={Bs.ndim}"
221
+ )
222
+ assert Bs.shape == (E, N // block_n, K // block_k), (
223
+ f"Bs shape {tuple(Bs.shape)} != expected ({E}, {N // block_n}, {K // block_k})"
224
+ )
225
+
226
+ C = A.new_empty(S, N)
227
+ # Adaptive BLOCK_SIZE_M: smallest power-of-2 >= M, floored at 16, capped at 128.
228
+ # Matches the WGMMA tile to the actual row count — smaller tiles use less
229
+ # register pressure and a better-matched FP8 WGMMA instruction, improving
230
+ # both accuracy and performance for small M (decode).
231
+ BLOCK_SIZE_M = min(max(triton.next_power_of_2((S + E - 1) // E), 16), 128)
232
+ grid = (S, triton.cdiv(N, block_n))
233
+ with device_context(A.device):
234
+ wrap_triton(w8a8_block_fp8_matmul_batched_kernel)[grid](
235
+ A,
236
+ B,
237
+ C,
238
+ Bs,
239
+ expert_ids,
240
+ S,
241
+ N,
242
+ K,
243
+ A.stride(0),
244
+ A.stride(1),
245
+ B.stride(0),
246
+ B.stride(2),
247
+ B.stride(1),
248
+ C.stride(0),
249
+ C.stride(1),
250
+ Bs.stride(0),
251
+ Bs.stride(2),
252
+ Bs.stride(1),
253
+ BLOCK_SIZE_N=block_n,
254
+ BLOCK_SIZE_K=block_k,
255
+ BLOCK_SIZE_M=BLOCK_SIZE_M,
256
+ )
257
+
258
+ return C
259
+
260
+
261
+ @triton_op("finegrained_fp8::w8a8_tensor_fp8_matmul_batched", mutates_args=())
262
+ def _w8a8_tensor_fp8_matmul_batched(
263
+ A: torch.Tensor,
264
+ B: torch.Tensor,
265
+ Bs: torch.Tensor,
266
+ expert_ids: torch.Tensor,
267
+ ) -> torch.Tensor:
268
+ """Tensor-scale batched FP8 matmul: C[s] = A[s] @ B[expert_ids[s]].T, with fused act quant.
269
+
270
+ A: (S, K) raw bf16/fp16 activations
271
+ B: (E, N, K) FP8 expert weights
272
+ Bs: (E,) or (E, 1, 1) per-expert weight scales
273
+ """
274
+ assert A.ndim == 2, f"A must be 2D (S, K), got ndim={A.ndim}"
275
+ assert A.is_contiguous(), "A must be contiguous"
276
+ assert B.ndim == 3, f"B must be 3D (E, N, K), got ndim={B.ndim}"
277
+ assert B.is_contiguous(), "B must be contiguous"
278
+ assert A.shape[1] == B.shape[2], (
279
+ f"K mismatch: A has K={A.shape[1]}, B has K={B.shape[2]}"
280
+ )
281
+
282
+ S, K = A.shape
283
+ E, N, _ = B.shape
284
+
285
+ # Normalize Bs to (E, 1, 1)
286
+ if Bs.ndim == 1:
287
+ assert Bs.shape[0] == E, f"Bs shape {tuple(Bs.shape)} != expected ({E},)"
288
+ Bs = Bs.reshape(E, 1, 1)
289
+ else:
290
+ assert Bs.shape == (E, 1, 1), (
291
+ f"Bs shape {tuple(Bs.shape)} != expected ({E}, 1, 1)"
292
+ )
293
+
294
+ BLOCK_SIZE_N = 128
295
+ BLOCK_SIZE_K = 128
296
+ C = A.new_empty(S, N)
297
+ qA, As = fp8_act_quant(A, K)
298
+ grid = (S, triton.cdiv(N, BLOCK_SIZE_N))
299
+ # Adaptive BLOCK_SIZE_M: smallest power-of-2 >= M, floored at 16, capped at 128.
300
+ # Matches the WGMMA tile to the actual row count — smaller tiles use less
301
+ # register pressure and a better-matched FP8 WGMMA instruction, improving
302
+ # both accuracy and performance for small M (decode).
303
+ BLOCK_SIZE_M = min(max(triton.next_power_of_2((S + E - 1) // E), 16), 128)
304
+ grid = (S, triton.cdiv(N, BLOCK_SIZE_N))
305
+ with device_context(A.device):
306
+ wrap_triton(w8a8_tensor_fp8_matmul_batched_kernel)[grid](
307
+ qA,
308
+ B,
309
+ C,
310
+ As,
311
+ Bs,
312
+ expert_ids,
313
+ S,
314
+ N,
315
+ K,
316
+ qA.stride(0),
317
+ qA.stride(1),
318
+ B.stride(0),
319
+ B.stride(2),
320
+ B.stride(1),
321
+ C.stride(0),
322
+ C.stride(1),
323
+ As.stride(0),
324
+ Bs.stride(0),
325
+ BLOCK_SIZE_N=BLOCK_SIZE_N,
326
+ BLOCK_SIZE_K=BLOCK_SIZE_K,
327
+ BLOCK_SIZE_M=BLOCK_SIZE_M,
328
+ )
329
+
330
+ return C
331
+
332
+
333
def w8a8_block_fp8_matmul_batched(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    expert_ids: torch.Tensor,
    block_size: list[int],
) -> torch.Tensor:
    """Block-scale batched FP8 matmul with fused activation quantization.

    Thin wrapper forwarding to the registered ``finegrained_fp8`` custom op.

    A: (S, K) raw activations, bf16/fp16/fp32
    B: (E, N, K) FP8 expert weights
    Bs: (E, N // block_n, K // block_k) per-block weight scales
    """
    op = torch.ops.finegrained_fp8.w8a8_block_fp8_matmul_batched
    return op(A, B, Bs, expert_ids, block_size)
349
+
350
+
351
def w8a8_tensor_fp8_matmul_batched(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    expert_ids: torch.Tensor,
) -> torch.Tensor:
    """Tensor-scale batched FP8 matmul with fused activation quantization.

    Thin wrapper forwarding to the registered ``finegrained_fp8`` custom op.

    A: (S, K) raw activations, bf16/fp16/fp32
    B: (E, N, K) FP8 expert weights
    Bs: (E,) or (E, 1, 1) per-expert weight scales
    """
    op = torch.ops.finegrained_fp8.w8a8_tensor_fp8_matmul_batched
    return op(A, B, Bs, expert_ids)
366
+
367
+
368
def w8a8_fp8_matmul_batched(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    expert_ids: torch.Tensor,
    block_size: list[int] | None,
) -> torch.Tensor:
    """Unified batched W8A8 FP8 matmul dispatcher.

    Dispatch rules:
        - tensor mode when ``block_size is None``
        - tensor mode when ``block_size == [N, K]`` (one block == whole matrix)
        - otherwise block mode

    Returns:
        Output tensor ``[S, N]`` in the same dtype as ``A``.
    """
    use_tensor_mode = block_size is None or (
        block_size[0] == B.size(1) and block_size[1] == B.size(2)
    )
    if use_tensor_mode:
        return w8a8_tensor_fp8_matmul_batched(A, B, Bs, expert_ids)
    return w8a8_block_fp8_matmul_batched(A, B, Bs, expert_ids, block_size)
build/torch-rocm/finegrained_fp8/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
+ def _import_from_path(file_path: Path) -> ModuleType:
9
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
10
+ # it would also be used for other imports. So, we make a module name that
11
+ # depends on the path for it to be unique using the hex-encoded hash of
12
+ # the path.
13
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
+ module_name = path_hash
15
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
16
+ if spec is None:
17
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
+ module = importlib.util.module_from_spec(spec)
19
+ if module is None:
20
+ raise ImportError(f"Cannot load module {module_name} from spec")
21
+ sys.modules[module_name] = module
22
+ spec.loader.exec_module(module) # type: ignore
23
+ return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch-rocm/grouped.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import triton
17
+ import triton.language as tl
18
+ from .act_quant import fp8_act_quant
19
+ from torch.library import triton_op, wrap_triton
20
+
21
+ from .utils import device_context
22
+
23
+
24
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=w, num_stages=s)
        for w in [2, 4, 8, 16]
        for s in [2, 3, 4, 5]
    ],
    key=["N", "K", "BLOCK_SIZE_M"],
)
@triton.jit
def w8a8_block_fp8_matmul_grouped_kernel(
    A,  # (S, K) raw BF16/FP16 activations, sorted/grouped by expert id
    B,  # (E, N, K) FP8 weight matrices
    C,  # (S, N) output
    Bs,  # (E, N // BLOCK_SIZE_N, K // BLOCK_SIZE_K) weight scales
    Offsets,  # (E,) int32 — cumulative row-end per expert
    TileOffsets,  # (E,) int32 — cumulative tile-end per expert
    # Shape
    S,
    N,
    K,
    # Strides
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_bs_e,
    stride_bs_k,
    stride_bs_n,
    # Meta-parameters
    NUM_EXPERTS: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
    NUM_EXPERTS_BIT_LENGTH: tl.constexpr,
):
    """Block-scale grouped FP8 expert matmul kernel.

    Tokens are assumed sorted by expert. The kernel maps each M-tile to its
    owning expert via ``TileOffsets`` and applies fused activation quantization.
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)

    # Exit early for programs beyond the actual tile count (the launcher
    # over-provisions the grid with a data-independent upper bound).
    total_tiles = tl.load(TileOffsets + NUM_EXPERTS - 1)
    if pid_m >= total_tiles:
        return

    # Binary search in TileOffsets to find the owning expert.
    # Finds the smallest e such that TileOffsets[e] > pid_m (upper_bound semantics),
    # which is the expert whose tile range contains pid_m.
    # O(log2(NUM_EXPERTS)) loads instead of the O(NUM_EXPERTS) linear scan.
    # NUM_EXPERTS_BIT_LENGTH is ceil(log2(E))+1 for powers-of-two, giving one
    # harmless extra iteration when lo==hi; it's a compile-time constant so the
    # loop is fully unrolled by the compiler.
    lo = 0
    hi = NUM_EXPERTS
    for _ in tl.static_range(NUM_EXPERTS_BIT_LENGTH):
        mid = (lo + hi) >> 1
        mid_val = tl.load(TileOffsets + mid)
        is_left = mid_val <= pid_m
        lo = tl.where(is_left, mid + 1, lo)
        hi = tl.where(is_left, hi, mid)

    # Cast expert_id to int64 to prevent int32 overflow when computing
    # expert_id * stride_be (e.g. 255 * 9_437_184 > 2^31 for 256 experts of
    # 3072×3072 FP8 weights).
    expert_id = lo.to(tl.int64)

    # Row range [expert_start, expert_end) that this expert owns in A/C.
    prev_eid = tl.maximum(expert_id - 1, 0)
    expert_start = tl.where(expert_id == 0, 0, tl.load(Offsets + prev_eid))
    expert_end = tl.load(Offsets + expert_id)
    M_expert = expert_end - expert_start

    # Position of this program's tile within the expert's own tile range.
    expert_tile_start = tl.where(expert_id == 0, 0, tl.load(TileOffsets + prev_eid))
    local_tile = pid_m - expert_tile_start
    m_off = local_tile * BLOCK_SIZE_M

    offs_am = m_off + tl.arange(0, BLOCK_SIZE_M)
    # Masks off the tail rows of the expert's last (partial) M-tile.
    row_mask = offs_am < M_expert
    offs_global_m = expert_start + offs_am

    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    a_ptrs = A + offs_global_m[:, None] * stride_am + offs_k[None, :] * stride_ak
    b_ptrs = (
        B
        + expert_id * stride_be
        + offs_k[:, None] * stride_bk
        + offs_bn[None, :] * stride_bn
    )
    bs_ptrs = Bs + expert_id * stride_bs_e + pid_n * stride_bs_n

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # ---- fused fp8_act_quant ----
        # Dynamic per-row scale: max-abs over the K-block mapped onto the
        # FP8 e4m3 max-normal value 448; the 1e-12 floor avoids div-by-zero.
        a_raw = tl.load(a_ptrs, mask=row_mask[:, None], other=0.0).to(tl.float32)
        a_s = tl.max(tl.abs(a_raw), axis=1) / 448.0
        a = (a_raw / tl.maximum(a_s[:, None], 1e-12)).to(tl.float8e4nv)
        # ---- matmul ----
        b = tl.load(b_ptrs)
        b_s = tl.load(bs_ptrs + k * stride_bs_k)
        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    # Cast the fp32 accumulator to the output element type.
    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    c_ptrs = C + stride_cm * offs_global_m[:, None] + stride_cn * offs_bn[None, :]
    c_mask = row_mask[:, None]
    tl.store(c_ptrs, c, mask=c_mask)
144
+
145
+
146
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=w, num_stages=s)
        for w in [2, 4, 8, 16]
        for s in [2, 3, 4, 5]
    ],
    key=["N", "K", "BLOCK_SIZE_M"],
)
@triton.jit
def w8a8_tensor_fp8_matmul_grouped_kernel(
    A,  # (S, K) pre-quantized FP8 activations, sorted/grouped by expert id
    B,  # (E, N, K) FP8 weight matrices
    C,  # (S, N) output
    As,  # (S, 1) activation scales
    Bs,  # (E, 1, 1) per-tensor weight scales
    Offsets,  # (E,) cumulative row-end per expert
    TileOffsets,  # (E,) cumulative tile-end per expert
    S,
    N,
    K,
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_as_m,
    stride_bs_e,
    NUM_EXPERTS: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
    NUM_EXPERTS_BIT_LENGTH: tl.constexpr,
):
    """Tensor-scale grouped FP8 expert matmul kernel.

    Uses grouped expert scheduling with pre-quantized activations plus
    per-token activation scales and per-expert tensor weight scales.
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)

    # Early-out for grid over-provisioning (see launcher comment).
    total_tiles = tl.load(TileOffsets + NUM_EXPERTS - 1)
    if pid_m >= total_tiles:
        return

    # Binary search (upper_bound) in TileOffsets for the expert owning pid_m;
    # same scheme as the block-scale kernel above.
    lo = 0
    hi = NUM_EXPERTS
    for _ in tl.static_range(NUM_EXPERTS_BIT_LENGTH):
        mid = (lo + hi) >> 1
        mid_val = tl.load(TileOffsets + mid)
        is_left = mid_val <= pid_m
        lo = tl.where(is_left, mid + 1, lo)
        hi = tl.where(is_left, hi, mid)
    # int64 guards expert_id * stride_be against int32 overflow.
    expert_id = lo.to(tl.int64)

    # Row range this expert owns in A/C, and this tile's local position.
    prev_eid = tl.maximum(expert_id - 1, 0)
    expert_start = tl.where(expert_id == 0, 0, tl.load(Offsets + prev_eid))
    expert_end = tl.load(Offsets + expert_id)
    M_expert = expert_end - expert_start

    expert_tile_start = tl.where(expert_id == 0, 0, tl.load(TileOffsets + prev_eid))
    local_tile = pid_m - expert_tile_start
    m_off = local_tile * BLOCK_SIZE_M

    offs_am = m_off + tl.arange(0, BLOCK_SIZE_M)
    # Masks off tail rows of the expert's last (partial) M-tile.
    row_mask = offs_am < M_expert
    offs_global_m = expert_start + offs_am

    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    a_ptrs = A + offs_global_m[:, None] * stride_am + offs_k[None, :] * stride_ak
    b_ptrs = (
        B
        + expert_id * stride_be
        + offs_k[:, None] * stride_bk
        + offs_bn[None, :] * stride_bn
    )

    # Per-token activation scales and the expert's scalar weight scale.
    a_s = tl.load(As + offs_global_m * stride_as_m, mask=row_mask, other=0.0)
    b_s = tl.load(Bs + expert_id * stride_bs_e)

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for _ in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        a = tl.load(a_ptrs, mask=row_mask[:, None], other=0.0)
        b = tl.load(b_ptrs)

        accumulator += tl.dot(a, b)
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    # Scales are loop-invariant, so apply them once after the K loop.
    accumulator = accumulator * a_s[:, None] * b_s

    # Cast the fp32 accumulator to the output element type.
    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    c_ptrs = C + stride_cm * offs_global_m[:, None] + stride_cn * offs_bn[None, :]
    c_mask = row_mask[:, None]
    tl.store(c_ptrs, c, mask=c_mask)
251
+
252
+
253
+ @triton_op("finegrained_fp8::w8a8_block_fp8_matmul_grouped", mutates_args=())
254
+ def _w8a8_block_fp8_matmul_grouped(
255
+ A: torch.Tensor,
256
+ B: torch.Tensor,
257
+ Bs: torch.Tensor,
258
+ offsets: torch.Tensor,
259
+ tokens_per_expert: torch.Tensor,
260
+ block_size: list[int],
261
+ ) -> torch.Tensor:
262
+ """Block-scale grouped FP8 matmul: C = A @ B.T per expert, with fused act quant.
263
+
264
+ A: (S, K) raw bf16/fp16 activations, sorted by expert
265
+ B: (E, N, K) FP8 expert weights
266
+ Bs: (E, N // block_n, K // block_k) per-block weight scales
267
+ """
268
+ assert A.ndim == 2, f"A must be 2D (S, K), got ndim={A.ndim}"
269
+ assert A.is_contiguous(), "A must be contiguous"
270
+ assert B.ndim == 3, f"B must be 3D (E, N, K), got ndim={B.ndim}"
271
+ assert B.is_contiguous(), "B must be contiguous"
272
+ assert A.shape[1] == B.shape[2], (
273
+ f"K mismatch: A has K={A.shape[1]}, B has K={B.shape[2]}"
274
+ )
275
+
276
+ S, K = A.shape
277
+ E, N, _ = B.shape
278
+
279
+ assert len(block_size) == 2, (
280
+ f"block_size must be [block_n, block_k], got {block_size}"
281
+ )
282
+ block_n, block_k = block_size[0], block_size[1]
283
+ # MoE expert dimensions must be block-aligned; non-aligned N/K is not supported.
284
+ assert N % block_n == 0, f"N ({N}) must be divisible by block_n ({block_n})"
285
+ assert K % block_k == 0, f"K ({K}) must be divisible by block_k ({block_k})"
286
+ assert Bs.ndim == 3, (
287
+ f"Bs must be 3D (E, N//block_n, K//block_k), got ndim={Bs.ndim}"
288
+ )
289
+ assert Bs.shape == (E, N // block_n, K // block_k), (
290
+ f"Bs shape {tuple(Bs.shape)} != expected ({E}, {N // block_n}, {K // block_k})"
291
+ )
292
+
293
+ C = A.new_empty(S, N)
294
+ # Adaptive BLOCK_SIZE_M: match tile to average tokens per expert.
295
+ BLOCK_SIZE_M = min(max(triton.next_power_of_2((S + E - 1) // E), 16), 128)
296
+ tiles_per_expert = (tokens_per_expert + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
297
+ tile_offsets = torch.cumsum(tiles_per_expert, dim=0).to(torch.int32)
298
+ # Upper bound on M-tiles: sum_e ceil(M_e / BLOCK_M) <= ceil(S / BLOCK_M) + E.
299
+ # Programs beyond the real tile count exit immediately via the early-return
300
+ # guard inside the kernel. This is faster than syncing for the exact count
301
+ # and keeps the grid size data-independent (cuda-graph / torch.compile safe).
302
+ max_M_tiles = triton.cdiv(S, BLOCK_SIZE_M) + E
303
+ grid = (max_M_tiles, triton.cdiv(N, block_n))
304
+ with device_context(A.device):
305
+ wrap_triton(w8a8_block_fp8_matmul_grouped_kernel)[grid](
306
+ A,
307
+ B,
308
+ C,
309
+ Bs,
310
+ offsets,
311
+ tile_offsets,
312
+ S,
313
+ N,
314
+ K,
315
+ A.stride(0),
316
+ A.stride(1),
317
+ B.stride(0),
318
+ B.stride(2),
319
+ B.stride(1),
320
+ C.stride(0),
321
+ C.stride(1),
322
+ Bs.stride(0),
323
+ Bs.stride(2),
324
+ Bs.stride(1),
325
+ # Meta-parameters
326
+ NUM_EXPERTS=E,
327
+ BLOCK_SIZE_N=block_n,
328
+ BLOCK_SIZE_K=block_k,
329
+ BLOCK_SIZE_M=BLOCK_SIZE_M,
330
+ NUM_EXPERTS_BIT_LENGTH=E.bit_length(),
331
+ )
332
+
333
+ return C
334
+
335
+
336
+ @triton_op("finegrained_fp8::w8a8_tensor_fp8_matmul_grouped", mutates_args=())
337
+ def _w8a8_tensor_fp8_matmul_grouped(
338
+ A: torch.Tensor,
339
+ B: torch.Tensor,
340
+ Bs: torch.Tensor,
341
+ offsets: torch.Tensor,
342
+ tokens_per_expert: torch.Tensor,
343
+ ) -> torch.Tensor:
344
+ """Tensor-scale grouped FP8 matmul: C = A @ B.T per expert, with fused act quant.
345
+
346
+ A: (S, K) raw bf16/fp16 activations, sorted by expert
347
+ B: (E, N, K) FP8 expert weights
348
+ Bs: (E,) or (E, 1, 1) per-expert weight scales
349
+ """
350
+ assert A.ndim == 2, f"A must be 2D (S, K), got ndim={A.ndim}"
351
+ assert A.is_contiguous(), "A must be contiguous"
352
+ assert B.ndim == 3, f"B must be 3D (E, N, K), got ndim={B.ndim}"
353
+ assert B.is_contiguous(), "B must be contiguous"
354
+ assert A.shape[1] == B.shape[2], (
355
+ f"K mismatch: A has K={A.shape[1]}, B has K={B.shape[2]}"
356
+ )
357
+
358
+ S, K = A.shape
359
+ E, N, _ = B.shape
360
+
361
+ # Normalize Bs to (E, 1, 1)
362
+ if Bs.ndim == 1:
363
+ assert Bs.shape[0] == E, f"Bs shape {tuple(Bs.shape)} != expected ({E},)"
364
+ Bs = Bs.reshape(E, 1, 1)
365
+ else:
366
+ assert Bs.shape == (E, 1, 1), (
367
+ f"Bs shape {tuple(Bs.shape)} != expected ({E}, 1, 1)"
368
+ )
369
+
370
+ BLOCK_SIZE_N = 128
371
+ BLOCK_SIZE_K = 128
372
+ C = A.new_empty(S, N)
373
+ qA, As = fp8_act_quant(A, K)
374
+ BLOCK_SIZE_M = min(max(triton.next_power_of_2((S + E - 1) // E), 16), 128)
375
+ tiles_per_expert = (tokens_per_expert + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
376
+ tile_offsets = torch.cumsum(tiles_per_expert, dim=0).to(torch.int32)
377
+ # Upper bound on M-tiles: sum_e ceil(M_e / BLOCK_M) <= ceil(S / BLOCK_M) + E.
378
+ # Programs beyond the real tile count exit immediately via the early-return
379
+ # guard inside the kernel. This is faster than syncing for the exact count
380
+ # and keeps the grid size data-independent (cuda-graph / torch.compile safe).
381
+ max_M_tiles = triton.cdiv(S, BLOCK_SIZE_M) + E
382
+ grid = (max_M_tiles, triton.cdiv(N, BLOCK_SIZE_N))
383
+ with device_context(A.device):
384
+ wrap_triton(w8a8_tensor_fp8_matmul_grouped_kernel)[grid](
385
+ qA,
386
+ B,
387
+ C,
388
+ As,
389
+ Bs,
390
+ offsets,
391
+ tile_offsets,
392
+ S,
393
+ N,
394
+ K,
395
+ qA.stride(0),
396
+ qA.stride(1),
397
+ B.stride(0),
398
+ B.stride(2),
399
+ B.stride(1),
400
+ C.stride(0),
401
+ C.stride(1),
402
+ As.stride(0),
403
+ Bs.stride(0),
404
+ # Meta-parameters
405
+ NUM_EXPERTS=E,
406
+ BLOCK_SIZE_N=BLOCK_SIZE_N,
407
+ BLOCK_SIZE_K=BLOCK_SIZE_K,
408
+ BLOCK_SIZE_M=BLOCK_SIZE_M,
409
+ NUM_EXPERTS_BIT_LENGTH=E.bit_length(),
410
+ )
411
+
412
+ return C
413
+
414
+
415
def w8a8_block_fp8_matmul_grouped(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    offsets: torch.Tensor,
    tokens_per_expert: torch.Tensor,
    block_size: list[int],
) -> torch.Tensor:
    """Block-scale grouped FP8 matmul with fused activation quantization.

    Thin wrapper forwarding to the registered ``finegrained_fp8`` custom op.

    A: (S, K) raw activations sorted by expert, bf16/fp16/fp32
    B: (E, N, K) FP8 expert weights
    Bs: (E, N // block_n, K // block_k) per-block weight scales
    """
    op = torch.ops.finegrained_fp8.w8a8_block_fp8_matmul_grouped
    return op(A, B, Bs, offsets, tokens_per_expert, block_size)
432
+
433
+
434
def w8a8_tensor_fp8_matmul_grouped(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    offsets: torch.Tensor,
    tokens_per_expert: torch.Tensor,
) -> torch.Tensor:
    """Tensor-scale grouped FP8 matmul with fused activation quantization.

    Thin wrapper forwarding to the registered ``finegrained_fp8`` custom op.

    A: (S, K) raw activations sorted by expert, bf16/fp16/fp32
    B: (E, N, K) FP8 expert weights
    Bs: (E,) or (E, 1, 1) per-expert weight scales
    """
    op = torch.ops.finegrained_fp8.w8a8_tensor_fp8_matmul_grouped
    return op(A, B, Bs, offsets, tokens_per_expert)
450
+
451
+
452
def w8a8_fp8_matmul_grouped(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    offsets: torch.Tensor,
    tokens_per_expert: torch.Tensor,
    block_size: list[int] | None,
) -> torch.Tensor:
    """Unified grouped W8A8 FP8 matmul dispatcher.

    Dispatch rules:
        - tensor mode when ``block_size is None``
        - tensor mode when ``block_size == [N, K]`` (one block == whole matrix)
        - otherwise block mode

    Returns:
        Output tensor ``[S, N]`` in the same dtype as ``A``, in expert-sorted order.
    """
    use_tensor_mode = block_size is None or (
        block_size[0] == B.size(1) and block_size[1] == B.size(2)
    )
    if use_tensor_mode:
        return w8a8_tensor_fp8_matmul_grouped(A, B, Bs, offsets, tokens_per_expert)
    return w8a8_block_fp8_matmul_grouped(
        A, B, Bs, offsets, tokens_per_expert, block_size
    )
build/torch-rocm/matmul.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import triton
17
+ import triton.language as tl
18
+ from torch.library import triton_op, wrap_triton
19
+
20
+ from .utils import device_context
21
+
22
+
23
+ # Adapted from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/layers/quantization/fp8_kernel.py
24
# Adapted from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/layers/quantization/fp8_kernel.py
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=w, num_stages=s)
        for w in [2, 4, 8, 16]
        for s in [2, 3, 4]
    ],
    key=["N", "K", "BLOCK_SIZE_M"],
)
@triton.jit
def w8a8_block_fp8_matmul_kernel(
    # Pointers to inputs and output
    A,  # (M, K) FP8 activations
    B,  # weight matrix, read as (K, N) tiles via the strides below
    C,  # (M, N) output
    As,  # per-(row, K-block) activation scales
    Bs,  # per-(N-block, K-block) weight scales
    # Shape for matmul
    M,
    N,
    K,
    stride_am,
    stride_ak,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_as_m,
    stride_as_k,
    stride_bs_k,
    stride_bs_n,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """Block-scale FP8 GEMM kernel.

    Computes ``C = A @ B.T`` with block-wise activation/weight scales.
    Uses a 2D grid with swizzle for L2 cache locality on B tiles.
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M)

    # Wrap out-of-range lanes modulo M/N so loads stay in bounds; the tail
    # lanes compute garbage that the masked store below discards.
    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = A + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)

    as_ptrs = As + offs_am * stride_as_m
    # Map each output column to its N-block scale index.
    offs_bsn = offs_bn // BLOCK_SIZE_N
    bs_ptrs = Bs + offs_bsn * stride_bs_n

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Mask the ragged final K-block.
        k_remaining = K - k * BLOCK_SIZE_K
        a = tl.load(a_ptrs, mask=offs_k[None, :] < k_remaining, other=0.0)
        b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0)

        # Scales vary per K-block, so they must be applied inside the loop.
        a_s = tl.load(as_ptrs + k * stride_as_k)
        b_s = tl.load(bs_ptrs + k * stride_bs_k)

        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    # Cast the fp32 accumulator to the output element type.
    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = C + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
    tl.store(c_ptrs, c, mask=c_mask)
106
+
107
+
108
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=w, num_stages=s)
        for w in [2, 4, 8, 16]
        for s in [2, 3, 4]
    ],
    key=["N", "K", "BLOCK_SIZE_M"],
)
@triton.jit
def w8a8_tensor_fp8_matmul_kernel(
    A,  # (M, K) FP8 activations
    B,  # weight matrix, read as (K, N) tiles via the strides below
    C,  # (M, N) output
    As,  # (M,) per-row activation scales
    Bs,  # scalar weight scale
    M,
    N,
    K,
    stride_am,
    stride_ak,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_as_m,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """Tensor-scale FP8 GEMM kernel.

    Computes ``C = A @ B.T`` with one activation scale per row and one
    weight scale for the full matrix.
    Uses a 2D grid with swizzle for L2 cache locality on B tiles.
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M)

    # Wrap out-of-range lanes modulo M/N so loads stay in bounds; the tail
    # lanes compute garbage that the masked store below discards.
    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    a_ptrs = A + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
    b_ptrs = B + offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn

    a_s = tl.load(As + offs_am * stride_as_m)
    b_s = tl.load(Bs)

    # Accumulate raw dot products, apply scales once after the loop.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Mask the ragged final K-block.
        k_remaining = K - k * BLOCK_SIZE_K
        a = tl.load(a_ptrs, mask=offs_k[None, :] < k_remaining, other=0.0)
        b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0)
        accumulator += tl.dot(a, b)
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    accumulator = accumulator * a_s[:, None] * b_s

    # Cast the fp32 accumulator to the output element type.
    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = C + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
    tl.store(c_ptrs, c, mask=c_mask)
184
+
185
+
186
@triton_op("finegrained_fp8::w8a8_block_fp8_matmul", mutates_args=())
def _w8a8_block_fp8_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    block_size: list[int],
    output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """Block-scale FP8 matmul: C = A @ B.T with per-block scales.

    As: (M, K // block_k) — per-token-group activation scales
    Bs: (N // block_n, K // block_k) — per-block weight scales

    Returns a tensor of shape ``A.shape[:-1] + (N,)`` in ``output_dtype``.
    """
    assert len(block_size) == 2, (
        f"block_size must be [block_n, block_k], got {block_size}"
    )
    block_n, block_k = block_size[0], block_size[1]

    assert A.shape[-1] == B.shape[-1], (
        f"K mismatch: A has K={A.shape[-1]}, B has K={B.shape[-1]}"
    )
    assert A.is_contiguous(), "A must be contiguous"
    assert B.ndim == 2, f"B must be 2D (N, K), got ndim={B.ndim}"
    assert B.is_contiguous(), "B must be contiguous"

    N, K = B.shape
    # Leading dims of A are flattened into M (A may be >2D).
    M = A.numel() // A.shape[-1]

    assert As.ndim >= 2, f"As must be at least 2D, got ndim={As.ndim}"
    assert As.shape[-1] == triton.cdiv(K, block_k), (
        f"As last dim {As.shape[-1]} != expected {triton.cdiv(K, block_k)} (cdiv(K={K}, block_k={block_k}))"
    )
    assert Bs.ndim == 2, f"Bs must be 2D (N//block_n, K//block_k), got ndim={Bs.ndim}"
    assert Bs.shape == (triton.cdiv(N, block_n), triton.cdiv(K, block_k)), (
        f"Bs shape {tuple(Bs.shape)} != expected ({triton.cdiv(N, block_n)}, {triton.cdiv(K, block_k)})"
    )

    # Kernel tile sizes must match the quantization block sizes so one scale
    # covers exactly one tile.
    BLOCK_SIZE_K = block_k
    BLOCK_SIZE_N = block_n
    C_shape = A.shape[:-1] + (N,)
    C = A.new_empty(C_shape, dtype=output_dtype)
    # Adaptive BLOCK_SIZE_M: smallest power-of-2 >= M, floored at 16, capped at 128.
    # Matches the WGMMA tile to the actual row count — smaller tiles use less
    # register pressure and a better-matched FP8 WGMMA instruction, improving
    # both accuracy and performance for small M (decode).
    BLOCK_SIZE_M = min(max(triton.next_power_of_2(M), 16), 128)
    grid = (triton.cdiv(M, BLOCK_SIZE_M), triton.cdiv(N, BLOCK_SIZE_N))
    with device_context(A.device):
        wrap_triton(w8a8_block_fp8_matmul_kernel)[grid](
            A,
            B,
            C,
            As,
            Bs,
            M,
            N,
            K,
            A.stride(-2),
            A.stride(-1),
            # B is (N, K); passing strides in (k, n) order gives the kernel a
            # transposed view, implementing the A @ B.T contraction.
            B.stride(1),
            B.stride(0),
            C.stride(-2),
            C.stride(-1),
            As.stride(-2),
            As.stride(-1),
            Bs.stride(1),
            Bs.stride(0),
            # Meta-parameters
            BLOCK_SIZE_M=BLOCK_SIZE_M,
            BLOCK_SIZE_N=BLOCK_SIZE_N,
            BLOCK_SIZE_K=BLOCK_SIZE_K,
            GROUP_SIZE_M=8,
        )

    return C
262
+
263
+
264
@triton_op("finegrained_fp8::w8a8_tensor_fp8_matmul", mutates_args=())
def _w8a8_tensor_fp8_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """Tensor-scale FP8 matmul: C = A @ B.T with per-row / per-tensor scales.

    As: scalar, (M,), or (M, 1) — per-row activation scales
    Bs: scalar, (1,), or (1, 1) — single weight scale

    Returns a tensor of shape ``A.shape[:-1] + (N,)`` in ``output_dtype``.
    """
    assert A.shape[-1] == B.shape[-1], (
        f"K mismatch: A has K={A.shape[-1]}, B has K={B.shape[-1]}"
    )
    assert A.is_contiguous(), "A must be contiguous"
    assert B.ndim == 2, f"B must be 2D (N, K), got ndim={B.ndim}"
    assert B.is_contiguous(), "B must be contiguous"

    N, K = B.shape
    # Leading dims of A are flattened into M (A may be >2D).
    M = A.numel() // A.shape[-1]

    # Normalize As to (M,)
    if As.numel() == 1:
        # A single scalar scale is broadcast to every row.
        As = As.reshape(1).expand(M).contiguous()
    elif As.ndim == 2:
        As = As.reshape(M)
    assert As.ndim == 1 and As.shape[0] == M, (
        f"As must be scalar, (M,), or (M,1) with M={M}, got {tuple(As.shape)}"
    )

    # Normalize Bs to (1,)
    assert Bs.numel() == 1, f"Bs must be scalar or (1,), got {tuple(Bs.shape)}"
    Bs = Bs.reshape(1)

    BLOCK_SIZE_N = 128
    BLOCK_SIZE_K = 128
    C_shape = A.shape[:-1] + (N,)
    C = A.new_empty(C_shape, dtype=output_dtype)
    # Adaptive BLOCK_SIZE_M: smallest power-of-2 >= M, floored at 16, capped
    # at 128 (same rationale as the block-scale launcher: match the tile to
    # the actual row count for small-M decode workloads).
    BLOCK_SIZE_M = min(max(triton.next_power_of_2(M), 16), 128)
    grid = (triton.cdiv(M, BLOCK_SIZE_M), triton.cdiv(N, BLOCK_SIZE_N))
    with device_context(A.device):
        wrap_triton(w8a8_tensor_fp8_matmul_kernel)[grid](
            A,
            B,
            C,
            As,
            Bs,
            M,
            N,
            K,
            A.stride(-2),
            A.stride(-1),
            # B strides passed in (k, n) order: the kernel reads B transposed.
            B.stride(1),
            B.stride(0),
            C.stride(-2),
            C.stride(-1),
            As.stride(0),
            # Meta-parameters
            BLOCK_SIZE_M=BLOCK_SIZE_M,
            BLOCK_SIZE_N=BLOCK_SIZE_N,
            BLOCK_SIZE_K=BLOCK_SIZE_K,
            GROUP_SIZE_M=8,
        )

    return C
331
+
332
+
333
def w8a8_block_fp8_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    block_size: list[int],
    output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """Block-wise W8A8 FP8 GEMM, ``C = A @ B.T``.

    Both operands must already be quantized to ``float8_e4m3fn`` with
    per-block scales; accumulation happens in float32 and the result is
    cast to ``output_dtype``.

    Args:
        A: ``[M, K]`` quantized activations (``float8_e4m3fn``).
        B: ``[N, K]`` quantized weights (``float8_e4m3fn``).
        As: ``[M, K // block_size[1]]`` per-token-group activation scales.
        Bs: ``[N // block_size[0], K // block_size[1]]`` per-block weight scales.
        block_size: ``[block_n, block_k]`` quantization block dims, e.g. ``[128, 128]``.
        output_dtype: dtype of the returned tensor (default: ``torch.float32``).

    Returns:
        ``[M, N]`` tensor in ``output_dtype``.
    """
    op = torch.ops.finegrained_fp8.w8a8_block_fp8_matmul
    return op(A, B, As, Bs, block_size, output_dtype)
361
+
362
+
363
def w8a8_tensor_fp8_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """Tensor-scale W8A8 FP8 GEMM, ``C = A @ B.T``.

    Runs the tensor-scale path on pre-quantized FP8 activations and weights:
    one scale per activation row, one scale for the whole weight matrix.

    Args:
        A: ``[M, K]`` quantized activations (``float8_e4m3fn``).
        B: ``[N, K]`` quantized weights (``float8_e4m3fn``).
        As: Per-row activation scales ``[M]``.
        Bs: Single weight scale, scalar or ``[1]``.
        output_dtype: dtype of the returned tensor.

    Returns:
        ``[M, N]`` tensor in ``output_dtype``.
    """
    op = torch.ops.finegrained_fp8.w8a8_tensor_fp8_matmul
    return op(A, B, As, Bs, output_dtype)
386
+
387
+
388
def w8a8_fp8_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    block_size: list[int] | None,
    output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """Unified W8A8 FP8 matmul dispatcher.

    Dispatch rules:
        - tensor mode when ``block_size is None``
        - tensor mode when ``block_size == [N, K]`` (one block covers B)
        - otherwise block mode

    Returns:
        Output tensor ``[M, N]`` in ``output_dtype``.
    """
    if block_size is not None:
        covers_whole_weight = (
            block_size[0] == B.size(0) and block_size[1] == B.size(1)
        )
        if not covers_whole_weight:
            return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype)
    return w8a8_tensor_fp8_matmul(A, B, As, Bs, output_dtype)
build/torch-rocm/metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 1,
3
+ "license": "Apache-2.0",
4
+ "python-depends": [],
5
+ "backend": {
6
+ "type": "rocm"
7
+ }
8
+ }
build/torch-rocm/utils.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from contextlib import contextmanager
3
+
4
+
5
@contextmanager
def device_context(device: torch.device):
    """Context manager that sets the active device for any backend (cuda, xpu, etc.).

    Looks up ``torch.<device.type>`` and enters its ``device(...)`` context
    when available; otherwise it is a no-op.
    """
    backend = getattr(torch, device.type, None)
    if backend is None or not hasattr(backend, "device"):
        yield
    else:
        with backend.device(device):
            yield
build/torch-xpu/__init__.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .act_quant import fp8_act_quant
2
+ from .batched import (
3
+ w8a8_fp8_matmul_batched,
4
+ w8a8_block_fp8_matmul_batched,
5
+ w8a8_tensor_fp8_matmul_batched,
6
+ )
7
+ from .grouped import (
8
+ w8a8_fp8_matmul_grouped,
9
+ w8a8_block_fp8_matmul_grouped,
10
+ w8a8_tensor_fp8_matmul_grouped,
11
+ )
12
+ from .matmul import (
13
+ w8a8_fp8_matmul,
14
+ w8a8_block_fp8_matmul,
15
+ w8a8_tensor_fp8_matmul,
16
+ )
17
+
18
+ __all__ = [
19
+ "fp8_act_quant",
20
+ # Single matmul
21
+ "w8a8_fp8_matmul",
22
+ "w8a8_block_fp8_matmul",
23
+ "w8a8_tensor_fp8_matmul",
24
+ # Batched matmul
25
+ "w8a8_fp8_matmul_batched",
26
+ "w8a8_block_fp8_matmul_batched",
27
+ "w8a8_tensor_fp8_matmul_batched",
28
+ # Grouped matmul
29
+ "w8a8_fp8_matmul_grouped",
30
+ "w8a8_block_fp8_matmul_grouped",
31
+ "w8a8_tensor_fp8_matmul_grouped",
32
+ ]
build/torch-xpu/_ops.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ ops = torch.ops._finegrained_fp8_75cbe1b
3
+
4
def add_op_namespace_prefix(op_name: str):
    """Qualify *op_name* with this kernel build's private op namespace."""
    namespace = "_finegrained_fp8_75cbe1b"
    return f"{namespace}::{op_name}"
build/torch-xpu/act_quant.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import triton
17
+ import triton.language as tl
18
+ from torch.library import triton_op, wrap_triton
19
+
20
+ from .utils import device_context
21
+
22
+
23
+ _FP8_DTYPE = torch.float8_e4m3fn
24
+
25
+
26
# Copied from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py
@triton.jit
def _fp8_act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
    """Quantize one BLOCK_SIZE-element block of ``x`` with a dynamic scale.

    Each program handles one block: scale = max(|block|) / 448, quantized
    values go to ``y`` and the float32 scale to ``s[pid]``.
    """
    pid = tl.program_id(axis=0)
    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    x = tl.load(x_ptr + offs).to(tl.float32)
    s = tl.max(tl.abs(x)) / 448.0  # float8_e4m3fn max
    # NOTE(review): an all-zero block gives s == 0, so x / s divides by zero.
    # The batched matmul kernel guards its fused quant with
    # tl.maximum(a_s, 1e-12) — confirm whether the same guard is wanted here.
    y = (x / s).to(y_ptr.dtype.element_ty)
    tl.store(y_ptr + offs, y)
    tl.store(s_ptr + pid, s)
36
+
37
+
38
@triton_op("finegrained_fp8::fp8_act_quant", mutates_args=())
def _fp8_act_quant(
    x: torch.Tensor, block_size: int = 128
) -> tuple[torch.Tensor, torch.Tensor]:
    """Launch the per-block FP8 activation quantization kernel.

    Returns ``(y, s)``: ``y`` is ``x`` quantized to ``float8_e4m3fn`` and
    ``s`` holds one float32 scale per ``block_size`` elements of the last
    dimension.
    """
    assert x.is_contiguous()
    assert x.shape[-1] % block_size == 0
    y = torch.empty_like(x, dtype=_FP8_DTYPE)
    # One program per block of the flattened tensor — valid because x is
    # contiguous and its last dimension divides evenly into block_size.
    grid = (triton.cdiv(x.numel(), block_size),)
    s = x.new_empty(*x.size()[:-1], x.size(-1) // block_size, dtype=torch.float32)

    with device_context(x.device):
        wrap_triton(_fp8_act_quant_kernel)[grid](x, y, s, BLOCK_SIZE=block_size)

    return y, s
52
+
53
+
54
def fp8_act_quant(
    x: torch.Tensor, block_size: int = 128
) -> tuple[torch.Tensor, torch.Tensor]:
    """Dynamically quantize activations to FP8, block by block.

    The last dimension of ``x`` is partitioned into groups of ``block_size``
    elements; each group is scaled by ``max(|group|) / 448`` and cast to
    ``float8_e4m3fn``.

    Args:
        x: Contiguous bf16/fp16/fp32 tensor whose last dimension is a
            multiple of ``block_size``.
        block_size: Elements per quantization block (default: 128).

    Returns:
        A tuple ``(quantized, scales)``: ``quantized`` has dtype
        ``float8_e4m3fn`` and the same shape as ``x``; ``scales`` is float32
        with shape ``(*x.shape[:-1], x.shape[-1] // block_size)``.
    """
    op = torch.ops.finegrained_fp8.fp8_act_quant
    return op(x, block_size)
build/torch-xpu/batched.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import triton
17
+ import triton.language as tl
18
+ from .act_quant import fp8_act_quant
19
+ from torch.library import triton_op, wrap_triton
20
+
21
+ from .utils import device_context
22
+
23
+
24
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=w, num_stages=s)
        for w in [2, 4, 8, 16]
        for s in [2, 3, 4, 5]
    ],
    key=["N", "K"],
)
@triton.jit
def w8a8_block_fp8_matmul_batched_kernel(
    A,  # (S, K) raw BF16/FP16 activations
    B,  # (E, N, K) FP8 weight matrices
    C,  # (S, N) output
    Bs,  # (E, N // BLOCK_SIZE_N, K // BLOCK_SIZE_K) weight scales
    ExpertIds,  # (S,) — which expert each batch element routes to
    # Shape
    S,
    N,
    K,
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_bs_e,
    stride_bs_k,
    stride_bs_n,
    # Meta-parameters
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
):
    """Block-scale batched FP8 expert matmul kernel.

    Each program handles one routed token row and one N-tile, looks up the
    owning expert from ``ExpertIds``, and applies fused activation quantization.
    """
    batch_id = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)

    # Cast expert_id to int64 to prevent int32 overflow when computing
    # expert_id * stride_Eb (e.g. 255 * 9_437_184 > 2^31 for 256 experts of
    # 3072×3072 FP8 weights).
    expert_id = tl.load(ExpertIds + batch_id).to(tl.int64)

    # Rebase all pointers onto this token's row and its expert's weights.
    A = A + batch_id * stride_am
    B = B + expert_id * stride_be
    C = C + batch_id * stride_cm
    Bs = Bs + expert_id * stride_bs_e

    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # The `* 0` row term broadcasts the single token row across all
    # BLOCK_SIZE_M rows of the tile (every row loads the same activations).
    a_ptrs = A + tl.arange(0, BLOCK_SIZE_M)[:, None] * 0 + offs_k[None, :] * stride_ak
    b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)

    bs_ptrs = Bs + pid_n * stride_bs_n

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # ---- fused fp8_act_quant ----
        a_raw = tl.load(a_ptrs).to(tl.float32)
        a_s = tl.max(tl.abs(a_raw)) / 448.0
        # 1e-12 floor guards against division by zero for an all-zero block.
        a = (a_raw / tl.maximum(a_s, 1e-12)).to(tl.float8e4nv)
        # ---- matmul ----
        b = tl.load(b_ptrs)
        b_s = tl.load(bs_ptrs + k * stride_bs_k)
        accumulator += tl.dot(a, b) * a_s * b_s[None, :]
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    # Cast the fp32 accumulator to the output buffer's element type.
    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    offs_cm = tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    # All tile rows hold identical results (same broadcast activations), so
    # the `* 0` row term collapses the store onto this token's single C row.
    c_ptrs = C + offs_cm[:, None] * 0 + stride_cn * offs_cn[None, :]
    tl.store(c_ptrs, c)
107
+
108
+
109
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=w, num_stages=s)
        for w in [2, 4, 8, 16]
        for s in [2, 3, 4, 5]
    ],
    key=["N", "K"],
)
@triton.jit
def w8a8_tensor_fp8_matmul_batched_kernel(
    A,  # (S, K) pre-quantized FP8 activations
    B,  # (E, N, K) FP8 weight matrices
    C,  # (S, N) output
    As,  # (S, 1) per-tensor activation scales
    Bs,  # (E, 1, 1) per-tensor weight scales
    ExpertIds,
    S,
    N,
    K,
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_as_m,
    stride_bs_e,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
):
    """Tensor-scale batched FP8 expert matmul kernel.

    Activations are already quantized; the kernel applies per-token activation
    scales and per-expert tensor weight scales.
    """
    batch_id = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)

    # int64 avoids int32 overflow in expert_id * stride_be for large expert
    # counts (same rationale as the block-scale batched kernel).
    expert_id = tl.load(ExpertIds + batch_id).to(tl.int64)

    # Rebase all pointers onto this token's row and its expert's weights.
    A = A + batch_id * stride_am
    B = B + expert_id * stride_be
    C = C + batch_id * stride_cm
    Bs = Bs + expert_id * stride_bs_e

    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # The `* 0` row term broadcasts the single token row across all
    # BLOCK_SIZE_M rows of the tile.
    a_ptrs = A + tl.arange(0, BLOCK_SIZE_M)[:, None] * 0 + offs_k[None, :] * stride_ak
    b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)

    # Scales are constant over the K loop; load once up front.
    b_s = tl.load(Bs)
    a_s = tl.load(As + batch_id * stride_as_m)

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for _ in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        a = tl.load(a_ptrs)
        b = tl.load(b_ptrs)
        accumulator += tl.dot(a, b)
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    # Apply both scales once after the loop.
    accumulator = accumulator * a_s * b_s

    # Cast the fp32 accumulator to the output buffer's element type.
    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    offs_cm = tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    # `* 0` collapses the identical tile rows onto this token's single C row.
    c_ptrs = C + offs_cm[:, None] * 0 + stride_cn * offs_cn[None, :]
    tl.store(c_ptrs, c)
185
+
186
+
187
@triton_op("finegrained_fp8::w8a8_block_fp8_matmul_batched", mutates_args=())
def _w8a8_block_fp8_matmul_batched(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    expert_ids: torch.Tensor,
    block_size: list[int],
) -> torch.Tensor:
    """Block-scale batched FP8 matmul: C[s] = A[s] @ B[expert_ids[s]].T, with fused act quant.

    A: (S, K) raw bf16/fp16 activations
    B: (E, N, K) FP8 expert weights
    Bs: (E, N // block_n, K // block_k) per-block weight scales

    Returns a (S, N) tensor in A's dtype.
    """
    assert A.ndim == 2, f"A must be 2D (S, K), got ndim={A.ndim}"
    assert A.is_contiguous(), "A must be contiguous"
    assert B.ndim == 3, f"B must be 3D (E, N, K), got ndim={B.ndim}"
    assert B.is_contiguous(), "B must be contiguous"
    assert A.shape[1] == B.shape[2], (
        f"K mismatch: A has K={A.shape[1]}, B has K={B.shape[2]}"
    )

    S, K = A.shape
    E, N, _ = B.shape

    assert len(block_size) == 2, (
        f"block_size must be [block_n, block_k], got {block_size}"
    )
    block_n, block_k = block_size[0], block_size[1]
    # MoE expert dimensions must be block-aligned; non-aligned N/K is not supported.
    assert N % block_n == 0, f"N ({N}) must be divisible by block_n ({block_n})"
    assert K % block_k == 0, f"K ({K}) must be divisible by block_k ({block_k})"
    assert Bs.ndim == 3, (
        f"Bs must be 3D (E, N//block_n, K//block_k), got ndim={Bs.ndim}"
    )
    assert Bs.shape == (E, N // block_n, K // block_k), (
        f"Bs shape {tuple(Bs.shape)} != expected ({E}, {N // block_n}, {K // block_k})"
    )

    # Output keeps A's (raw activation) dtype.
    C = A.new_empty(S, N)
    # Adaptive BLOCK_SIZE_M: smallest power-of-2 >= M, floored at 16, capped at 128.
    # Matches the WGMMA tile to the actual row count — smaller tiles use less
    # register pressure and a better-matched FP8 WGMMA instruction, improving
    # both accuracy and performance for small M (decode).
    BLOCK_SIZE_M = min(max(triton.next_power_of_2((S + E - 1) // E), 16), 128)
    # One program per token row and per N-tile of the expert weight.
    grid = (S, triton.cdiv(N, block_n))
    with device_context(A.device):
        wrap_triton(w8a8_block_fp8_matmul_batched_kernel)[grid](
            A,
            B,
            C,
            Bs,
            expert_ids,
            S,
            N,
            K,
            A.stride(0),
            A.stride(1),
            B.stride(0),
            # Per-expert (N, K) strides in (k, n) order: the kernel reads the
            # expert weight transposed.
            B.stride(2),
            B.stride(1),
            C.stride(0),
            C.stride(1),
            Bs.stride(0),
            Bs.stride(2),
            Bs.stride(1),
            BLOCK_SIZE_N=block_n,
            BLOCK_SIZE_K=block_k,
            BLOCK_SIZE_M=BLOCK_SIZE_M,
        )

    return C
259
+
260
+
261
@triton_op("finegrained_fp8::w8a8_tensor_fp8_matmul_batched", mutates_args=())
def _w8a8_tensor_fp8_matmul_batched(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    expert_ids: torch.Tensor,
) -> torch.Tensor:
    """Tensor-scale batched FP8 matmul: C[s] = A[s] @ B[expert_ids[s]].T, with fused act quant.

    A: (S, K) raw bf16/fp16 activations
    B: (E, N, K) FP8 expert weights
    Bs: (E,) or (E, 1, 1) per-expert weight scales

    Returns a (S, N) tensor in A's dtype.
    """
    assert A.ndim == 2, f"A must be 2D (S, K), got ndim={A.ndim}"
    assert A.is_contiguous(), "A must be contiguous"
    assert B.ndim == 3, f"B must be 3D (E, N, K), got ndim={B.ndim}"
    assert B.is_contiguous(), "B must be contiguous"
    assert A.shape[1] == B.shape[2], (
        f"K mismatch: A has K={A.shape[1]}, B has K={B.shape[2]}"
    )

    S, K = A.shape
    E, N, _ = B.shape

    # Normalize Bs to (E, 1, 1)
    if Bs.ndim == 1:
        assert Bs.shape[0] == E, f"Bs shape {tuple(Bs.shape)} != expected ({E},)"
        Bs = Bs.reshape(E, 1, 1)
    else:
        assert Bs.shape == (E, 1, 1), (
            f"Bs shape {tuple(Bs.shape)} != expected ({E}, 1, 1)"
        )

    BLOCK_SIZE_N = 128
    BLOCK_SIZE_K = 128
    # Output keeps A's (raw activation) dtype.
    C = A.new_empty(S, N)
    # Quantize activations up front with block_size=K: one block spans the
    # whole row, i.e. one scale per token.
    qA, As = fp8_act_quant(A, K)
    # Adaptive BLOCK_SIZE_M: smallest power-of-2 >= M, floored at 16, capped at 128.
    # Matches the WGMMA tile to the actual row count — smaller tiles use less
    # register pressure and a better-matched FP8 WGMMA instruction, improving
    # both accuracy and performance for small M (decode).
    BLOCK_SIZE_M = min(max(triton.next_power_of_2((S + E - 1) // E), 16), 128)
    # One program per token row and per N-tile of the expert weight.
    # (Previously this grid was computed twice; the duplicate is removed.)
    grid = (S, triton.cdiv(N, BLOCK_SIZE_N))
    with device_context(A.device):
        wrap_triton(w8a8_tensor_fp8_matmul_batched_kernel)[grid](
            qA,
            B,
            C,
            As,
            Bs,
            expert_ids,
            S,
            N,
            K,
            qA.stride(0),
            qA.stride(1),
            B.stride(0),
            # Per-expert (N, K) strides in (k, n) order: the kernel reads the
            # expert weight transposed.
            B.stride(2),
            B.stride(1),
            C.stride(0),
            C.stride(1),
            As.stride(0),
            Bs.stride(0),
            BLOCK_SIZE_N=BLOCK_SIZE_N,
            BLOCK_SIZE_K=BLOCK_SIZE_K,
            BLOCK_SIZE_M=BLOCK_SIZE_M,
        )

    return C
331
+
332
+
333
def w8a8_block_fp8_matmul_batched(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    expert_ids: torch.Tensor,
    block_size: list[int],
) -> torch.Tensor:
    """Block-scale batched FP8 matmul with fused activation quantization.

    Each token row of ``A`` is multiplied against the expert weight selected
    by ``expert_ids``.

    A: (S, K) raw activations, bf16/fp16/fp32
    B: (E, N, K) FP8 expert weights
    Bs: (E, N // block_n, K // block_k) per-block weight scales
    """
    op = torch.ops.finegrained_fp8.w8a8_block_fp8_matmul_batched
    return op(A, B, Bs, expert_ids, block_size)
349
+
350
+
351
def w8a8_tensor_fp8_matmul_batched(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    expert_ids: torch.Tensor,
) -> torch.Tensor:
    """Tensor-scale batched FP8 matmul with fused activation quantization.

    Each token row of ``A`` is multiplied against the expert weight selected
    by ``expert_ids``, using one scale per expert.

    A: (S, K) raw activations, bf16/fp16/fp32
    B: (E, N, K) FP8 expert weights
    Bs: (E,) or (E, 1, 1) per-expert weight scales
    """
    op = torch.ops.finegrained_fp8.w8a8_tensor_fp8_matmul_batched
    return op(A, B, Bs, expert_ids)
366
+
367
+
368
def w8a8_fp8_matmul_batched(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    expert_ids: torch.Tensor,
    block_size: list[int] | None,
) -> torch.Tensor:
    """Unified batched W8A8 FP8 matmul dispatcher.

    Dispatch rules:
        - tensor mode when ``block_size is None``
        - tensor mode when ``block_size == [N, K]`` (one block per expert)
        - otherwise block mode

    Returns:
        Output tensor ``[S, N]`` in the same dtype as ``A``.
    """
    if block_size is not None:
        covers_whole_weight = (
            block_size[0] == B.size(1) and block_size[1] == B.size(2)
        )
        if not covers_whole_weight:
            return w8a8_block_fp8_matmul_batched(A, B, Bs, expert_ids, block_size)
    return w8a8_tensor_fp8_matmul_batched(A, B, Bs, expert_ids)
build/torch-xpu/finegrained_fp8/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
+ def _import_from_path(file_path: Path) -> ModuleType:
9
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
10
+ # it would also be used for other imports. So, we make a module name that
11
+ # depends on the path for it to be unique using the hex-encoded hash of
12
+ # the path.
13
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
+ module_name = path_hash
15
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
16
+ if spec is None:
17
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
+ module = importlib.util.module_from_spec(spec)
19
+ if module is None:
20
+ raise ImportError(f"Cannot load module {module_name} from spec")
21
+ sys.modules[module_name] = module
22
+ spec.loader.exec_module(module) # type: ignore
23
+ return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch-xpu/grouped.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import triton
17
+ import triton.language as tl
18
+ from .act_quant import fp8_act_quant
19
+ from torch.library import triton_op, wrap_triton
20
+
21
+ from .utils import device_context
22
+
23
+
24
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=w, num_stages=s)
        for w in [2, 4, 8, 16]
        for s in [2, 3, 4, 5]
    ],
    key=["N", "K", "BLOCK_SIZE_M"],
)
@triton.jit
def w8a8_block_fp8_matmul_grouped_kernel(
    A,  # (S, K) raw BF16/FP16 activations, sorted/grouped by expert id
    B,  # (E, N, K) FP8 weight matrices
    C,  # (S, N) output
    Bs,  # (E, N // BLOCK_SIZE_N, K // BLOCK_SIZE_K) weight scales
    Offsets,  # (E,) int32 — cumulative row-end per expert
    TileOffsets,  # (E,) int32 — cumulative tile-end per expert
    # Shape
    S,
    N,
    K,
    # Strides
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_bs_e,
    stride_bs_k,
    stride_bs_n,
    # Meta-parameters
    NUM_EXPERTS: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
    NUM_EXPERTS_BIT_LENGTH: tl.constexpr,
):
    """Block-scale grouped FP8 expert matmul kernel.

    Tokens are assumed sorted by expert. The kernel maps each M-tile to its
    owning expert via ``TileOffsets`` and applies fused activation quantization
    (a per-row dynamic FP8 scale computed inside the K loop) before each dot.
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)

    # Exit early for programs beyond the actual tile count. The host launches
    # an upper-bound grid, so trailing programs have no tile to work on.
    total_tiles = tl.load(TileOffsets + NUM_EXPERTS - 1)
    if pid_m >= total_tiles:
        return

    # Binary search in TileOffsets to find the owning expert.
    # Finds the smallest e such that TileOffsets[e] > pid_m (upper_bound semantics),
    # which is the expert whose tile range contains pid_m.
    # O(log2(NUM_EXPERTS)) loads instead of the O(NUM_EXPERTS) linear scan.
    # NUM_EXPERTS_BIT_LENGTH is ceil(log2(E))+1 for powers-of-two, giving one
    # harmless extra iteration when lo==hi; it's a compile-time constant so the
    # loop is fully unrolled by the compiler.
    lo = 0
    hi = NUM_EXPERTS
    for _ in tl.static_range(NUM_EXPERTS_BIT_LENGTH):
        mid = (lo + hi) >> 1
        mid_val = tl.load(TileOffsets + mid)
        is_left = mid_val <= pid_m
        lo = tl.where(is_left, mid + 1, lo)
        hi = tl.where(is_left, hi, mid)

    # Cast expert_id to int64 to prevent int32 overflow when computing
    # expert_id * stride_be (e.g. 255 * 9_437_184 > 2^31 for 256 experts of
    # 3072×3072 FP8 weights).
    expert_id = lo.to(tl.int64)

    # Row range [expert_start, expert_end) owned by this expert; expert 0 has
    # no predecessor, so its start is hard-wired to 0.
    prev_eid = tl.maximum(expert_id - 1, 0)
    expert_start = tl.where(expert_id == 0, 0, tl.load(Offsets + prev_eid))
    expert_end = tl.load(Offsets + expert_id)
    M_expert = expert_end - expert_start

    # Tile index local to this expert → row offset within the expert's rows.
    expert_tile_start = tl.where(expert_id == 0, 0, tl.load(TileOffsets + prev_eid))
    local_tile = pid_m - expert_tile_start
    m_off = local_tile * BLOCK_SIZE_M

    offs_am = m_off + tl.arange(0, BLOCK_SIZE_M)
    row_mask = offs_am < M_expert  # rows past the expert's end are masked out
    offs_global_m = expert_start + offs_am

    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    a_ptrs = A + offs_global_m[:, None] * stride_am + offs_k[None, :] * stride_ak
    b_ptrs = (
        B
        + expert_id * stride_be
        + offs_k[:, None] * stride_bk
        + offs_bn[None, :] * stride_bn
    )
    bs_ptrs = Bs + expert_id * stride_bs_e + pid_n * stride_bs_n

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # ---- fused fp8_act_quant ----
        # Per-row dynamic scale: 448.0 is the max magnitude representable in
        # FP8 E4M3; the 1e-12 clamp guards against all-zero rows.
        a_raw = tl.load(a_ptrs, mask=row_mask[:, None], other=0.0).to(tl.float32)
        a_s = tl.max(tl.abs(a_raw), axis=1) / 448.0
        a = (a_raw / tl.maximum(a_s[:, None], 1e-12)).to(tl.float8e4nv)
        # ---- matmul ----
        # B/Bs loads are unmasked: the host wrapper asserts N and K are exact
        # multiples of the block sizes.
        b = tl.load(b_ptrs)
        b_s = tl.load(bs_ptrs + k * stride_bs_k)
        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    # Cast the fp32 accumulator down to the output dtype.
    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    c_ptrs = C + stride_cm * offs_global_m[:, None] + stride_cn * offs_bn[None, :]
    c_mask = row_mask[:, None]
    tl.store(c_ptrs, c, mask=c_mask)
144
+
145
+
146
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=w, num_stages=s)
        for w in [2, 4, 8, 16]
        for s in [2, 3, 4, 5]
    ],
    key=["N", "K", "BLOCK_SIZE_M"],
)
@triton.jit
def w8a8_tensor_fp8_matmul_grouped_kernel(
    A,  # (S, K) pre-quantized FP8 activations, sorted/grouped by expert id
    B,  # (E, N, K) FP8 weight matrices
    C,  # (S, N) output
    As,  # (S, 1) activation scales
    Bs,  # (E, 1, 1) per-tensor weight scales
    Offsets,  # (E,) int32 — cumulative row-end per expert
    TileOffsets,  # (E,) int32 — cumulative tile-end per expert
    S,
    N,
    K,
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_as_m,
    stride_bs_e,
    NUM_EXPERTS: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
    NUM_EXPERTS_BIT_LENGTH: tl.constexpr,
):
    """Tensor-scale grouped FP8 expert matmul kernel.

    Uses grouped expert scheduling with pre-quantized activations plus
    per-token activation scales and per-expert tensor weight scales. Scales
    are applied once after the K loop since they are K-invariant.
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)

    # Exit early for grid programs beyond the real tile count (upper-bound grid).
    total_tiles = tl.load(TileOffsets + NUM_EXPERTS - 1)
    if pid_m >= total_tiles:
        return

    # upper_bound binary search in TileOffsets: smallest e with
    # TileOffsets[e] > pid_m is the expert owning this M-tile.
    lo = 0
    hi = NUM_EXPERTS
    for _ in tl.static_range(NUM_EXPERTS_BIT_LENGTH):
        mid = (lo + hi) >> 1
        mid_val = tl.load(TileOffsets + mid)
        is_left = mid_val <= pid_m
        lo = tl.where(is_left, mid + 1, lo)
        hi = tl.where(is_left, hi, mid)
    # int64 to avoid int32 overflow in expert_id * stride_be for large weights.
    expert_id = lo.to(tl.int64)

    # Row range owned by this expert; expert 0 starts at row 0.
    prev_eid = tl.maximum(expert_id - 1, 0)
    expert_start = tl.where(expert_id == 0, 0, tl.load(Offsets + prev_eid))
    expert_end = tl.load(Offsets + expert_id)
    M_expert = expert_end - expert_start

    expert_tile_start = tl.where(expert_id == 0, 0, tl.load(TileOffsets + prev_eid))
    local_tile = pid_m - expert_tile_start
    m_off = local_tile * BLOCK_SIZE_M

    offs_am = m_off + tl.arange(0, BLOCK_SIZE_M)
    row_mask = offs_am < M_expert  # rows past the expert's end are masked out
    offs_global_m = expert_start + offs_am

    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    a_ptrs = A + offs_global_m[:, None] * stride_am + offs_k[None, :] * stride_ak
    b_ptrs = (
        B
        + expert_id * stride_be
        + offs_k[:, None] * stride_bk
        + offs_bn[None, :] * stride_bn
    )

    # Per-row activation scale and one scalar weight scale for this expert.
    a_s = tl.load(As + offs_global_m * stride_as_m, mask=row_mask, other=0.0)
    b_s = tl.load(Bs + expert_id * stride_bs_e)

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for _ in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Loads are unmasked along K/N: host wrapper guarantees block alignment.
        a = tl.load(a_ptrs, mask=row_mask[:, None], other=0.0)
        b = tl.load(b_ptrs)

        accumulator += tl.dot(a, b)
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    # Apply both scales once after accumulation.
    accumulator = accumulator * a_s[:, None] * b_s

    # Cast fp32 accumulator to the output dtype.
    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    c_ptrs = C + stride_cm * offs_global_m[:, None] + stride_cn * offs_bn[None, :]
    c_mask = row_mask[:, None]
    tl.store(c_ptrs, c, mask=c_mask)
251
+
252
+
253
@triton_op("finegrained_fp8::w8a8_block_fp8_matmul_grouped", mutates_args=())
def _w8a8_block_fp8_matmul_grouped(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    offsets: torch.Tensor,
    tokens_per_expert: torch.Tensor,
    block_size: list[int],
) -> torch.Tensor:
    """Block-scale grouped FP8 matmul: C = A @ B.T per expert, with fused act quant.

    A: (S, K) raw bf16/fp16 activations, sorted by expert
    B: (E, N, K) FP8 expert weights
    Bs: (E, N // block_n, K // block_k) per-block weight scales
    offsets: (E,) cumulative row-end per expert (kernel reads it as int32 —
        presumably prepared by the caller; verify upstream)
    tokens_per_expert: (E,) row count per expert, used to size M-tiles
    block_size: [block_n, block_k]

    Returns:
        (S, N) tensor in A's dtype, rows in the same expert-sorted order as A.
    """
    assert A.ndim == 2, f"A must be 2D (S, K), got ndim={A.ndim}"
    assert A.is_contiguous(), "A must be contiguous"
    assert B.ndim == 3, f"B must be 3D (E, N, K), got ndim={B.ndim}"
    assert B.is_contiguous(), "B must be contiguous"
    assert A.shape[1] == B.shape[2], (
        f"K mismatch: A has K={A.shape[1]}, B has K={B.shape[2]}"
    )

    S, K = A.shape
    E, N, _ = B.shape

    assert len(block_size) == 2, (
        f"block_size must be [block_n, block_k], got {block_size}"
    )
    block_n, block_k = block_size[0], block_size[1]
    # MoE expert dimensions must be block-aligned; non-aligned N/K is not supported.
    assert N % block_n == 0, f"N ({N}) must be divisible by block_n ({block_n})"
    assert K % block_k == 0, f"K ({K}) must be divisible by block_k ({block_k})"
    assert Bs.ndim == 3, (
        f"Bs must be 3D (E, N//block_n, K//block_k), got ndim={Bs.ndim}"
    )
    assert Bs.shape == (E, N // block_n, K // block_k), (
        f"Bs shape {tuple(Bs.shape)} != expected ({E}, {N // block_n}, {K // block_k})"
    )

    C = A.new_empty(S, N)
    # Adaptive BLOCK_SIZE_M: match tile to average tokens per expert.
    BLOCK_SIZE_M = min(max(triton.next_power_of_2((S + E - 1) // E), 16), 128)
    tiles_per_expert = (tokens_per_expert + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
    tile_offsets = torch.cumsum(tiles_per_expert, dim=0).to(torch.int32)
    # Upper bound on M-tiles: sum_e ceil(M_e / BLOCK_M) <= ceil(S / BLOCK_M) + E.
    # Programs beyond the real tile count exit immediately via the early-return
    # guard inside the kernel. This is faster than syncing for the exact count
    # and keeps the grid size data-independent (cuda-graph / torch.compile safe).
    max_M_tiles = triton.cdiv(S, BLOCK_SIZE_M) + E
    grid = (max_M_tiles, triton.cdiv(N, block_n))
    with device_context(A.device):
        # NOTE: B is (E, N, K) but the kernel wants (K-major, N-minor) tiles,
        # so stride(2)/stride(1) are passed as stride_bk/stride_bn; likewise
        # Bs strides are swapped to (k, n) order.
        wrap_triton(w8a8_block_fp8_matmul_grouped_kernel)[grid](
            A,
            B,
            C,
            Bs,
            offsets,
            tile_offsets,
            S,
            N,
            K,
            A.stride(0),
            A.stride(1),
            B.stride(0),
            B.stride(2),
            B.stride(1),
            C.stride(0),
            C.stride(1),
            Bs.stride(0),
            Bs.stride(2),
            Bs.stride(1),
            # Meta-parameters
            NUM_EXPERTS=E,
            BLOCK_SIZE_N=block_n,
            BLOCK_SIZE_K=block_k,
            BLOCK_SIZE_M=BLOCK_SIZE_M,
            NUM_EXPERTS_BIT_LENGTH=E.bit_length(),
        )

    return C
334
+
335
+
336
@triton_op("finegrained_fp8::w8a8_tensor_fp8_matmul_grouped", mutates_args=())
def _w8a8_tensor_fp8_matmul_grouped(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    offsets: torch.Tensor,
    tokens_per_expert: torch.Tensor,
) -> torch.Tensor:
    """Tensor-scale grouped FP8 matmul: C = A @ B.T per expert, with fused act quant.

    A: (S, K) raw bf16/fp16 activations, sorted by expert
    B: (E, N, K) FP8 expert weights
    Bs: (E,) or (E, 1, 1) per-expert weight scales
    offsets: (E,) cumulative row-end per expert
    tokens_per_expert: (E,) row count per expert, used to size M-tiles

    Returns:
        (S, N) tensor in A's dtype, rows in the same expert-sorted order as A.
    """
    assert A.ndim == 2, f"A must be 2D (S, K), got ndim={A.ndim}"
    assert A.is_contiguous(), "A must be contiguous"
    assert B.ndim == 3, f"B must be 3D (E, N, K), got ndim={B.ndim}"
    assert B.is_contiguous(), "B must be contiguous"
    assert A.shape[1] == B.shape[2], (
        f"K mismatch: A has K={A.shape[1]}, B has K={B.shape[2]}"
    )

    S, K = A.shape
    E, N, _ = B.shape

    # Normalize Bs to (E, 1, 1)
    if Bs.ndim == 1:
        assert Bs.shape[0] == E, f"Bs shape {tuple(Bs.shape)} != expected ({E},)"
        Bs = Bs.reshape(E, 1, 1)
    else:
        assert Bs.shape == (E, 1, 1), (
            f"Bs shape {tuple(Bs.shape)} != expected ({E}, 1, 1)"
        )

    BLOCK_SIZE_N = 128
    BLOCK_SIZE_K = 128
    C = A.new_empty(S, N)
    # Quantize activations once up front (group size K → one scale per row).
    qA, As = fp8_act_quant(A, K)
    # Adaptive BLOCK_SIZE_M: match tile to average tokens per expert.
    BLOCK_SIZE_M = min(max(triton.next_power_of_2((S + E - 1) // E), 16), 128)
    tiles_per_expert = (tokens_per_expert + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
    tile_offsets = torch.cumsum(tiles_per_expert, dim=0).to(torch.int32)
    # Upper bound on M-tiles: sum_e ceil(M_e / BLOCK_M) <= ceil(S / BLOCK_M) + E.
    # Programs beyond the real tile count exit immediately via the early-return
    # guard inside the kernel. This is faster than syncing for the exact count
    # and keeps the grid size data-independent (cuda-graph / torch.compile safe).
    max_M_tiles = triton.cdiv(S, BLOCK_SIZE_M) + E
    grid = (max_M_tiles, triton.cdiv(N, BLOCK_SIZE_N))
    with device_context(A.device):
        # B strides are passed in (e, k, n) order because the kernel addresses
        # (K-major, N-minor) tiles of the (E, N, K) weight layout.
        wrap_triton(w8a8_tensor_fp8_matmul_grouped_kernel)[grid](
            qA,
            B,
            C,
            As,
            Bs,
            offsets,
            tile_offsets,
            S,
            N,
            K,
            qA.stride(0),
            qA.stride(1),
            B.stride(0),
            B.stride(2),
            B.stride(1),
            C.stride(0),
            C.stride(1),
            As.stride(0),
            Bs.stride(0),
            # Meta-parameters
            NUM_EXPERTS=E,
            BLOCK_SIZE_N=BLOCK_SIZE_N,
            BLOCK_SIZE_K=BLOCK_SIZE_K,
            BLOCK_SIZE_M=BLOCK_SIZE_M,
            NUM_EXPERTS_BIT_LENGTH=E.bit_length(),
        )

    return C
413
+
414
+
415
def w8a8_block_fp8_matmul_grouped(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    offsets: torch.Tensor,
    tokens_per_expert: torch.Tensor,
    block_size: list[int],
) -> torch.Tensor:
    """Public entry point for the block-scale grouped FP8 matmul.

    Dispatches through the registered ``finegrained_fp8`` custom op, with
    activation quantization fused into the kernel.

    A: (S, K) raw activations sorted by expert, bf16/fp16/fp32
    B: (E, N, K) FP8 expert weights
    Bs: (E, N // block_n, K // block_k) per-block weight scales
    """
    grouped_op = torch.ops.finegrained_fp8.w8a8_block_fp8_matmul_grouped
    return grouped_op(A, B, Bs, offsets, tokens_per_expert, block_size)
432
+
433
+
434
def w8a8_tensor_fp8_matmul_grouped(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    offsets: torch.Tensor,
    tokens_per_expert: torch.Tensor,
) -> torch.Tensor:
    """Public entry point for the tensor-scale grouped FP8 matmul.

    Dispatches through the registered ``finegrained_fp8`` custom op, with
    activation quantization fused into the op.

    A: (S, K) raw activations sorted by expert, bf16/fp16/fp32
    B: (E, N, K) FP8 expert weights
    Bs: (E,) or (E, 1, 1) per-expert weight scales
    """
    grouped_op = torch.ops.finegrained_fp8.w8a8_tensor_fp8_matmul_grouped
    return grouped_op(A, B, Bs, offsets, tokens_per_expert)
450
+
451
+
452
def w8a8_fp8_matmul_grouped(
    A: torch.Tensor,
    B: torch.Tensor,
    Bs: torch.Tensor,
    offsets: torch.Tensor,
    tokens_per_expert: torch.Tensor,
    block_size: list[int] | None,
) -> torch.Tensor:
    """Unified grouped W8A8 FP8 matmul dispatcher.

    Dispatch rules:
        - tensor mode when ``block_size is None``
        - tensor mode when ``block_size == [N, K]`` (one block spans the matrix)
        - otherwise block mode

    Returns:
        Output tensor ``[S, N]`` in the same dtype as ``A``, in expert-sorted order.
    """
    if block_size is None:
        whole_matrix = True
    else:
        # A block covering the full (N, K) weight is per-tensor scaling.
        whole_matrix = (block_size[0], block_size[1]) == (B.size(1), B.size(2))

    if whole_matrix:
        return w8a8_tensor_fp8_matmul_grouped(A, B, Bs, offsets, tokens_per_expert)
    return w8a8_block_fp8_matmul_grouped(
        A, B, Bs, offsets, tokens_per_expert, block_size
    )
build/torch-xpu/matmul.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import triton
17
+ import triton.language as tl
18
+ from torch.library import triton_op, wrap_triton
19
+
20
+ from .utils import device_context
21
+
22
+
23
+ # Adapted from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/layers/quantization/fp8_kernel.py
24
# Adapted from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/layers/quantization/fp8_kernel.py
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=w, num_stages=s)
        for w in [2, 4, 8, 16]
        for s in [2, 3, 4]
    ],
    key=["N", "K", "BLOCK_SIZE_M"],
)
@triton.jit
def w8a8_block_fp8_matmul_kernel(
    # Pointers to inputs and output
    A,  # (M, K) FP8 activations
    B,  # (N, K) FP8 weights
    C,  # (M, N) output
    As,  # (M, K // BLOCK_SIZE_K) activation scales
    Bs,  # (N // BLOCK_SIZE_N, K // BLOCK_SIZE_K) weight scales
    # Shape for matmul
    M,
    N,
    K,
    stride_am,
    stride_ak,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_as_m,
    stride_as_k,
    stride_bs_k,
    stride_bs_n,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """Block-scale FP8 GEMM kernel.

    Computes ``C = A @ B.T`` with block-wise activation/weight scales.
    Uses a 2D grid with swizzle for L2 cache locality on B tiles.
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M)

    # Wrap row/col offsets with % M / % N so partial tiles still load valid
    # memory; the store mask at the end discards the wrapped rows/cols.
    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = A + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)

    as_ptrs = As + offs_am * stride_as_m
    offs_bsn = offs_bn // BLOCK_SIZE_N  # map each output col to its scale block
    bs_ptrs = Bs + offs_bsn * stride_bs_n

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        k_remaining = K - k * BLOCK_SIZE_K
        a = tl.load(a_ptrs, mask=offs_k[None, :] < k_remaining, other=0.0)
        b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0)

        # Per-K-block scales: one per row of A and one per (n-block, k-block) of B.
        a_s = tl.load(as_ptrs + k * stride_as_k)
        b_s = tl.load(bs_ptrs + k * stride_bs_k)

        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    # Cast the fp32 accumulator to the output dtype.
    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = C + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
    tl.store(c_ptrs, c, mask=c_mask)
106
+
107
+
108
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=w, num_stages=s)
        for w in [2, 4, 8, 16]
        for s in [2, 3, 4]
    ],
    key=["N", "K", "BLOCK_SIZE_M"],
)
@triton.jit
def w8a8_tensor_fp8_matmul_kernel(
    A,  # (M, K) FP8 activations
    B,  # (N, K) FP8 weights
    C,  # (M, N) output
    As,  # (M,) per-row activation scales
    Bs,  # scalar weight scale
    M,
    N,
    K,
    stride_am,
    stride_ak,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_as_m,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """Tensor-scale FP8 GEMM kernel.

    Computes ``C = A @ B.T`` with one activation scale per row and one
    weight scale for the full matrix.
    Uses a 2D grid with swizzle for L2 cache locality on B tiles.
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M)

    # Wrap offsets with % M / % N so partial tiles load valid memory; the
    # store mask discards the wrapped rows/cols.
    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    a_ptrs = A + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
    b_ptrs = B + offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn

    # Scales are K-invariant, so load them once up front.
    a_s = tl.load(As + offs_am * stride_as_m)
    b_s = tl.load(Bs)

    # Accumulate raw dot products, apply scales once after the loop.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        k_remaining = K - k * BLOCK_SIZE_K
        a = tl.load(a_ptrs, mask=offs_k[None, :] < k_remaining, other=0.0)
        b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0)
        accumulator += tl.dot(a, b)
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    accumulator = accumulator * a_s[:, None] * b_s

    # Cast the fp32 accumulator to the output dtype.
    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = C + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
    tl.store(c_ptrs, c, mask=c_mask)
184
+
185
+
186
@triton_op("finegrained_fp8::w8a8_block_fp8_matmul", mutates_args=())
def _w8a8_block_fp8_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    block_size: list[int],
    output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """Block-scale FP8 matmul: C = A @ B.T with per-block scales.

    A: (..., K) pre-quantized FP8 activations (leading dims are flattened to M)
    B: (N, K) pre-quantized FP8 weights
    As: (M, K // block_k) — per-token-group activation scales
    Bs: (N // block_n, K // block_k) — per-block weight scales

    Returns:
        (..., N) tensor in ``output_dtype``.
    """
    assert len(block_size) == 2, (
        f"block_size must be [block_n, block_k], got {block_size}"
    )
    block_n, block_k = block_size[0], block_size[1]

    assert A.shape[-1] == B.shape[-1], (
        f"K mismatch: A has K={A.shape[-1]}, B has K={B.shape[-1]}"
    )
    assert A.is_contiguous(), "A must be contiguous"
    assert B.ndim == 2, f"B must be 2D (N, K), got ndim={B.ndim}"
    assert B.is_contiguous(), "B must be contiguous"

    N, K = B.shape
    M = A.numel() // A.shape[-1]  # flatten leading dims into the row count

    assert As.ndim >= 2, f"As must be at least 2D, got ndim={As.ndim}"
    assert As.shape[-1] == triton.cdiv(K, block_k), (
        f"As last dim {As.shape[-1]} != expected {triton.cdiv(K, block_k)} (cdiv(K={K}, block_k={block_k}))"
    )
    assert Bs.ndim == 2, f"Bs must be 2D (N//block_n, K//block_k), got ndim={Bs.ndim}"
    assert Bs.shape == (triton.cdiv(N, block_n), triton.cdiv(K, block_k)), (
        f"Bs shape {tuple(Bs.shape)} != expected ({triton.cdiv(N, block_n)}, {triton.cdiv(K, block_k)})"
    )

    BLOCK_SIZE_K = block_k
    BLOCK_SIZE_N = block_n
    C_shape = A.shape[:-1] + (N,)
    C = A.new_empty(C_shape, dtype=output_dtype)
    # Adaptive BLOCK_SIZE_M: smallest power-of-2 >= M, floored at 16, capped at 128.
    # Matches the WGMMA tile to the actual row count — smaller tiles use less
    # register pressure and a better-matched FP8 WGMMA instruction, improving
    # both accuracy and performance for small M (decode).
    BLOCK_SIZE_M = min(max(triton.next_power_of_2(M), 16), 128)
    grid = (triton.cdiv(M, BLOCK_SIZE_M), triton.cdiv(N, BLOCK_SIZE_N))
    with device_context(A.device):
        # B is (N, K); the kernel wants K-major addressing, hence
        # stride(1)/stride(0) passed as stride_bk/stride_bn (same for Bs).
        wrap_triton(w8a8_block_fp8_matmul_kernel)[grid](
            A,
            B,
            C,
            As,
            Bs,
            M,
            N,
            K,
            A.stride(-2),
            A.stride(-1),
            B.stride(1),
            B.stride(0),
            C.stride(-2),
            C.stride(-1),
            As.stride(-2),
            As.stride(-1),
            Bs.stride(1),
            Bs.stride(0),
            # Meta-parameters
            BLOCK_SIZE_M=BLOCK_SIZE_M,
            BLOCK_SIZE_N=BLOCK_SIZE_N,
            BLOCK_SIZE_K=BLOCK_SIZE_K,
            GROUP_SIZE_M=8,
        )

    return C
262
+
263
+
264
@triton_op("finegrained_fp8::w8a8_tensor_fp8_matmul", mutates_args=())
def _w8a8_tensor_fp8_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """Tensor-scale FP8 matmul: C = A @ B.T with per-row / per-tensor scales.

    A: (..., K) pre-quantized FP8 activations (leading dims flattened to M)
    B: (N, K) pre-quantized FP8 weights
    As: scalar, (M,), or (M, 1) — per-row activation scales
    Bs: scalar, (1,), or (1, 1) — single weight scale

    Returns:
        (..., N) tensor in ``output_dtype``.
    """
    assert A.shape[-1] == B.shape[-1], (
        f"K mismatch: A has K={A.shape[-1]}, B has K={B.shape[-1]}"
    )
    assert A.is_contiguous(), "A must be contiguous"
    assert B.ndim == 2, f"B must be 2D (N, K), got ndim={B.ndim}"
    assert B.is_contiguous(), "B must be contiguous"

    N, K = B.shape
    M = A.numel() // A.shape[-1]

    # Normalize As to (M,)
    if As.numel() == 1:
        # Single scale shared by all rows: broadcast, then materialize so the
        # kernel can index it with a real stride.
        As = As.reshape(1).expand(M).contiguous()
    elif As.ndim == 2:
        As = As.reshape(M)
    assert As.ndim == 1 and As.shape[0] == M, (
        f"As must be scalar, (M,), or (M,1) with M={M}, got {tuple(As.shape)}"
    )

    # Normalize Bs to (1,)
    assert Bs.numel() == 1, f"Bs must be scalar or (1,), got {tuple(Bs.shape)}"
    Bs = Bs.reshape(1)

    BLOCK_SIZE_N = 128
    BLOCK_SIZE_K = 128
    C_shape = A.shape[:-1] + (N,)
    C = A.new_empty(C_shape, dtype=output_dtype)
    # Adaptive BLOCK_SIZE_M: power-of-2 near M, clamped to [16, 128].
    BLOCK_SIZE_M = min(max(triton.next_power_of_2(M), 16), 128)
    grid = (triton.cdiv(M, BLOCK_SIZE_M), triton.cdiv(N, BLOCK_SIZE_N))
    with device_context(A.device):
        # B strides are swapped to K-major order for the kernel.
        wrap_triton(w8a8_tensor_fp8_matmul_kernel)[grid](
            A,
            B,
            C,
            As,
            Bs,
            M,
            N,
            K,
            A.stride(-2),
            A.stride(-1),
            B.stride(1),
            B.stride(0),
            C.stride(-2),
            C.stride(-1),
            As.stride(0),
            # Meta-parameters
            BLOCK_SIZE_M=BLOCK_SIZE_M,
            BLOCK_SIZE_N=BLOCK_SIZE_N,
            BLOCK_SIZE_K=BLOCK_SIZE_K,
            GROUP_SIZE_M=8,
        )

    return C
331
+
332
+
333
def w8a8_block_fp8_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    block_size: list[int],
    output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """Block-wise W8A8 FP8 matrix multiplication.

    Computes ``C = A @ B.T`` where both operands are pre-quantized to
    ``float8_e4m3fn`` with per-block scales; accumulation happens in float32
    before the final cast to ``output_dtype``.

    Args:
        A: Quantized activation tensor ``[M, K]`` in ``float8_e4m3fn``.
        B: Quantized weight tensor ``[N, K]`` in ``float8_e4m3fn``.
        As: Per-token-group activation scales ``[M, K // block_size[1]]``.
        Bs: Per-block weight scales ``[N // block_size[0], K // block_size[1]]``.
        block_size: ``[block_n, block_k]`` quantization block dimensions, e.g. ``[128, 128]``.
        output_dtype: dtype of the returned tensor (default: ``torch.float32``).

    Returns:
        Output tensor ``[M, N]`` in ``output_dtype``.
    """
    block_op = torch.ops.finegrained_fp8.w8a8_block_fp8_matmul
    return block_op(A, B, As, Bs, block_size, output_dtype)
361
+
362
+
363
def w8a8_tensor_fp8_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """Tensor-scale W8A8 FP8 matrix multiplication.

    Computes ``C = A @ B.T`` in tensor-scale mode from pre-quantized FP8
    activations/weights, one scale per activation row and a single scale for
    the whole weight matrix.

    Args:
        A: Quantized activation tensor ``[M, K]`` in ``float8_e4m3fn``.
        B: Quantized weight tensor ``[N, K]`` in ``float8_e4m3fn``.
        As: Per-row activation scales ``[M]``.
        Bs: Single weight scale, scalar or ``[1]``.
        output_dtype: dtype of the returned tensor.

    Returns:
        Output tensor ``[M, N]`` in ``output_dtype``.
    """
    tensor_op = torch.ops.finegrained_fp8.w8a8_tensor_fp8_matmul
    return tensor_op(A, B, As, Bs, output_dtype)
386
+
387
+
388
def w8a8_fp8_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    block_size: list[int] | None,
    output_dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """Unified W8A8 FP8 matmul dispatcher.

    Dispatch rules:
        - tensor mode when ``block_size is None``
        - tensor mode when ``block_size == [N, K]`` (one block spans the matrix)
        - otherwise block mode

    Returns:
        Output tensor ``[M, N]`` in ``output_dtype``.
    """
    if block_size is None:
        whole_matrix = True
    else:
        # A block covering the full (N, K) weight is per-tensor scaling.
        whole_matrix = (block_size[0], block_size[1]) == (B.size(0), B.size(1))

    if whole_matrix:
        return w8a8_tensor_fp8_matmul(A, B, As, Bs, output_dtype)
    return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype)
build/torch-xpu/metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 1,
3
+ "license": "Apache-2.0",
4
+ "python-depends": [],
5
+ "backend": {
6
+ "type": "xpu"
7
+ }
8
+ }
build/torch-xpu/utils.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from contextlib import contextmanager
3
+
4
+
5
@contextmanager
def device_context(device: torch.device):
    """Run the enclosed block with *device* active, backend-agnostically.

    Looks up the backend module named after ``device.type`` (``torch.cuda``,
    ``torch.xpu``, ...) and enters its ``device(...)`` context when one
    exists; otherwise the block simply runs with no device switch.
    """
    backend_module = getattr(torch, device.type, None)
    if backend_module is None or not hasattr(backend_module, "device"):
        yield
    else:
        with backend_module.device(device):
            yield