update repo

Browse files

Files changed (9) hide show

configuration_llada2uni_moe.py +3 -0
convert_experts_to_fp8.py +197 -0
convert_to_fp8_blockwise.py +266 -0
decoder-turbo/config.json +4 -4
decoder-turbo/model.safetensors +3 -0
decoder/config.json +4 -4
decoder/model.safetensors +3 -0
image_tokenizer/model.safetensors +3 -0
modeling_llada2uni_moe.py +134 -4

configuration_llada2uni_moe.py CHANGED Viewed

@@ -111,6 +111,9 @@ class LLaDA2MoeConfig(PretrainedConfig):
         self.moe_intermediate_size = moe_intermediate_size
         self.first_k_dense_replace = first_k_dense_replace
         self.output_router_logits = output_router_logits
         super().__init__(
             pad_token_id=pad_token_id,

         self.moe_intermediate_size = moe_intermediate_size
         self.first_k_dense_replace = first_k_dense_replace
         self.output_router_logits = output_router_logits
+        # FP8 quantization flag — set to True to use FP8Linear for experts
+        self.use_fp8_experts = kwargs.pop("use_fp8_experts", False)
         super().__init__(
             pad_token_id=pad_token_id,

convert_experts_to_fp8.py ADDED Viewed

	@@ -0,0 +1,197 @@

+#!/usr/bin/env python3
+"""
+Convert MoE expert weights from bf16 to fp8 with block-wise quantization.
+Professional FP8 quantization following the same approach as Qwen3.5-FP8:
+- Block-wise quantization with per-block scale (weight_scale_inv)
+- Only quantize expert Linear weight tensors (gate_proj, up_proj, down_proj)
+- Keep all other weights in bf16: embedding, lm_head, routing gates, layernorms,
+  attention projections, shared experts
+- Stores weight_scale_inv alongside each quantized weight
+Usage:
+    python convert_experts_to_fp8.py \
+        --input_dir /path/to/UniLLaDA \
+        --output_dir /path/to/UniLLaDA-FP8
+Then load with:
+    model = AutoModelForCausalLM.from_pretrained(
+        output_dir, device_map="cuda", torch_dtype="bfloat16", trust_remote_code=True
+    )
+    # config.json will have use_fp8_experts=true, so experts use FP8Linear automatically
+"""
+import os
+import re
+import json
+import argparse
+from collections import OrderedDict
+import torch
+from safetensors.torch import load_file, save_file
+from tqdm import tqdm
+FP8_MAX = torch.finfo(torch.float8_e4m3fn).max
+DEFAULT_BLOCK_SIZE = 128
+def is_expert_weight(name: str) -> bool:
+    """Tell whether ``name`` is a routed-expert projection weight.
+    Accepts keys shaped like ``model.layers.<i>.mlp.experts.<j>.<proj>.weight``
+    with ``<proj>`` one of gate_proj / up_proj / down_proj. The pattern requires
+    the literal ``mlp.experts.<j>`` segment, so ``shared_experts`` keys do not
+    match.
+    """
+    routed_expert_pattern = (
+        r"model\.layers\.\d+\.mlp\.experts\.\d+\.(gate_proj|up_proj|down_proj)\.weight$"
+    )
+    return re.match(routed_expert_pattern, name) is not None
+def quantize_blockwise(tensor: torch.Tensor, block_size: int = DEFAULT_BLOCK_SIZE):
+    """Quantize a 2D weight tensor to FP8 with block-wise scaling.
+    The weight is tiled into ``block_size x block_size`` blocks (zero-padded at
+    the ragged edges). Each block gets ``scale = absmax / FP8_MAX`` and is stored
+    as ``weight / scale``; dequantize with ``fp8.to(dtype) * scale_expanded``.
+    Args:
+        tensor: Weight tensor of shape (out_features, in_features)
+        block_size: Block size for quantization (default 128), must be >= 1
+    Returns:
+        fp8_tensor: Quantized tensor (float8_e4m3fn), same shape as input
+        scale_inv: Per-block scale (bfloat16), shape (ceil(out/bs), ceil(in/bs))
+    Raises:
+        ValueError: If ``tensor`` is not 2D or ``block_size`` is not positive.
+    """
+    # Explicit raises instead of ``assert`` so the checks survive ``python -O``.
+    if tensor.dim() != 2:
+        raise ValueError(f"Expected 2D tensor, got {tensor.dim()}D")
+    if block_size < 1:
+        raise ValueError(f"block_size must be >= 1, got {block_size}")
+    weight = tensor.float()
+    out_f, in_f = weight.shape
+    bs = block_size
+    n_bo = (out_f + bs - 1) // bs
+    n_bi = (in_f + bs - 1) // bs
+    # Pad for even blocking. new_zeros keeps the buffer on the weight's device,
+    # so the result is not silently moved to CPU when padding is needed.
+    if n_bo * bs != out_f or n_bi * bs != in_f:
+        padded = weight.new_zeros(n_bo * bs, n_bi * bs)
+        padded[:out_f, :in_f] = weight
+    else:
+        padded = weight
+    # Reshape into blocks: (n_bo, bs, n_bi, bs) -> (n_bo, n_bi, bs, bs)
+    blocks = padded.reshape(n_bo, bs, n_bi, bs).permute(0, 2, 1, 3)
+    # Per-block absmax; the clamp keeps all-zero blocks from dividing by zero.
+    absmax = blocks.abs().amax(dim=(-2, -1)).clamp_min(1e-12)  # (n_bo, n_bi)
+    # Round the scale to its bf16 storage dtype *before* quantizing, so values
+    # are divided by exactly the scale that dequantization later multiplies by.
+    scale_inv = (absmax / FP8_MAX).to(torch.bfloat16)
+    scale_exp = scale_inv.float()[:, :, None, None]  # (n_bo, n_bi, 1, 1)
+    # A scale that rounded down can push the block maximum just past FP8_MAX.
+    fp8_blocks = (blocks / scale_exp).clamp(-FP8_MAX, FP8_MAX)
+    # Reshape back: (n_bo, n_bi, bs, bs) -> (n_bo, bs, n_bi, bs) -> (H, W)
+    fp8_full = fp8_blocks.permute(0, 2, 1, 3).reshape(n_bo * bs, n_bi * bs)
+    fp8_tensor = fp8_full[:out_f, :in_f].to(torch.float8_e4m3fn)
+    return fp8_tensor, scale_inv
+def main():
+    """Convert a sharded bf16 checkpoint so that routed-expert weights are FP8.
+    Every shard named in ``model.safetensors.index.json`` under ``--input_dir``
+    is rewritten into ``--output_dir``: tensors accepted by ``is_expert_weight``
+    are replaced by their FP8 form plus a ``weight_scale_inv`` tensor, all other
+    tensors are copied unchanged. A new index and a ``config.json`` carrying
+    ``use_fp8_experts=True`` are written, and the remaining entries of the
+    input directory are symlinked into the output directory.
+    """
+    parser = argparse.ArgumentParser(
+        description="Convert UniLLaDA expert weights to FP8 (block-wise quantization)")
+    parser.add_argument("--input_dir", type=str, required=True,
+                        help="Path to original bf16 model directory")
+    parser.add_argument("--output_dir", type=str, required=True,
+                        help="Path to output FP8 model directory")
+    parser.add_argument("--block_size", type=int, default=DEFAULT_BLOCK_SIZE,
+                        help=f"Quantization block size (default: {DEFAULT_BLOCK_SIZE})")
+    args = parser.parse_args()
+    input_dir = os.path.abspath(args.input_dir)
+    output_dir = os.path.abspath(args.output_dir)
+    block_size = args.block_size
+    os.makedirs(output_dir, exist_ok=True)
+    # Load index
+    index_path = os.path.join(input_dir, "model.safetensors.index.json")
+    with open(index_path, encoding="utf-8") as f:
+        index = json.load(f)
+    # Only the shard file names are needed here; keys are read from each shard.
+    shard_files = sorted(set(index["weight_map"].values()))
+    new_weight_map = OrderedDict()
+    stats = {"expert": 0, "other": 0, "bytes_before": 0, "bytes_after": 0}
+    # Process each shard
+    for shard_file in tqdm(shard_files, desc="Converting shards"):
+        tensors = load_file(os.path.join(input_dir, shard_file), device="cpu")
+        new_tensors = OrderedDict()
+        for key in sorted(tensors):
+            tensor = tensors[key]
+            old_bytes = tensor.nelement() * tensor.element_size()
+            stats["bytes_before"] += old_bytes
+            if is_expert_weight(key):
+                fp8_tensor, scale_inv = quantize_blockwise(tensor, block_size)
+                new_tensors[key] = fp8_tensor
+                scale_key = key.replace(".weight", ".weight_scale_inv")
+                new_tensors[scale_key] = scale_inv
+                new_bytes = (fp8_tensor.nelement() * fp8_tensor.element_size() +
+                           scale_inv.nelement() * scale_inv.element_size())
+                stats["bytes_after"] += new_bytes
+                stats["expert"] += 1
+                new_weight_map[key] = shard_file
+                new_weight_map[scale_key] = shard_file
+            else:
+                new_tensors[key] = tensor
+                stats["bytes_after"] += old_bytes
+                stats["other"] += 1
+                new_weight_map[key] = shard_file
+        save_file(new_tensors, os.path.join(output_dir, shard_file))
+        # Drop the shard before loading the next one to keep peak memory down.
+        del tensors, new_tensors
+    # Save new index
+    new_index = {"metadata": index.get("metadata", {}), "weight_map": dict(new_weight_map)}
+    with open(os.path.join(output_dir, "model.safetensors.index.json"), "w",
+              encoding="utf-8") as f:
+        json.dump(new_index, f, indent=2)
+    # Update config.json: add use_fp8_experts=true (and nothing else).
+    with open(os.path.join(input_dir, "config.json"), encoding="utf-8") as f:
+        config = json.load(f)
+    config["use_fp8_experts"] = True
+    # Note: we do NOT add quantization_config here because transformers' built-in
+    # FP8 quantizer would conflict with our custom FP8Linear class.
+    # The use_fp8_experts flag is handled by our modeling code directly.
+    with open(os.path.join(output_dir, "config.json"), "w", encoding="utf-8") as f:
+        json.dump(config, f, indent=2)
+    # Symlink everything else (decoder, vae, tokenizer, code files...)
+    for fname in os.listdir(input_dir):
+        if fname.startswith("model-") and fname.endswith(".safetensors"):
+            continue
+        if fname in ("model.safetensors.index.json", "config.json"):
+            continue
+        src = os.path.join(input_dir, fname)
+        dst = os.path.join(output_dir, fname)
+        # lexists also sees dangling symlinks left by an earlier run; plain
+        # exists() reports them as missing and os.symlink then fails.
+        if os.path.lexists(dst):
+            continue
+        os.symlink(src, dst)
+    gb_b = stats["bytes_before"] / 1024**3
+    gb_a = stats["bytes_after"] / 1024**3
+    # An empty checkpoint has zero bytes; avoid dividing by it in the summary.
+    saved_pct = (1 - gb_a / gb_b) * 100 if gb_b else 0.0
+    print(f"\n{'='*60}")
+    print("✅ Block-wise FP8 conversion complete!")
+    print(f"   Block size: {block_size}x{block_size}")
+    print(f"   Expert tensors quantized: {stats['expert']}")
+    print(f"   Other tensors (kept bf16): {stats['other']}")
+    print(f"   Weights: {gb_b:.2f} GB → {gb_a:.2f} GB "
+          f"(saved {gb_b-gb_a:.2f} GB, -{saved_pct:.1f}%)")
+    print(f"   Output: {output_dir}")
+    print(f"{'='*60}")
+if __name__ == "__main__":
+    main()

convert_to_fp8_blockwise.py ADDED Viewed

	@@ -0,0 +1,266 @@

+#!/usr/bin/env python3
+"""
+Convert UniLLaDA MoE backbone weights to FP8 (block-wise quantization).
+Professional FP8 quantization following the same approach as Qwen3.5-FP8:
+- Block-wise quantization with per-block scale (weight_scale_inv)
+- Only quantize Linear weight tensors (experts, shared experts, attention projections)
+- Keep sensitive layers in bf16: embedding, lm_head, routing gates, layernorms
+- Store quantization_config in config.json for framework compatibility
+Usage:
+    python convert_to_fp8_blockwise.py \
+        --input_dir /path/to/UniLLaDA \
+        --output_dir /path/to/UniLLaDA-FP8
+The output can be loaded with the SAME modeling code (no changes needed):
+    model = AutoModelForCausalLM.from_pretrained(output_dir, ...)
+"""
+import os
+import re
+import json
+import argparse
+from collections import OrderedDict
+import torch
+from safetensors.torch import load_file, save_file
+from tqdm import tqdm
+FP8_MAX = torch.finfo(torch.float8_e4m3fn).max
+BLOCK_SIZE = 128  # quantization block size (128x128)
+def should_quantize(name: str) -> bool:
+    """Decide from its name whether a tensor gets FP8 block quantization.
+    Only ``.weight`` tensors are candidates. Embedding, lm_head, routing-gate
+    and norm weights are rejected first; of the rest, only routed-expert,
+    shared-expert, attention and dense-MLP projection weights are accepted.
+    Anything matching neither list is left unquantized.
+    """
+    # Biases, scales and buffers are never quantized.
+    if not name.endswith(".weight"):
+        return False
+    never_quantize = (
+        r"word_embeddings\.weight$",        # embedding
+        r"lm_head\.weight$",                 # output head
+        r"\.gate\.weight$",                  # routing gate
+        r"layernorm\.weight$",               # QK layernorm
+        r"input_layernorm\.weight$",         # layer norm
+        r"post_attention_layernorm\.weight$", # layer norm
+        r"norm\.weight$",                    # final norm
+    )
+    if any(re.search(pattern, name) for pattern in never_quantize):
+        return False
+    linear_projections = (
+        r"experts\.\d+\.(gate_proj|up_proj|down_proj)\.weight$",
+        r"shared_experts\.(gate_proj|up_proj|down_proj)\.weight$",
+        r"attention\.(query_key_value|dense)\.weight$",
+        r"mlp\.(gate_proj|up_proj|down_proj)\.weight$",  # dense layer (layer 0)
+    )
+    return any(re.search(pattern, name) for pattern in linear_projections)
+def quantize_tensor_blockwise(tensor: torch.Tensor, block_size: int = BLOCK_SIZE):
+    """Quantize a 2D weight tensor to FP8 with block-wise scaling.
+    Args:
+        tensor: Weight tensor of shape (out_features, in_features), dtype bf16/fp32
+        block_size: Block size for quantization (default 128), must be >= 1
+    Returns:
+        fp8_tensor: Quantized tensor (float8_e4m3fn), same shape as input
+        scale_inv: Per-block dequantization scale (bf16), shape
+            (ceil(out/block), ceil(in/block)). Despite the name it is the
+            multiplier: ``real = fp8.float() * scale_inv_expanded``.
+    Raises:
+        ValueError: If ``tensor`` is not 2D or ``block_size`` is not positive.
+    """
+    # Explicit raises instead of ``assert`` so the checks survive ``python -O``.
+    if tensor.dim() != 2:
+        raise ValueError(f"Expected 2D tensor, got {tensor.dim()}D")
+    if block_size < 1:
+        raise ValueError(f"block_size must be >= 1, got {block_size}")
+    out_features, in_features = tensor.shape
+    weight = tensor.float()
+    # Pad if needed so both dimensions are whole multiples of block_size
+    pad_out = (block_size - out_features % block_size) % block_size
+    pad_in = (block_size - in_features % block_size) % block_size
+    if pad_out > 0 or pad_in > 0:
+        padded = weight.new_zeros(out_features + pad_out, in_features + pad_in)
+        padded[:out_features, :in_features] = weight
+    else:
+        padded = weight
+    n_blocks_out = padded.shape[0] // block_size
+    n_blocks_in = padded.shape[1] // block_size
+    # Reshape into blocks
+    blocks = padded.reshape(n_blocks_out, block_size, n_blocks_in, block_size)
+    blocks = blocks.permute(0, 2, 1, 3)  # (n_out, n_in, block, block)
+    # Compute per-block absmax
+    absmax = blocks.abs().amax(dim=(-2, -1))  # (n_out, n_in)
+    # Convention used throughout this script:
+    #   scale   = absmax / FP8_MAX
+    #   fp8     = weight / scale      (maps each block onto [-FP8_MAX, FP8_MAX])
+    #   dequant = fp8 * scale
+    # The scale is saved under the key ``weight_scale_inv``, hence the name
+    # ``scale_inv`` even though it is what dequantization multiplies by.
+    scale = (absmax / FP8_MAX).clamp_min(1e-12)  # avoid division by zero
+    # Round to the bf16 storage dtype *before* quantizing, so values are divided
+    # by exactly the scale that dequantization later multiplies by.
+    scale_inv = scale.to(torch.bfloat16)
+    scale_expanded = scale_inv.float()[:, :, None, None]  # (n_out, n_in, 1, 1)
+    # A scale that rounded down can push the block maximum just past FP8_MAX.
+    fp8_blocks = (blocks / scale_expanded).clamp(-FP8_MAX, FP8_MAX)
+    # Reshape back
+    fp8_blocks = fp8_blocks.permute(0, 2, 1, 3)  # (n_out, block, n_in, block)
+    fp8_full = fp8_blocks.reshape(padded.shape[0], padded.shape[1])
+    # Trim padding
+    fp8_tensor = fp8_full[:out_features, :in_features].to(torch.float8_e4m3fn)
+    return fp8_tensor, scale_inv
+def build_modules_to_not_convert(weight_map: dict) -> list:
+    """Return the sorted, de-duplicated module names of all non-quantized keys.
+    For every key rejected by ``should_quantize`` the module name is the key
+    without its trailing ``.weight``; keys with another ending lose their last
+    dot-separated component instead.
+    """
+    suffix = ".weight"
+    skipped_modules = {
+        key[:-len(suffix)] if key.endswith(suffix) else key.rsplit(".", 1)[0]
+        for key in weight_map
+        if not should_quantize(key)
+    }
+    return sorted(skipped_modules)
+def main():
+    """Convert a sharded bf16 checkpoint to block-wise FP8 Linear weights.
+    Every shard named in ``model.safetensors.index.json`` under ``--input_dir``
+    is rewritten into ``--output_dir``: 2D tensors accepted by
+    ``should_quantize`` are replaced by their FP8 form plus a
+    ``weight_scale_inv`` tensor, all other tensors are copied unchanged. A new
+    index and a ``config.json`` with a ``quantization_config`` entry are
+    written, and the remaining entries of the input directory are symlinked
+    into the output directory.
+    """
+    parser = argparse.ArgumentParser(description="Convert UniLLaDA to FP8 (block-wise)")
+    parser.add_argument("--input_dir", type=str, required=True,
+                       help="Path to original bf16 model directory")
+    parser.add_argument("--output_dir", type=str, required=True,
+                       help="Path to output FP8 model directory")
+    # Share the default with quantize_tensor_blockwise instead of repeating 128.
+    parser.add_argument("--block_size", type=int, default=BLOCK_SIZE,
+                       help=f"Quantization block size (default: {BLOCK_SIZE})")
+    args = parser.parse_args()
+    input_dir = os.path.abspath(args.input_dir)
+    output_dir = os.path.abspath(args.output_dir)
+    block_size = args.block_size
+    os.makedirs(output_dir, exist_ok=True)
+    # Load index
+    index_path = os.path.join(input_dir, "model.safetensors.index.json")
+    with open(index_path, encoding="utf-8") as f:
+        index = json.load(f)
+    weight_map = index["weight_map"]
+    # Only the shard file names are needed here; keys are read from each shard.
+    shard_files = sorted(set(weight_map.values()))
+    new_weight_map = OrderedDict()
+    stats = {"quantized": 0, "kept_bf16": 0, "bytes_before": 0, "bytes_after": 0}
+    weight_suffix = ".weight"
+    # Process each shard
+    for shard_file in tqdm(shard_files, desc="Converting shards"):
+        tensors = load_file(os.path.join(input_dir, shard_file), device="cpu")
+        new_tensors = OrderedDict()
+        for key in sorted(tensors):
+            tensor = tensors[key]
+            old_bytes = tensor.nelement() * tensor.element_size()
+            stats["bytes_before"] += old_bytes
+            if should_quantize(key) and tensor.dim() == 2:
+                fp8_tensor, scale_inv = quantize_tensor_blockwise(tensor, block_size)
+                new_tensors[key] = fp8_tensor
+                # Swap only the trailing ".weight": should_quantize matches with
+                # re.search, so str.replace could also rewrite an earlier
+                # ".weight" substring in the key.
+                scale_key = key[:-len(weight_suffix)] + ".weight_scale_inv"
+                new_tensors[scale_key] = scale_inv
+                new_bytes = fp8_tensor.nelement() * fp8_tensor.element_size() + \
+                           scale_inv.nelement() * scale_inv.element_size()
+                stats["bytes_after"] += new_bytes
+                stats["quantized"] += 1
+                new_weight_map[key] = shard_file
+                new_weight_map[scale_key] = shard_file
+            else:
+                new_tensors[key] = tensor
+                stats["bytes_after"] += old_bytes
+                stats["kept_bf16"] += 1
+                new_weight_map[key] = shard_file
+        save_file(new_tensors, os.path.join(output_dir, shard_file))
+        # Drop the shard before loading the next one to keep peak memory down.
+        del tensors, new_tensors
+    # Save new index
+    new_index = {"metadata": index.get("metadata", {}), "weight_map": dict(new_weight_map)}
+    with open(os.path.join(output_dir, "model.safetensors.index.json"), "w",
+              encoding="utf-8") as f:
+        json.dump(new_index, f, indent=2)
+    # Build quantization config (following Qwen's format)
+    not_convert_modules = build_modules_to_not_convert(weight_map)
+    # Load and modify config.json
+    with open(os.path.join(input_dir, "config.json"), encoding="utf-8") as f:
+        config = json.load(f)
+    config["quantization_config"] = {
+        "quant_method": "fp8",
+        "activation_scheme": "dynamic",
+        "weight_per_tensor": False,
+        "act_per_tensor": False,
+        "weight_block_size": [block_size, block_size],
+        "modules_to_not_convert": not_convert_modules
+    }
+    with open(os.path.join(output_dir, "config.json"), "w", encoding="utf-8") as f:
+        json.dump(config, f, indent=2)
+    # Symlink everything else (code files, tokenizer, decoder, vae, etc.)
+    for fname in os.listdir(input_dir):
+        if fname.startswith("model-") and fname.endswith(".safetensors"):
+            continue
+        if fname in ("model.safetensors.index.json", "config.json"):
+            continue
+        src = os.path.join(input_dir, fname)
+        dst = os.path.join(output_dir, fname)
+        # lexists also sees dangling symlinks left by an earlier run; plain
+        # exists() reports them as missing and os.symlink then fails.
+        if os.path.lexists(dst):
+            continue
+        os.symlink(src, dst)
+    # Print summary
+    gb_b = stats["bytes_before"] / 1024**3
+    gb_a = stats["bytes_after"] / 1024**3
+    # An empty checkpoint has zero bytes; avoid dividing by it in the summary.
+    saved_pct = (1 - gb_a / gb_b) * 100 if gb_b else 0.0
+    print(f"\n{'='*60}")
+    print("✅ Block-wise FP8 conversion complete!")
+    print(f"   Block size: {block_size}x{block_size}")
+    print(f"   Quantized tensors: {stats['quantized']}")
+    print(f"   Kept bf16 tensors: {stats['kept_bf16']}")
+    print(f"   Weights: {gb_b:.2f} GB → {gb_a:.2f} GB (saved {gb_b-gb_a:.2f} GB, -{saved_pct:.1f}%)")
+    print(f"   Output: {output_dir}")
+    print(f"{'='*60}")
+if __name__ == "__main__":
+    main()

decoder-turbo/config.json CHANGED Viewed

@@ -13,11 +13,11 @@
     48
   ],
   "axes_lens": [
-    1536,
-    512,
-    512
   ],
-  "cap_feat_dim": 2560,
   "dim": 3840,
   "in_channels": 16,
   "n_heads": 30,

     48
   ],
   "axes_lens": [
+    32768,
+    1024,
+    1024
   ],
+  "cap_feat_dim": 4096,
   "dim": 3840,
   "in_channels": 16,
   "n_heads": 30,

decoder-turbo/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7589be2b548a3c1ef7e81431781e79c216ff3ae6e4f84c91397feeefef7d36dc
+size 6160866440

decoder/config.json CHANGED Viewed

@@ -13,11 +13,11 @@
     48
   ],
   "axes_lens": [
-    1536,
-    512,
-    512
   ],
-  "cap_feat_dim": 2560,
   "dim": 3840,
   "in_channels": 16,
   "n_heads": 30,

     48
   ],
   "axes_lens": [
+    32768,
+    1024,
+    1024
   ],
+  "cap_feat_dim": 4096,
   "dim": 3840,
   "in_channels": 16,
   "n_heads": 30,

decoder/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ace13533ec063e0a1edb1b9819546e6b5bf79f23cf759aece3d0bccfd7f62933
+size 6160866440

image_tokenizer/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e0a11a82ad221ac1f3b917abfce31ffaaec3571200ae7ee5318a223ff2eedc49
+size 2398968416

modeling_llada2uni_moe.py CHANGED Viewed

@@ -339,21 +339,129 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     return q_embed, k_embed
 class LLaDA2MoeMLP(nn.Module):
-    def __init__(self, config: LLaDA2MoeConfig, intermediate_size: int):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
         self.intermediate_size = intermediate_size
-        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
         self.act_fn = ACT2FN[config.hidden_act]
     def forward(self, x):
         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
 class LLaDA2MoeGate(nn.Module):
     def __init__(self, config):
@@ -446,16 +554,24 @@ class LLaDA2MoeSparseMoeBlock(nn.Module):
             )
     def _setup_experts(self):
         self.experts = nn.ModuleList(
             [
                 LLaDA2MoeMLP(
                     config=self.config,
                     intermediate_size=self.config.moe_intermediate_size,
                 )
                 for _ in range(self.config.num_experts)
             ]
         )
     def forward(self, hidden_states):
         identity = hidden_states
         bsz, seq_len, h = hidden_states.shape
@@ -1109,6 +1225,20 @@ class LLaDA2MoeModelLM(LLaDA2MoePreTrainedModel, GenerationMixin):
     def set_decoder(self, decoder):
         self.model = decoder
     def get_decoder(self):
         return self.model

     return q_embed, k_embed
+class FP8Linear(nn.Module):
+    """Drop-in replacement for nn.Linear that stores weights in float8_e4m3fn.
+    The weight is kept as ``float8_e4m3fn`` together with a block-wise
+    ``weight_scale_inv`` parameter of shape ``(ceil(out/block), ceil(in/block))``.
+    During ``forward`` the weight is dequantized to the input's dtype on the fly:
+    ``real_weight = fp8_weight * scale_expanded``. The scale is always applied;
+    there is no scale-free (per-tensor) mode.
+    Storage is one byte per weight element plus the per-block scales, and the
+    matmul runs through ``F.linear`` — no custom CUDA kernel needed.
+    """
+    def __init__(self, in_features: int, out_features: int, bias: bool = False,
+                 block_size: int = 128):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.block_size = block_size
+        # Placeholder – will be overwritten by state-dict loading
+        self.weight = nn.Parameter(
+            torch.empty(out_features, in_features, dtype=torch.float8_e4m3fn),
+            requires_grad=False,
+        )
+        # Block-wise scale — stored as a Parameter so from_pretrained can load it
+        n_bo = (out_features + block_size - 1) // block_size
+        n_bi = (in_features + block_size - 1) // block_size
+        self.weight_scale_inv = nn.Parameter(
+            torch.empty(n_bo, n_bi, dtype=torch.bfloat16), requires_grad=False
+        )
+        if bias:
+            self.bias = nn.Parameter(torch.zeros(out_features))
+        else:
+            self.bias = None
+    def _dequantize_weight(self, dtype: torch.dtype) -> torch.Tensor:
+        """Return the weight in ``dtype`` with the per-block scales applied."""
+        w = self.weight.to(dtype)
+        scale = self.weight_scale_inv.to(dtype)  # (n_blocks_out, n_blocks_in)
+        bs = self.block_size
+        # Expand each per-block scale to cover its bs x bs block of the weight
+        scale_expanded = scale.repeat_interleave(bs, dim=0).repeat_interleave(bs, dim=1)
+        # Trim to actual weight shape (edge blocks were padded during quantization)
+        scale_expanded = scale_expanded[:self.out_features, :self.in_features]
+        return w * scale_expanded
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the linear map using a weight dequantized to ``x.dtype``."""
+        return F.linear(x, self._dequantize_weight(x.dtype), self.bias)
+    @classmethod
+    def from_linear(cls, linear: nn.Linear, block_size: int = 128) -> "FP8Linear":
+        """Convert a regular nn.Linear to FP8Linear with block-wise quantization.
+        Each ``block_size x block_size`` block of the weight gets
+        ``scale = absmax / fp8_max`` and is stored as ``weight / scale``. The
+        new parameters stay on the device of ``linear.weight``.
+        """
+        fp8_mod = cls(linear.in_features, linear.out_features,
+                      bias=linear.bias is not None, block_size=block_size)
+        weight = linear.weight.data.float()
+        out_f, in_f = weight.shape
+        bs = block_size
+        n_bo = (out_f + bs - 1) // bs
+        n_bi = (in_f + bs - 1) // bs
+        fp8_max = torch.finfo(torch.float8_e4m3fn).max
+        # Pad weight for even blocking. new_zeros allocates on the weight's own
+        # device; a default torch.zeros buffer would leave the quantized
+        # parameters on CPU whenever padding is needed.
+        if n_bo * bs != out_f or n_bi * bs != in_f:
+            padded = weight.new_zeros(n_bo * bs, n_bi * bs)
+            padded[:out_f, :in_f] = weight
+        else:
+            padded = weight
+        blocks = padded.reshape(n_bo, bs, n_bi, bs).permute(0, 2, 1, 3)
+        # The clamp keeps all-zero blocks from dividing by zero.
+        absmax = blocks.abs().amax(dim=(-2, -1)).clamp_min(1e-12)  # (n_bo, n_bi)
+        # Round the scale to its bf16 storage dtype *before* quantizing, so
+        # _dequantize_weight multiplies by exactly the scale used here.
+        scale = (absmax / fp8_max).to(torch.bfloat16)
+        scale_exp = scale.float()[:, :, None, None]
+        # A scale that rounded down can push the block maximum just past fp8_max.
+        fp8_blocks = (blocks / scale_exp).clamp(-fp8_max, fp8_max)
+        fp8_full = fp8_blocks.permute(0, 2, 1, 3).reshape(n_bo * bs, n_bi * bs)
+        fp8_weight = fp8_full[:out_f, :in_f].to(torch.float8_e4m3fn)
+        fp8_mod.weight = nn.Parameter(fp8_weight, requires_grad=False)
+        fp8_mod.weight_scale_inv = nn.Parameter(scale, requires_grad=False)
+        if linear.bias is not None:
+            fp8_mod.bias = nn.Parameter(linear.bias.data.clone())
+        return fp8_mod
+    def extra_repr(self) -> str:
+        """Describe the layer's shape, bias and storage for ``repr``."""
+        has_scale = self.weight_scale_inv.numel() > 0
+        return (f"in_features={self.in_features}, out_features={self.out_features}, "
+                f"bias={self.bias is not None}, dtype=float8_e4m3fn, "
+                f"block_scale={'yes' if has_scale else 'no'}")
 class LLaDA2MoeMLP(nn.Module):
+    # Gated MLP: down_proj(act_fn(gate_proj(x)) * up_proj(x)). With use_fp8=True
+    # the three projections are built as FP8Linear instead of nn.Linear.
+    def __init__(self, config: LLaDA2MoeConfig, intermediate_size: int, use_fp8: bool = False):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
         self.intermediate_size = intermediate_size
+        linear_cls = FP8Linear if use_fp8 else nn.Linear
+        self.gate_proj = linear_cls(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = linear_cls(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = linear_cls(self.intermediate_size, self.hidden_size, bias=False)
         self.act_fn = ACT2FN[config.hidden_act]
     def forward(self, x):
         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+    def to_fp8(self) -> "LLaDA2MoeMLP":
+        """Convert all Linear layers in this MLP to FP8Linear (in-place).
+        Projections that already are ``FP8Linear`` are left untouched:
+        ``FP8Linear.from_linear`` would read their raw FP8 codes as if they were
+        real weights and re-quantize them, discarding the stored block scales.
+        """
+        for proj_name in ("gate_proj", "up_proj", "down_proj"):
+            proj = getattr(self, proj_name)
+            if not isinstance(proj, FP8Linear):
+                setattr(self, proj_name, FP8Linear.from_linear(proj))
+        return self
 class LLaDA2MoeGate(nn.Module):
     def __init__(self, config):
             )
     def _setup_experts(self):
+        use_fp8 = getattr(self.config, "use_fp8_experts", False)
         self.experts = nn.ModuleList(
             [
                 LLaDA2MoeMLP(
                     config=self.config,
                     intermediate_size=self.config.moe_intermediate_size,
+                    use_fp8=use_fp8,
                 )
                 for _ in range(self.config.num_experts)
             ]
         )
+    def convert_experts_to_fp8(self):
+        """Switch every routed expert of this block to FP8 storage, in place.
+        Calls ``to_fp8()`` on each entry of ``self.experts`` (intended for use
+        after bf16 weights have been loaded) and returns ``self``.
+        """
+        for expert_mlp in self.experts:
+            expert_mlp.to_fp8()
+        return self
     def forward(self, hidden_states):
         identity = hidden_states
         bsz, seq_len, h = hidden_states.shape
     def set_decoder(self, decoder):
         self.model = decoder
+    def convert_experts_to_fp8(self):
+        """Move the routed experts of every MoE layer to FP8 storage (in-place).
+        Layers whose ``mlp`` has no ``convert_experts_to_fp8`` method are
+        skipped. Afterwards the CUDA caching allocator is asked to release
+        unused blocks. Typical use, right after loading::
+            model = AutoModelForCausalLM.from_pretrained(...)
+            model.convert_experts_to_fp8()
+        Returns ``self``.
+        """
+        for decoder_layer in self.model.layers:
+            mlp = decoder_layer.mlp
+            if not hasattr(mlp, "convert_experts_to_fp8"):
+                continue
+            mlp.convert_experts_to_fp8()
+        torch.cuda.empty_cache()
+        return self
     def get_decoder(self):
         return self.model