Mixed 8/4 CoreML (90.6% on VITW): LUT-8 attn + LUT-4 MLP

Browse files

Files changed (1) hide show

convert_embeds_mixed.py +228 -0

convert_embeds_mixed.py ADDED Viewed

	@@ -0,0 +1,228 @@

+"""Mixed-precision CoreML convert for Qwen3-ASR LLM (input_embeds variant).
+Attention layers (q/k/v/o_proj) → LUT-8 (8-bit palettize).
+MLP layers (gate/up/down_proj) → LUT-4 (4-bit palettize).
+Everything else (norms, lm_head, embed) → kept as fp16.
+Compute precision = fp32 to avoid Qwen3-ASR RMSNorm/attention NaN.
+"""
+from __future__ import annotations
+import argparse
+import os
+import sys
+import re
+from pathlib import Path
+sys.path.insert(0, "/tmp/Anemll")
+import numpy as np
+import torch
+import torch.nn as nn
+import coremltools as ct
+import coremltools.optimize as cto
+def patch_qwen_for_inputs_embeds():
+    from anemll.models import qwen_model as qm
+    orig_model_forward = qm.QwenModel.forward
+    def model_forward_or_embeds(
+        self, input_ids, causal_mask, position_ids, current_pos, IN_PREFILL: bool = False,
+    ):
+        if input_ids.dtype in (torch.float16, torch.float32, torch.bfloat16):
+            hidden_states = input_ids
+            if IN_PREFILL:
+                rotary_emb = self.get_rotary_embedding_prefill(position_ids)
+            else:
+                rotary_emb = self.get_rotary_embeddings_s(current_pos)
+            hidden_states = self.process_layers(
+                hidden_states, position_ids, causal_mask,
+                current_pos, rotary_emb, start_layer=0, end_layer=None,
+                IN_PREFILL=IN_PREFILL,
+            )
+            hidden_states = self.norm(hidden_states)
+            return hidden_states
+        return orig_model_forward(self, input_ids, causal_mask, position_ids,
+                                  current_pos, IN_PREFILL=IN_PREFILL)
+    qm.QwenModel.forward = model_forward_or_embeds
+    orig_causal_forward = qm.QwenForCausalLM.forward
+    def causal_forward_or_embeds(
+        self, input_ids, update_mask, position_ids, causal_mask, current_pos,
+        IN_PREFILL: bool = False,
+    ):
+        if input_ids.dtype in (torch.float16, torch.float32, torch.bfloat16):
+            hidden_states = self.model(
+                input_ids, causal_mask, position_ids, current_pos,
+                IN_PREFILL=IN_PREFILL,
+            )
+            if not IN_PREFILL and current_pos is not None:
+                seq_len = hidden_states.shape[1]
+                if seq_len == 1:
+                    pos_tensor = torch.tensor([0], device=hidden_states.device, dtype=torch.long)
+                else:
+                    if isinstance(current_pos, torch.Tensor):
+                        pos_tensor = current_pos if current_pos.dim() > 0 else current_pos.unsqueeze(0)
+                    else:
+                        pos_tensor = torch.tensor([current_pos], device=hidden_states.device, dtype=torch.long)
+                hidden_states = torch.index_select(hidden_states, dim=1, index=pos_tensor)
+            hidden_states = hidden_states.permute(0, 2, 1).unsqueeze(2).to(qm.MODEL_DTYPE)
+            return tuple(
+                getattr(self, f"lm_head16_{k}")(hidden_states).squeeze(2).transpose(1, 2)
+                for k in range(1, 17)
+            )
+        return orig_causal_forward(
+            self, input_ids, update_mask, position_ids, causal_mask, current_pos,
+            IN_PREFILL=IN_PREFILL,
+        )
+    qm.QwenForCausalLM.forward = causal_forward_or_embeds
+    print("[patch] QwenModel + QwenForCausalLM accept inputs_embeds")
+def select_attn_layer(op):
+    """Return True if op is in a self_attn projection (q/k/v/o_proj)."""
+    n = op.name.lower()
+    return ("self_attn" in n and any(p in n for p in ("q_proj", "k_proj", "v_proj", "o_proj")))
+def select_mlp_layer(op):
+    """Return True if op is in an MLP projection (gate/up/down_proj)."""
+    n = op.name.lower()
+    return "mlp" in n and any(p in n for p in ("gate_proj", "up_proj", "down_proj"))
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model", required=True, type=Path)
+    ap.add_argument("--output", required=True, type=Path)
+    ap.add_argument("--attn-bits", type=int, default=8)
+    ap.add_argument("--mlp-bits", type=int, default=4)
+    ap.add_argument("--group-size", type=int, default=8)
+    ap.add_argument("--context-length", type=int, default=512)
+    args = ap.parse_args()
+    patch_qwen_for_inputs_embeds()
+    from anemll.models.qwen_model import (
+        QwenForCausalLM, QwenConfig, MODEL_DTYPE, TEST_DEVICE,
+    )
+    from anemll.ane_converter import qwen_converter as qc
+    import anemll.models.qwen_model as qm
+    qm.ENABLE_COREML = True
+    import json
+    cfg = json.load(open(args.model / "config.json"))
+    cfg["context_length"] = args.context_length
+    cfg["state_length"] = args.context_length
+    config = QwenConfig(**cfg)
+    model = QwenForCausalLM(config, enable_coreml=True)
+    model.load_pretrained_weights(str(args.model))
+    model.eval()
+    for p in model.parameters():
+        p.requires_grad = False
+    print(f"Model loaded: hidden={config.hidden_size}, layers={config.num_hidden_layers}")
+    class WrapperEmbeds(torch.nn.Module):
+        def __init__(self, model):
+            super().__init__()
+            self.model = model
+        def forward(self, inputs_embeds, position_ids, causal_mask, current_pos, update_mask):
+            return self.model(
+                input_ids=inputs_embeds, update_mask=update_mask,
+                position_ids=position_ids, causal_mask=causal_mask,
+                current_pos=current_pos, IN_PREFILL=False,
+            )
+    wrapper = WrapperEmbeds(model).eval()
+    sample_inputs_embeds = torch.zeros((1, 1, config.hidden_size), dtype=torch.float16, device=TEST_DEVICE)
+    sample_position_ids = torch.zeros((1,), dtype=torch.int32, device=TEST_DEVICE)
+    sample_causal_mask = torch.zeros((1, 1, 1, args.context_length), dtype=torch.float16, device=TEST_DEVICE)
+    sample_current_pos = torch.zeros((1,), dtype=torch.int32, device=TEST_DEVICE)
+    sample_update_mask = torch.zeros((1, 1, args.context_length, 1), dtype=torch.float16, device=TEST_DEVICE)
+    print("Tracing ...")
+    traced = torch.jit.trace(
+        wrapper,
+        (sample_inputs_embeds, sample_position_ids, sample_causal_mask,
+         sample_current_pos, sample_update_mask),
+    )
+    print("Converting (fp32 compute, no palettize yet) ...")
+    states = qc.QwenConverter.GetTransformerStates(model, prefix="model.model.")
+    mlmodel = ct.convert(
+        traced,
+        inputs=[
+            ct.TensorType(name="inputs_embeds", shape=sample_inputs_embeds.shape, dtype=np.float16),
+            ct.TensorType(name="position_ids",  shape=sample_position_ids.shape,  dtype=np.int32),
+            ct.TensorType(name="causal_mask",   shape=sample_causal_mask.shape,   dtype=np.float16),
+            ct.TensorType(name="current_pos",   shape=sample_current_pos.shape,   dtype=np.int32),
+            ct.TensorType(name="update_mask",   shape=sample_update_mask.shape,   dtype=np.float16),
+        ],
+        outputs=[ct.TensorType(name=f"logits{i+1}", dtype=np.float16) for i in range(16)],
+        states=states,
+        minimum_deployment_target=ct.target.iOS18,
+        compute_precision=ct.precision.FLOAT32,
+        compute_units=ct.ComputeUnit.CPU_AND_NE,
+        convert_to="mlprogram",
+        skip_model_load=True,
+    )
+    # Walk the MIL program to enumerate const-weight ops; classify by name.
+    prog = mlmodel._mil_program
+    fn = prog.functions["main"]
+    attn_op_names, mlp_op_names = [], []
+    for op in fn.operations:
+        if op.op_type != "const":
+            continue
+        n = op.name.lower()
+        # Skip tiny constants (norms, biases, indices); only target large weight matrices.
+        try:
+            arr = op.val.val
+            if hasattr(arr, "shape") and arr.ndim >= 2 and arr.size >= 64 * 64:
+                pass
+            else:
+                continue
+        except Exception:
+            continue
+        if ("self_attn" in n or "self.attn" in n) and any(p in n for p in ("q_proj", "k_proj", "v_proj", "o_proj")):
+            attn_op_names.append(op.name)
+        elif ("mlp" in n) and any(p in n for p in ("gate_proj", "up_proj", "down_proj")):
+            mlp_op_names.append(op.name)
+    print(f"Found {len(attn_op_names)} attention weight ops and {len(mlp_op_names)} MLP weight ops")
+    if not attn_op_names or not mlp_op_names:
+        print("WARN: matched zero ops — falling back to global LUT-4")
+        cfg = cto.coreml.OpPalettizerConfig(
+            nbits=args.mlp_bits, mode="kmeans",
+            granularity="per_grouped_channel", group_size=args.group_size,
+        )
+        mlmodel = cto.coreml.palettize_weights(
+            mlmodel, cto.coreml.OptimizationConfig(global_config=cfg),
+        )
+    else:
+        cfg_attn = cto.coreml.OpPalettizerConfig(
+            nbits=args.attn_bits, mode="kmeans",
+            granularity="per_grouped_channel", group_size=args.group_size,
+        )
+        cfg_mlp = cto.coreml.OpPalettizerConfig(
+            nbits=args.mlp_bits, mode="kmeans",
+            granularity="per_grouped_channel", group_size=args.group_size,
+        )
+        op_name_configs = {**{n: cfg_attn for n in attn_op_names},
+                           **{n: cfg_mlp for n in mlp_op_names}}
+        pal_cfg = cto.coreml.OptimizationConfig(op_name_configs=op_name_configs)
+        print(f"Mixed palettize: {len(attn_op_names)} ops @ LUT-{args.attn_bits}, {len(mlp_op_names)} ops @ LUT-{args.mlp_bits}, rest fp16")
+        mlmodel = cto.coreml.palettize_weights(mlmodel, pal_cfg)
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    mlmodel.save(str(args.output))
+    print(f"Saved: {args.output}")
+# Run the conversion only when this file is executed as a script.
+if __name__ == "__main__":
+    main()