Working CoreML LUT4 input_embeds variant (86.9% on VITW)

Browse files

Files changed (1) hide show

convert_embeds.py +221 -0

convert_embeds.py ADDED Viewed

	@@ -0,0 +1,221 @@

+"""Custom ANEMLL conversion that takes inputs_embeds instead of input_ids.
+Required for Mega-ASR: at inference we scatter audio encoder outputs at
+<|audio_pad|> positions BEFORE the LLM, then feed pre-embedded hidden_states
+to the decoder. The default ANEMLL conversion has embed_tokens baked in
+(takes input_ids); we need it bypassed.
+This script:
+  1. Loads QwenForCausalLM via ANEMLL's loader
+  2. Monkey-patches QwenModel.forward to accept an optional inputs_embeds arg
+  3. Defines a fresh Wrapper that exposes inputs_embeds as the first input
+  4. Traces + converts via ct.convert with LUT-4 palettization in postprocess
+  5. Saves the resulting .mlpackage
+Reuses ANEMLL's QwenConverter postprocessing (LUT-4 quantization, state
+declarations) by calling its methods after the inputs are swapped.
+"""
+from __future__ import annotations
+import argparse
+import os
+import sys
+from pathlib import Path
+sys.path.insert(0, "/tmp/Anemll")
+import numpy as np
+import torch
+import torch.nn as nn
+import coremltools as ct
+import coremltools.optimize as cto
+# Apply the local coremltools _cast patch we made earlier (now resident in the
+# env's installed file; nothing to do here, just import).
+def patch_qwen_for_inputs_embeds():
+    """Monkey-patch QwenModel.forward + QwenForCausalLM.forward to accept inputs_embeds.
+    When the caller passes a float tensor in the input_ids slot, treat it as
+    pre-embedded hidden_states and skip embed_tokens. Also relax the strict
+    2D shape assert in QwenForCausalLM.
+    """
+    from anemll.models import qwen_model as qm
+    orig_model_forward = qm.QwenModel.forward
+    def model_forward_or_embeds(
+        self, input_ids, causal_mask, position_ids, current_pos, IN_PREFILL: bool = False,
+    ):
+        if input_ids.dtype in (torch.float16, torch.float32, torch.bfloat16):
+            hidden_states = input_ids
+            if IN_PREFILL:
+                rotary_emb = self.get_rotary_embedding_prefill(position_ids)
+            else:
+                rotary_emb = self.get_rotary_embeddings_s(current_pos)
+            hidden_states = self.process_layers(
+                hidden_states, position_ids, causal_mask,
+                current_pos, rotary_emb, start_layer=0, end_layer=None,
+                IN_PREFILL=IN_PREFILL,
+            )
+            hidden_states = self.norm(hidden_states)
+            return hidden_states
+        return orig_model_forward(self, input_ids, causal_mask, position_ids,
+                                  current_pos, IN_PREFILL=IN_PREFILL)
+    qm.QwenModel.forward = model_forward_or_embeds
+    # Also patch QwenForCausalLM.forward — it asserts input_ids must be 2D
+    # (line 1050 in qwen_model.py). For inputs_embeds (3D), skip that.
+    orig_causal_forward = qm.QwenForCausalLM.forward
+    def causal_forward_or_embeds(
+        self, input_ids, update_mask, position_ids, causal_mask, current_pos,
+        IN_PREFILL: bool = False,
+    ):
+        if input_ids.dtype in (torch.float16, torch.float32, torch.bfloat16):
+            # Pre-embedded path — call QwenModel directly, bypass the 2D assert
+            hidden_states = self.model(
+                input_ids, causal_mask, position_ids, current_pos,
+                IN_PREFILL=IN_PREFILL,
+            )
+            # Replicate the lm-head projection logic from the original forward
+            # (single-token decode case)
+            if not IN_PREFILL and current_pos is not None:
+                seq_len = hidden_states.shape[1]
+                if seq_len == 1:
+                    pos_tensor = torch.tensor([0], device=hidden_states.device, dtype=torch.long)
+                else:
+                    if isinstance(current_pos, torch.Tensor):
+                        pos_tensor = current_pos if current_pos.dim() > 0 else current_pos.unsqueeze(0)
+                    else:
+                        pos_tensor = torch.tensor([current_pos], device=hidden_states.device, dtype=torch.long)
+                hidden_states = torch.index_select(hidden_states, dim=1, index=pos_tensor)
+            # Use the same Conv2d / 16-way split as the original
+            hidden_states = hidden_states.permute(0, 2, 1).unsqueeze(2).to(qm.MODEL_DTYPE)
+            outs = tuple(
+                getattr(self, f"lm_head16_{k}")(hidden_states).squeeze(2).transpose(1, 2)
+                for k in range(1, 17)
+            )
+            return outs
+        return orig_causal_forward(
+            self, input_ids, update_mask, position_ids, causal_mask, current_pos,
+            IN_PREFILL=IN_PREFILL,
+        )
+    qm.QwenForCausalLM.forward = causal_forward_or_embeds
+    print("[patch] QwenModel + QwenForCausalLM now accept float inputs_embeds")
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model", required=True, type=Path)
+    ap.add_argument("--output", required=True, type=Path,
+                    help="Output .mlpackage path")
+    ap.add_argument("--lut", type=int, default=4)
+    ap.add_argument("--per-channel", type=int, default=8)
+    ap.add_argument("--context-length", type=int, default=512)
+    ap.add_argument("--hidden-size", type=int, default=2048)
+    args = ap.parse_args()
+    patch_qwen_for_inputs_embeds()
+    from anemll.models.qwen_model import (
+        QwenForCausalLM, QwenConfig, MODEL_DTYPE, TEST_DEVICE,
+    )
+    from anemll.ane_converter import qwen_converter as qc
+    # Force CoreML mode flags
+    import anemll.models.qwen_model as qm
+    qm.ENABLE_COREML = True
+    # Load config + model
+    import json
+    cfg = json.load(open(args.model / "config.json"))
+    cfg["context_length"] = args.context_length
+    cfg["state_length"] = args.context_length
+    config = QwenConfig(**cfg)
+    model = QwenForCausalLM(config, enable_coreml=True)
+    model.load_pretrained_weights(str(args.model))
+    model.eval()
+    for p in model.parameters():
+        p.requires_grad = False
+    print(f"Model loaded: hidden={config.hidden_size}, layers={config.num_hidden_layers}")
+    # Custom wrapper taking inputs_embeds
+    class WrapperEmbeds(torch.nn.Module):
+        def __init__(self, model):
+            super().__init__()
+            self.model = model
+        def forward(self, inputs_embeds, position_ids, causal_mask, current_pos, update_mask):
+            return self.model(
+                input_ids=inputs_embeds,  # float tensor → triggers the patched path
+                update_mask=update_mask,
+                position_ids=position_ids,
+                causal_mask=causal_mask,
+                current_pos=current_pos,
+                IN_PREFILL=False,
+            )
+    wrapper = WrapperEmbeds(model).eval()
+    # Build sample inputs for tracing
+    sample_inputs_embeds = torch.zeros(
+        (1, 1, config.hidden_size), dtype=torch.float16, device=TEST_DEVICE,
+    )
+    sample_position_ids = torch.zeros((1,), dtype=torch.int32, device=TEST_DEVICE)
+    sample_causal_mask = torch.zeros(
+        (1, 1, 1, args.context_length), dtype=torch.float16, device=TEST_DEVICE,
+    )
+    sample_current_pos = torch.zeros((1,), dtype=torch.int32, device=TEST_DEVICE)
+    sample_update_mask = torch.zeros(
+        (1, 1, args.context_length, 1), dtype=torch.float16, device=TEST_DEVICE,
+    )
+    print("Tracing ...")
+    traced = torch.jit.trace(
+        wrapper,
+        (sample_inputs_embeds, sample_position_ids, sample_causal_mask,
+         sample_current_pos, sample_update_mask),
+    )
+    print("Trace done. Converting to CoreML (fp16) ...")
+    # ANEMLL declares the KV cache as a state via GetTransformerStates
+    states = qc.QwenConverter.GetTransformerStates(model, prefix="model.model.")
+    mlmodel = ct.convert(
+        traced,
+        inputs=[
+            ct.TensorType(name="inputs_embeds", shape=sample_inputs_embeds.shape, dtype=np.float16),
+            ct.TensorType(name="position_ids",  shape=sample_position_ids.shape,  dtype=np.int32),
+            ct.TensorType(name="causal_mask",   shape=sample_causal_mask.shape,   dtype=np.float16),
+            ct.TensorType(name="current_pos",   shape=sample_current_pos.shape,   dtype=np.int32),
+            ct.TensorType(name="update_mask",   shape=sample_update_mask.shape,   dtype=np.float16),
+        ],
+        outputs=[ct.TensorType(name=f"logits{i+1}", dtype=np.float16) for i in range(16)],
+        states=states,
+        minimum_deployment_target=ct.target.iOS18,
+        # fp32 compute (activations) — fp16 overflows in Qwen3-ASR's RMSNorm/attention.
+        # Matches aoiandroid's finding for the same base model.
+        compute_precision=ct.precision.FLOAT32,
+        compute_units=ct.ComputeUnit.CPU_AND_NE,
+        convert_to="mlprogram",
+        skip_model_load=True,
+    )
+    if args.lut and args.lut < 16:
+        print(f"Applying LUT-{args.lut} palettization (per_channel={args.per_channel}) ...")
+        config_palette = cto.coreml.OpPalettizerConfig(
+            nbits=args.lut, mode="kmeans",
+            granularity="per_grouped_channel", group_size=args.per_channel,
+        )
+        pal_config = cto.coreml.OptimizationConfig(global_config=config_palette)
+        mlmodel = cto.coreml.palettize_weights(mlmodel, pal_config)
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    mlmodel.save(str(args.output))
+    print(f"Saved: {args.output}")
+if __name__ == "__main__":
+    main()