aoiandroid alexwengg committed
Commit 1b8ea0e · 0 parents

Duplicate from FluidInference/parakeet-realtime-eou-120m-coreml


Co-authored-by: Alex Weng <alexwengg@users.noreply.huggingface.co>

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the rest.
Files changed (50)
  1. .gitattributes +35 -0
  2. 1280ms/.DS_Store +0 -0
  3. 1280ms/convert_parakeet_eou.py +740 -0
  4. 1280ms/convert_streaming_encoder.py +193 -0
  5. 1280ms/decoder.mlmodelc/analytics/coremldata.bin +3 -0
  6. 1280ms/decoder.mlmodelc/coremldata.bin +3 -0
  7. 1280ms/decoder.mlmodelc/metadata.json +118 -0
  8. 1280ms/decoder.mlmodelc/model.mil +45 -0
  9. 1280ms/decoder.mlmodelc/weights/weight.bin +3 -0
  10. 1280ms/individual_components.py +250 -0
  11. 1280ms/joint_decision.mlmodelc/analytics/coremldata.bin +3 -0
  12. 1280ms/joint_decision.mlmodelc/coremldata.bin +3 -0
  13. 1280ms/joint_decision.mlmodelc/metadata.json +112 -0
  14. 1280ms/joint_decision.mlmodelc/model.mil +57 -0
  15. 1280ms/joint_decision.mlmodelc/weights/weight.bin +3 -0
  16. 1280ms/parakeet_eou_preprocessor.mlmodelc/analytics/coremldata.bin +3 -0
  17. 1280ms/parakeet_eou_preprocessor.mlmodelc/coremldata.bin +3 -0
  18. 1280ms/parakeet_eou_preprocessor.mlmodelc/metadata.json +105 -0
  19. 1280ms/parakeet_eou_preprocessor.mlmodelc/model.mil +96 -0
  20. 1280ms/parakeet_eou_preprocessor.mlmodelc/weights/weight.bin +3 -0
  21. 1280ms/streaming_encoder.mlmodelc/analytics/coremldata.bin +3 -0
  22. 1280ms/streaming_encoder.mlmodelc/coremldata.bin +3 -0
  23. 1280ms/streaming_encoder.mlmodelc/metadata.json +187 -0
  24. 1280ms/streaming_encoder.mlmodelc/model.mil +0 -0
  25. 1280ms/streaming_encoder.mlmodelc/weights/weight.bin +3 -0
  26. 1280ms/streaming_encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  27. 1280ms/streaming_encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  28. 1280ms/streaming_encoder.mlpackage/Manifest.json +18 -0
  29. 1280ms/vocab.json +1028 -0
  30. 160ms/.DS_Store +0 -0
  31. 160ms/convert_parakeet_eou.py +740 -0
  32. 160ms/convert_streaming_encoder.py +193 -0
  33. 160ms/decoder.mlmodelc/analytics/coremldata.bin +3 -0
  34. 160ms/decoder.mlmodelc/coremldata.bin +3 -0
  35. 160ms/decoder.mlmodelc/metadata.json +118 -0
  36. 160ms/decoder.mlmodelc/model.mil +45 -0
  37. 160ms/decoder.mlmodelc/weights/weight.bin +3 -0
  38. 160ms/decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  39. 160ms/decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  40. 160ms/decoder.mlpackage/Manifest.json +18 -0
  41. 160ms/individual_components.py +250 -0
  42. 160ms/joint_decision.mlmodelc/analytics/coremldata.bin +3 -0
  43. 160ms/joint_decision.mlmodelc/coremldata.bin +3 -0
  44. 160ms/joint_decision.mlmodelc/metadata.json +112 -0
  45. 160ms/joint_decision.mlmodelc/model.mil +57 -0
  46. 160ms/joint_decision.mlmodelc/weights/weight.bin +3 -0
  47. 160ms/joint_decision.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  48. 160ms/joint_decision.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  49. 160ms/joint_decision.mlpackage/Manifest.json +18 -0
  50. 160ms/parakeet_eou_preprocessor.mlmodelc/analytics/coremldata.bin +3 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
1280ms/.DS_Store ADDED
Binary file (6.15 kB)
1280ms/convert_parakeet_eou.py ADDED
@@ -0,0 +1,740 @@
#!/usr/bin/env python3
"""CLI for exporting Parakeet Realtime EOU 120M components to CoreML.

This model is a cache-aware streaming FastConformer-RNNT model optimized for
low-latency speech recognition with end-of-utterance detection.

Key differences from Parakeet TDT v3:
- Smaller model (120M vs 600M params)
- No duration outputs (standard RNNT, not TDT)
- Cache-aware streaming encoder (17 layers, attention context [70,1])
- Special <EOU> token for end-of-utterance detection
- Optimized for 80-160ms latency

Reference: https://huggingface.co/nvidia/parakeet_realtime_eou_120m-v1
"""
from __future__ import annotations

import json
from dataclasses import asdict
from pathlib import Path
from typing import Dict, Optional, Tuple

import coremltools as ct
import numpy as np
import soundfile as sf
import torch
import typer

import nemo.collections.asr as nemo_asr

from individual_components import (
    DecoderWrapper,
    EncoderWrapper,
    ExportSettings,
    JointWrapper,
    JointDecisionWrapper,
    JointDecisionSingleStep,
    PreprocessorWrapper,
    MelEncoderWrapper,
    _coreml_convert,
)


def apply_stft_patch():
    # Monkey patch coremltools.stft to handle extra arguments from newer torch versions
    try:
        import coremltools.converters.mil.frontend.torch.ops as torch_ops
        _original_stft = torch_ops.stft

        def patched_stft(context, node):
            if len(node.inputs) > 8:
                node.inputs = node.inputs[:8]
            return _original_stft(context, node)

        torch_ops.stft = patched_stft
        if "stft" in torch_ops._TORCH_OPS_REGISTRY:
            torch_ops._TORCH_OPS_REGISTRY["stft"] = patched_stft
        print("Monkey patched coremltools.stft for compatibility.")
    except Exception as e:
        print(f"Warning: Could not monkey patch stft: {e}")


DEFAULT_MODEL_ID = "nvidia/parakeet_realtime_eou_120m-v1"
AUTHOR = "Fluid Inference"


def _compute_length(seconds: float, sample_rate: int) -> int:
    return int(round(seconds * sample_rate))


def _prepare_audio(
    validation_audio: Optional[Path],
    sample_rate: int,
    max_samples: int,
    seed: Optional[int],
) -> torch.Tensor:
    if validation_audio is None:
        if seed is not None:
            torch.manual_seed(seed)
        audio = torch.randn(1, max_samples, dtype=torch.float32)
        return audio

    data, sr = sf.read(str(validation_audio), dtype="float32")
    if sr != sample_rate:
        raise typer.BadParameter(
            f"Validation audio sample rate {sr} does not match model rate {sample_rate}"
        )

    if data.ndim > 1:
        data = data[:, 0]

    if data.size == 0:
        raise typer.BadParameter("Validation audio is empty")

    if data.size < max_samples:
        pad_width = max_samples - data.size
        data = np.pad(data, (0, pad_width))
    elif data.size > max_samples:
        data = data[:max_samples]

    audio = torch.from_numpy(data).unsqueeze(0).to(dtype=torch.float32)
    return audio


def _save_mlpackage(model: ct.models.MLModel, path: Path, description: str) -> None:
    try:
        model.minimum_deployment_target = ct.target.iOS17
    except Exception:
        pass
    model.short_description = description
    model.author = AUTHOR
    path.parent.mkdir(parents=True, exist_ok=True)
    model.save(str(path))


def _tensor_shape(tensor: torch.Tensor) -> Tuple[int, ...]:
    return tuple(int(dim) for dim in tensor.shape)


def _parse_compute_units(name: str) -> ct.ComputeUnit:
    """Parse a human-friendly compute units string into ct.ComputeUnit."""
    normalized = str(name).strip().upper()
    mapping = {
        "ALL": ct.ComputeUnit.ALL,
        "CPU_ONLY": ct.ComputeUnit.CPU_ONLY,
        "CPU_AND_GPU": ct.ComputeUnit.CPU_AND_GPU,
        "CPU_AND_NE": ct.ComputeUnit.CPU_AND_NE,
        "CPU_AND_NEURALENGINE": ct.ComputeUnit.CPU_AND_NE,
    }
    if normalized not in mapping:
        raise typer.BadParameter(
            f"Unknown compute units '{name}'. Choose from: " + ", ".join(mapping.keys())
        )
    return mapping[normalized]


def _parse_compute_precision(name: Optional[str]) -> Optional[ct.precision]:
    """Parse compute precision string into ct.precision or None."""
    if name is None:
        return None
    normalized = str(name).strip().upper()
    if normalized == "":
        return None
    mapping = {
        "FLOAT32": ct.precision.FLOAT32,
        "FLOAT16": ct.precision.FLOAT16,
    }
    if normalized not in mapping:
        raise typer.BadParameter(
            f"Unknown compute precision '{name}'. Choose from: "
            + ", ".join(mapping.keys())
        )
    return mapping[normalized]


app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)


@app.command()
def convert(
    nemo_path: Optional[Path] = typer.Option(
        None,
        "--nemo-path",
        exists=True,
        resolve_path=True,
        help="Path to parakeet_realtime_eou_120m-v1.nemo checkpoint (skip to auto-download)",
    ),
    model_id: str = typer.Option(
        DEFAULT_MODEL_ID,
        "--model-id",
        help="Model identifier to download when --nemo-path is omitted",
    ),
    output_dir: Path = typer.Option(
        Path("parakeet_eou_coreml"),
        help="Directory where mlpackages and metadata will be written",
    ),
    preprocessor_cu: str = typer.Option(
        "CPU_ONLY",
        "--preprocessor-cu",
        help="Compute units for preprocessor (default CPU_ONLY)",
    ),
    mel_encoder_cu: str = typer.Option(
        "CPU_ONLY",
        "--mel-encoder-cu",
        help="Compute units for fused mel+encoder (default CPU_ONLY)",
    ),
    compute_precision: Optional[str] = typer.Option(
        None,
        "--compute-precision",
        help="Export precision: FLOAT32 (default) or FLOAT16 to shrink non-quantized weights.",
    ),
    max_audio_seconds: float = typer.Option(
        15.0,
        "--max-audio-seconds",
        help="Maximum audio duration in seconds for the fixed window export",
    ),
    validation_audio: Optional[Path] = typer.Option(
        None,
        "--validation-audio",
        exists=True,
        resolve_path=True,
        help="Path to a 16kHz WAV file for tracing (uses random if not provided)",
    ),
) -> None:
    """Export all Parakeet Realtime EOU sub-modules to CoreML.

    This exports the cache-aware streaming FastConformer-RNNT model for
    low-latency speech recognition with end-of-utterance detection.
    """
    export_settings = ExportSettings(
        output_dir=output_dir,
        compute_units=ct.ComputeUnit.CPU_ONLY,
        deployment_target=ct.target.iOS17,
        compute_precision=_parse_compute_precision(compute_precision),
        max_audio_seconds=max_audio_seconds,
        max_symbol_steps=1,
    )

    typer.echo("Export configuration:")
    typer.echo(asdict(export_settings))

    output_dir.mkdir(parents=True, exist_ok=True)
    pre_cu = _parse_compute_units(preprocessor_cu)
    melenc_cu = _parse_compute_units(mel_encoder_cu)

    if nemo_path is not None:
        typer.echo(f"Loading NeMo model from {nemo_path}…")
        # Try loading as generic ASRModel first, then specific class
        try:
            asr_model = nemo_asr.models.ASRModel.restore_from(
                str(nemo_path), map_location="cpu"
            )
        except Exception:
            # Fallback to EncDecRNNTBPEModel
            asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
                str(nemo_path), map_location="cpu"
            )
        checkpoint_meta = {
            "type": "file",
            "path": str(nemo_path),
        }
    else:
        typer.echo(f"Downloading NeMo model via {model_id}…")
        # Use ASRModel.from_pretrained as recommended for this model
        try:
            asr_model = nemo_asr.models.ASRModel.from_pretrained(
                model_id, map_location="cpu"
            )
        except Exception:
            # Fallback to EncDecRNNTBPEModel
            asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
                model_id, map_location="cpu"
            )
        checkpoint_meta = {
            "type": "pretrained",
            "model_id": model_id,
        }
    asr_model.eval()

    # Print model info
    typer.echo(f"Model class: {type(asr_model).__name__}")
    typer.echo(f"Encoder class: {type(asr_model.encoder).__name__}")

    sample_rate = int(asr_model.cfg.preprocessor.sample_rate)
    max_samples = _compute_length(export_settings.max_audio_seconds, sample_rate)

    # Prepare audio for tracing
    if validation_audio is not None:
        typer.echo(f"Using validation audio: {validation_audio}")
        audio_tensor = _prepare_audio(validation_audio, sample_rate, max_samples, seed=None)
    else:
        typer.echo("Using random audio for tracing (seed=42)")
        audio_tensor = _prepare_audio(None, sample_rate, max_samples, seed=42)

    audio_length = torch.tensor([max_samples], dtype=torch.int32)

    preprocessor = PreprocessorWrapper(asr_model.preprocessor.eval())
    encoder = EncoderWrapper(asr_model.encoder.eval())
    decoder = DecoderWrapper(asr_model.decoder.eval())
    joint = JointWrapper(asr_model.joint.eval())

    decoder_export_flag = getattr(asr_model.decoder, "_rnnt_export", False)
    asr_model.decoder._rnnt_export = True

    try:
        with torch.no_grad():
            mel_ref, mel_length_ref = preprocessor(audio_tensor, audio_length)
            mel_length_ref = mel_length_ref.to(dtype=torch.int32)
            encoder_ref, encoder_length_ref, frame_times_ref = encoder(
                mel_ref, mel_length_ref
            )
            encoder_length_ref = encoder_length_ref.to(dtype=torch.int32)

        # Clone tensors to drop inference flags
        mel_ref = mel_ref.clone().detach()
        mel_length_ref = mel_length_ref.clone().detach()
        encoder_ref = encoder_ref.clone().detach()
        encoder_length_ref = encoder_length_ref.clone().detach()
        frame_times_ref = frame_times_ref.clone().detach()

        vocab_size = int(asr_model.tokenizer.vocab_size)
        decoder_hidden = int(asr_model.decoder.pred_hidden)
        decoder_layers = int(asr_model.decoder.pred_rnn_layers)

        # Check if model has extra outputs (TDT-style duration)
        num_extra = getattr(asr_model.joint, "num_extra_outputs", 0)
        typer.echo(f"Vocab size: {vocab_size}, num_extra_outputs: {num_extra}")

        targets = torch.full(
            (1, export_settings.max_symbol_steps),
            fill_value=asr_model.decoder.blank_idx,
            dtype=torch.int32,
        )
        target_lengths = torch.tensor(
            [export_settings.max_symbol_steps], dtype=torch.int32
        )
        zero_state = torch.zeros(
            decoder_layers,
            1,
            decoder_hidden,
            dtype=torch.float32,
        )

        with torch.no_grad():
            decoder_ref, h_ref, c_ref = decoder(
                targets, target_lengths, zero_state, zero_state
            )
            joint_ref = joint(encoder_ref, decoder_ref)

        decoder_ref = decoder_ref.clone()
        h_ref = h_ref.clone()
        c_ref = c_ref.clone()
        joint_ref = joint_ref.clone()

        typer.echo(f"Encoder output shape: {encoder_ref.shape}")
        typer.echo(f"Decoder output shape: {decoder_ref.shape}")
        typer.echo(f"Joint output shape: {joint_ref.shape}")

        # === Export Preprocessor ===
        typer.echo("Tracing and converting preprocessor…")
        preprocessor = preprocessor.cpu()
        audio_tensor = audio_tensor.cpu()
        audio_length = audio_length.cpu()
        traced_preprocessor = torch.jit.trace(
            preprocessor, (audio_tensor, audio_length), strict=False
        )
        traced_preprocessor.eval()
        preprocessor_inputs = [
            ct.TensorType(
                name="audio_signal",
                shape=(1, ct.RangeDim(1, max_samples)),
                dtype=np.float32,
            ),
            ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
        ]
        preprocessor_outputs = [
            ct.TensorType(name="mel", dtype=np.float32),
            ct.TensorType(name="mel_length", dtype=np.int32),
        ]
        preprocessor_model = _coreml_convert(
            traced_preprocessor,
            preprocessor_inputs,
            preprocessor_outputs,
            export_settings,
            compute_units_override=pre_cu,
        )
        preprocessor_path = output_dir / "parakeet_eou_preprocessor.mlpackage"
        _save_mlpackage(
            preprocessor_model,
            preprocessor_path,
            f"Parakeet EOU preprocessor ({max_audio_seconds}s window)",
        )

        # === Export Encoder ===
        typer.echo("Tracing and converting encoder…")
        traced_encoder = torch.jit.trace(
            encoder, (mel_ref, mel_length_ref), strict=False
        )
        traced_encoder.eval()
        encoder_inputs = [
            ct.TensorType(
                name="mel", shape=_tensor_shape(mel_ref), dtype=np.float32
            ),
            ct.TensorType(name="mel_length", shape=(1,), dtype=np.int32),
        ]
        encoder_outputs = [
            ct.TensorType(name="encoder", dtype=np.float32),
            ct.TensorType(name="encoder_length", dtype=np.int32),
            ct.TensorType(name="frame_times", dtype=np.float32),
        ]
        encoder_model = _coreml_convert(
            traced_encoder,
            encoder_inputs,
            encoder_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        encoder_path = output_dir / "parakeet_eou_encoder.mlpackage"
        _save_mlpackage(
            encoder_model,
            encoder_path,
            f"Parakeet EOU encoder ({max_audio_seconds}s window)",
        )

        # === Export Fused Mel+Encoder ===
        typer.echo("Tracing and converting fused mel+encoder…")
        mel_encoder = MelEncoderWrapper(preprocessor, encoder)
        traced_mel_encoder = torch.jit.trace(
            mel_encoder, (audio_tensor, audio_length), strict=False
        )
        traced_mel_encoder.eval()
        mel_encoder_inputs = [
            ct.TensorType(
                name="audio_signal", shape=(1, max_samples), dtype=np.float32
            ),
            ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
        ]
        mel_encoder_outputs = [
            ct.TensorType(name="encoder", dtype=np.float32),
            ct.TensorType(name="encoder_length", dtype=np.int32),
            ct.TensorType(name="frame_times", dtype=np.float32),
        ]
        mel_encoder_model = _coreml_convert(
            traced_mel_encoder,
            mel_encoder_inputs,
            mel_encoder_outputs,
            export_settings,
            compute_units_override=melenc_cu,
        )
        mel_encoder_path = output_dir / "parakeet_eou_mel_encoder.mlpackage"
        _save_mlpackage(
            mel_encoder_model,
            mel_encoder_path,
            f"Parakeet EOU fused Mel+Encoder ({max_audio_seconds}s window)",
        )

        # === Export Decoder ===
        typer.echo("Tracing and converting decoder…")
        traced_decoder = torch.jit.trace(
            decoder,
            (targets, target_lengths, zero_state, zero_state),
            strict=False,
        )
        traced_decoder.eval()
        decoder_inputs = [
            ct.TensorType(
                name="targets", shape=_tensor_shape(targets), dtype=np.int32
            ),
            ct.TensorType(name="target_length", shape=(1,), dtype=np.int32),
            ct.TensorType(
                name="h_in", shape=_tensor_shape(zero_state), dtype=np.float32
            ),
            ct.TensorType(
                name="c_in", shape=_tensor_shape(zero_state), dtype=np.float32
            ),
        ]
        decoder_outputs = [
            ct.TensorType(name="decoder", dtype=np.float32),
            ct.TensorType(name="h_out", dtype=np.float32),
            ct.TensorType(name="c_out", dtype=np.float32),
        ]
        decoder_model = _coreml_convert(
            traced_decoder,
            decoder_inputs,
            decoder_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        decoder_path = output_dir / "parakeet_eou_decoder.mlpackage"
        _save_mlpackage(
            decoder_model,
            decoder_path,
            "Parakeet EOU decoder (RNNT prediction network)",
        )

        # === Export Joint ===
        typer.echo("Tracing and converting joint…")
        traced_joint = torch.jit.trace(
            joint,
            (encoder_ref, decoder_ref),
            strict=False,
        )
        traced_joint.eval()
        joint_inputs = [
            ct.TensorType(
                name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32
            ),
            ct.TensorType(
                name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32
            ),
        ]
        joint_outputs = [
            ct.TensorType(name="logits", dtype=np.float32),
        ]
        joint_model = _coreml_convert(
            traced_joint,
            joint_inputs,
            joint_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        joint_path = output_dir / "parakeet_eou_joint.mlpackage"
        _save_mlpackage(
            joint_model,
            joint_path,
            "Parakeet EOU joint network (RNNT)",
        )

        # === Export Joint Decision Head ===
        typer.echo("Tracing and converting joint decision head…")
        joint_decision = JointDecisionWrapper(joint, vocab_size=vocab_size)
        traced_joint_decision = torch.jit.trace(
            joint_decision,
            (encoder_ref, decoder_ref),
            strict=False,
        )
        traced_joint_decision.eval()
        joint_decision_inputs = [
            ct.TensorType(
                name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32
            ),
            ct.TensorType(
                name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32
            ),
        ]
        joint_decision_outputs = [
            ct.TensorType(name="token_id", dtype=np.int32),
            ct.TensorType(name="token_prob", dtype=np.float32),
        ]
        joint_decision_model = _coreml_convert(
            traced_joint_decision,
            joint_decision_inputs,
            joint_decision_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        joint_decision_path = output_dir / "parakeet_eou_joint_decision.mlpackage"
        _save_mlpackage(
            joint_decision_model,
            joint_decision_path,
            "Parakeet EOU joint + decision head (softmax, argmax)",
        )

        # === Export Single-Step Joint Decision ===
        typer.echo("Tracing and converting single-step joint decision…")
        jd_single = JointDecisionSingleStep(joint, vocab_size=vocab_size)
        # Create single-step slices from refs
        enc_step = encoder_ref[:, :, :1].contiguous()
        dec_step = decoder_ref[:, :, :1].contiguous()
        traced_jd_single = torch.jit.trace(
            jd_single,
            (enc_step, dec_step),
            strict=False,
        )
        traced_jd_single.eval()
        jd_single_inputs = [
            ct.TensorType(
                name="encoder_step",
                shape=(1, enc_step.shape[1], 1),
                dtype=np.float32,
            ),
            ct.TensorType(
                name="decoder_step",
                shape=(1, dec_step.shape[1], 1),
                dtype=np.float32,
            ),
        ]
        jd_single_outputs = [
            ct.TensorType(name="token_id", dtype=np.int32),
            ct.TensorType(name="token_prob", dtype=np.float32),
            ct.TensorType(name="top_k_ids", dtype=np.int32),
            ct.TensorType(name="top_k_logits", dtype=np.float32),
        ]
        jd_single_model = _coreml_convert(
            traced_jd_single,
            jd_single_inputs,
            jd_single_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        jd_single_path = output_dir / "parakeet_eou_joint_decision_single_step.mlpackage"
        _save_mlpackage(
            jd_single_model,
            jd_single_path,
            "Parakeet EOU single-step joint decision (current frame)",
        )

        # === Save Metadata ===
        metadata: Dict[str, object] = {
            "model_id": model_id,
            "model_name": "parakeet_realtime_eou_120m-v1",
            "model_class": type(asr_model).__name__,
            "encoder_class": type(asr_model.encoder).__name__,
            "sample_rate": sample_rate,
            "max_audio_seconds": export_settings.max_audio_seconds,
            "max_audio_samples": max_samples,
            "max_symbol_steps": export_settings.max_symbol_steps,
            "vocab_size": vocab_size,
            "vocab_with_blank": vocab_size + 1,
            "decoder_hidden": decoder_hidden,
            "decoder_layers": decoder_layers,
            "num_extra_outputs": num_extra,
            "has_eou_token": True,
            "checkpoint": checkpoint_meta,
            "coreml": {
                "compute_units": export_settings.compute_units.name,
                "compute_precision": (
                    export_settings.compute_precision.name
                    if export_settings.compute_precision is not None
                    else "FLOAT32"
                ),
            },
            "components": {
                "preprocessor": {
                    "inputs": {
                        "audio_signal": [1, max_samples],
                        "audio_length": [1],
                    },
                    "outputs": {
                        "mel": list(_tensor_shape(mel_ref)),
                        "mel_length": [1],
                    },
                    "path": preprocessor_path.name,
                },
                "encoder": {
                    "inputs": {
                        "mel": list(_tensor_shape(mel_ref)),
                        "mel_length": [1],
                    },
                    "outputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "encoder_length": [1],
                        "frame_times": [1, _tensor_shape(encoder_ref)[2]],
                    },
                    "path": encoder_path.name,
                },
                "mel_encoder": {
                    "inputs": {
                        "audio_signal": [1, max_samples],
                        "audio_length": [1],
                    },
                    "outputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "encoder_length": [1],
                        "frame_times": [1, _tensor_shape(encoder_ref)[2]],
                    },
                    "path": mel_encoder_path.name,
                },
                "decoder": {
                    "inputs": {
                        "targets": list(_tensor_shape(targets)),
                        "target_length": [1],
                        "h_in": list(_tensor_shape(zero_state)),
                        "c_in": list(_tensor_shape(zero_state)),
                    },
                    "outputs": {
                        "decoder": list(_tensor_shape(decoder_ref)),
                        "h_out": list(_tensor_shape(h_ref)),
                        "c_out": list(_tensor_shape(c_ref)),
                    },
                    "path": decoder_path.name,
                },
                "joint": {
                    "inputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "decoder": list(_tensor_shape(decoder_ref)),
                    },
                    "outputs": {
                        "logits": list(_tensor_shape(joint_ref)),
                    },
                    "path": joint_path.name,
                },
                "joint_decision": {
                    "inputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "decoder": list(_tensor_shape(decoder_ref)),
                    },
                    "outputs": {
                        "token_id": [
                            _tensor_shape(encoder_ref)[0],
                            _tensor_shape(encoder_ref)[2],
                            _tensor_shape(decoder_ref)[2],
                        ],
                        "token_prob": [
                            _tensor_shape(encoder_ref)[0],
                            _tensor_shape(encoder_ref)[2],
                            _tensor_shape(decoder_ref)[2],
                        ],
                    },
                    "path": joint_decision_path.name,
                },
                "joint_decision_single_step": {
                    "inputs": {
                        "encoder_step": [1, _tensor_shape(encoder_ref)[1], 1],
                        "decoder_step": [1, _tensor_shape(decoder_ref)[1], 1],
                    },
                    "outputs": {
                        "token_id": [1, 1, 1],
                        "token_prob": [1, 1, 1],
                        "top_k_ids": [1, 1, 1, 64],
                        "top_k_logits": [1, 1, 1, 64],
                    },
                    "path": jd_single_path.name,
                },
            },
        }

        # Export tokenizer vocab if available
        try:
            tokenizer = asr_model.tokenizer
            vocab = {
                "blank_id": int(asr_model.decoder.blank_idx),
                "vocab_size": vocab_size,
            }
            # Try to get special tokens
            if hasattr(tokenizer, "tokenizer"):
                inner_tokenizer = tokenizer.tokenizer
                if hasattr(inner_tokenizer, "get_vocab"):
                    full_vocab = inner_tokenizer.get_vocab()
                    # Find EOU token
                    eou_token = None
                    for token, idx in full_vocab.items():
                        if "<EOU>" in token.upper() or "eou" in token.lower():
                            eou_token = {"token": token, "id": idx}
                            break
                    if eou_token:
                        vocab["eou_token"] = eou_token
            metadata["tokenizer"] = vocab
        except Exception as e:
            typer.echo(f"Warning: Could not export tokenizer info: {e}")

        metadata_path = output_dir / "metadata.json"
        metadata_path.write_text(json.dumps(metadata, indent=2))
        typer.echo(f"\nExport complete. Metadata written to {metadata_path}")
        typer.echo(f"Output directory: {output_dir}")

    finally:
        asr_model.decoder._rnnt_export = decoder_export_flag


if __name__ == "__main__":
    app()
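
For reference, a typical invocation of the exporter above looks like the following; the flag values are illustrative, not the only supported ones. Since the Typer app defines a single command, convert runs without a subcommand name, and omitting --nemo-path takes the auto-download path shown above.

python convert_parakeet_eou.py --model-id nvidia/parakeet_realtime_eou_120m-v1 --max-audio-seconds 15.0 --compute-precision FLOAT16 --output-dir parakeet_eou_coreml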
1280ms/convert_streaming_encoder.py ADDED
@@ -0,0 +1,193 @@
import argparse
from pathlib import Path
from typing import Tuple

import coremltools as ct
import numpy as np
import torch
import torch.nn as nn

from nemo.collections.asr.models import EncDecRNNTBPEModel


class LoopbackEncoderWrapper(nn.Module):
    """
    Wraps the entire Parakeet Encoder (PreEncode + Conformer) for CoreML Loopback Streaming.

    Inputs:
    - audio_signal: [B, D, T] (Mel spectrogram chunk)
    - audio_length: [B]
    - pre_cache: [B, D, pre_cache_size] (Previous audio context)
    - cache_last_channel: [layers, B, cache_size, hidden]
    - cache_last_time: [layers, B, hidden, time_cache]
    - cache_last_channel_len: [B]

    Outputs:
    - encoded_output: [B, D_out, T_out]
    - encoded_length: [B]
    - new_pre_cache: [B, D, pre_cache_size]
    - new_cache_last_channel
    - new_cache_last_time
    - new_cache_last_channel_len
    """

    def __init__(self, encoder, pre_cache_size=16):
        super().__init__()
        self.encoder = encoder
        self.pre_cache_size = pre_cache_size

    def forward(
        self,
        audio_signal: torch.Tensor,
        audio_length: torch.Tensor,
        pre_cache: torch.Tensor,
        cache_last_channel: torch.Tensor,
        cache_last_time: torch.Tensor,
        cache_last_channel_len: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        # 1. Prepend pre_cache to audio_signal
        # audio_signal: [B, D, T]; pre_cache: [B, D, T_cache]
        full_input = torch.cat([pre_cache, audio_signal], dim=2)
        full_length = audio_length + self.pre_cache_size

        # 2. Extract the NEW pre_cache (last N frames of full_input).
        # This is done BEFORE processing because we want the raw audio context.
        new_pre_cache = full_input[:, :, -self.pre_cache_size:]

        # 3. Run the cache-aware encoder step with the supplied cache state.
        encoded, encoded_len, new_cache_channel, new_cache_time, new_cache_len = self.encoder.cache_aware_stream_step(
            processed_signal=full_input,
            processed_signal_length=full_length,
            cache_last_channel=cache_last_channel,
            cache_last_time=cache_last_time,
            cache_last_channel_len=cache_last_channel_len,
        )

        # 4. Output frames: cache_aware_stream_step is expected to manage the
        # valid output window itself. In standard usage the input is only the
        # new chunk while internal convolutions look at the past; because this
        # wrapper is stateless, that past context must be supplied explicitly,
        # so passing (pre_cache + chunk) is the correct input here.

        # Cast lengths to Int32 for CoreML
        encoded_len_32 = encoded_len.to(dtype=torch.int32)
        new_channel_len_32 = new_cache_len.to(dtype=torch.int32)

        return encoded, encoded_len_32, new_pre_cache, new_cache_channel, new_cache_time, new_channel_len_32


def _coreml_convert(
    traced_model,
    inputs,
    outputs,
    compute_units=ct.ComputeUnit.CPU_ONLY,
):
    return ct.convert(
        traced_model,
        inputs=inputs,
        outputs=outputs,
        compute_units=compute_units,
        minimum_deployment_target=ct.target.macOS14,
    )


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--chunk-frames",
        type=int,
        default=17,
        help="Number of frames in the input chunk (e.g. 17 for 160ms, 129 for 1.28s)",
    )
    args = parser.parse_args()

    model_id: str = "nvidia/parakeet_realtime_eou_120m-v1"
    output_dir: str = "temp_swift_models/StreamingLoopback"
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"Loading model: {model_id}...")
    asr_model = EncDecRNNTBPEModel.from_pretrained(model_name=model_id)
    asr_model.eval()

    encoder = asr_model.encoder

    # --- Configuration ---
    # 160ms chunk = 16 frames (but preprocessor produces 17 with padding/centering)
    # 1.28s chunk = 128 frames (preprocessor produces 129)
    chunk_size_in = args.chunk_frames
    mel_dim = 128
    hidden_dim = encoder.d_model  # 512
    num_layers = len(encoder.layers)  # 17

    # Cache sizes
    cache_channel_size = 70
    cache_time_size = 8
    pre_cache_size = 16

    print(f"Config: Chunk={chunk_size_in}, Mel={mel_dim}, Hidden={hidden_dim}, Layers={num_layers}")
    print(f"Cache: Channel={cache_channel_size}, Time={cache_time_size}, Pre={pre_cache_size}")

    # --- Wrapper ---
    wrapper = LoopbackEncoderWrapper(encoder, pre_cache_size=pre_cache_size)
    wrapper.eval()

    # --- Test Inputs (for Tracing) ---
    batch_size = 1
    test_mel = torch.randn(batch_size, mel_dim, chunk_size_in)
    test_mel_len = torch.tensor([chunk_size_in], dtype=torch.int32)
    test_pre_cache = torch.zeros(batch_size, mel_dim, pre_cache_size)

    # Initial Cache (Zeros)
    test_cache_channel = torch.zeros(num_layers, batch_size, cache_channel_size, hidden_dim)
    test_cache_time = torch.zeros(num_layers, batch_size, hidden_dim, cache_time_size)
    test_cache_len = torch.zeros(batch_size, dtype=torch.int32)

    print("Tracing model...")
    traced_model = torch.jit.trace(
        wrapper,
        (test_mel, test_mel_len, test_pre_cache, test_cache_channel, test_cache_time, test_cache_len),
        strict=False,
    )

    # --- CoreML Conversion ---
    print("Converting to CoreML...")

    inputs = [
        ct.TensorType(name="audio_signal", shape=(1, 128, chunk_size_in), dtype=np.float32),
        ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
        ct.TensorType(name="pre_cache", shape=(1, 128, pre_cache_size), dtype=np.float32),
        ct.TensorType(name="cache_last_channel", shape=(num_layers, 1, cache_channel_size, hidden_dim), dtype=np.float32),
        ct.TensorType(name="cache_last_time", shape=(num_layers, 1, hidden_dim, cache_time_size), dtype=np.float32),
        ct.TensorType(name="cache_last_channel_len", shape=(1,), dtype=np.int32),
    ]

    outputs = [
        ct.TensorType(name="encoded_output", dtype=np.float32),
        ct.TensorType(name="encoded_length", dtype=np.int32),
        ct.TensorType(name="new_pre_cache", dtype=np.float32),
        ct.TensorType(name="new_cache_last_channel", dtype=np.float32),
        ct.TensorType(name="new_cache_last_time", dtype=np.float32),
        ct.TensorType(name="new_cache_last_channel_len", dtype=np.int32),
    ]

    mlmodel = _coreml_convert(traced_model, inputs, outputs)

    save_path = output_path / "streaming_encoder.mlpackage"
    mlmodel.save(str(save_path))
    print(f"Saved: {save_path}")

    # The preprocessor, decoder, and joint are exported by the sibling script;
    # this script covers only the encoder loopback.


if __name__ == "__main__":
    main()
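
To make the loopback design concrete, the sketch below shows how a host could drive the exported package: each call's six cache outputs are fed straight back in as the next call's cache inputs. This is a minimal sketch under assumptions: the streaming_encoder.mlpackage produced by this script, the default 17-frame (160 ms) chunking, and random stand-in mel chunks where real preprocessor output would go.

import coremltools as ct
import numpy as np

model = ct.models.MLModel("temp_swift_models/StreamingLoopback/streaming_encoder.mlpackage")

num_layers, mel_dim, hidden = 17, 128, 512
state = {
    "pre_cache": np.zeros((1, mel_dim, 16), dtype=np.float32),
    "cache_last_channel": np.zeros((num_layers, 1, 70, hidden), dtype=np.float32),
    "cache_last_time": np.zeros((num_layers, 1, hidden, 8), dtype=np.float32),
    "cache_last_channel_len": np.zeros((1,), dtype=np.int32),
}

for _ in range(10):  # one iteration per 160 ms mel chunk
    mel_chunk = np.random.randn(1, mel_dim, 17).astype(np.float32)  # stand-in chunk
    out = model.predict({
        "audio_signal": mel_chunk,
        "audio_length": np.array([17], dtype=np.int32),
        **state,
    })
    encoded = out["encoded_output"]  # [1, D_out, T_out] for this chunk
    # Loop the updated caches back in for the next call.
    state = {
        "pre_cache": out["new_pre_cache"].astype(np.float32),
        "cache_last_channel": out["new_cache_last_channel"].astype(np.float32),
        "cache_last_time": out["new_cache_last_time"].astype(np.float32),
        "cache_last_channel_len": out["new_cache_last_channel_len"].astype(np.int32),
    }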
1280ms/decoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3996975a8cbc1949159c55605b3132b39b2484f51acbd55d796d93c70de02b49
size 243
1280ms/decoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c3ccbff963d8cf07e2be2bd56ea3384a89ea49628922c6bd95ff62e2ae57dc34
size 497
1280ms/decoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,118 @@
[
  {
    "metadataOutputVersion" : "3.0",
    "shortDescription" : "Parakeet EOU decoder (RNNT prediction network)",
    "outputSchema" : [
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
        "shortDescription" : "",
        "shape" : "[1, 640, 1]",
        "name" : "decoder",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
        "shortDescription" : "",
        "shape" : "[1, 1, 640]",
        "name" : "h_out",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
        "shortDescription" : "",
        "shape" : "[1, 1, 640]",
        "name" : "c_out",
        "type" : "MultiArray"
      }
    ],
    "storagePrecision" : "Float16",
    "modelParameters" : [

    ],
    "author" : "Fluid Inference",
    "specificationVersion" : 8,
    "mlProgramOperationTypeHistogram" : {
      "Ios17.squeeze" : 2,
      "Ios17.gather" : 1,
      "Ios17.cast" : 6,
      "Ios17.lstm" : 1,
      "Ios17.transpose" : 2,
      "Identity" : 1,
      "Ios17.expandDims" : 2
    },
    "computePrecision" : "Mixed (Float16, Float32, Int16, Int32)",
    "isUpdatable" : "0",
    "stateSchema" : [

    ],
    "availability" : {
      "macOS" : "14.0",
      "tvOS" : "17.0",
      "visionOS" : "1.0",
      "watchOS" : "10.0",
      "iOS" : "17.0",
      "macCatalyst" : "17.0"
    },
    "modelType" : {
      "name" : "MLModelType_mlProgram"
    },
    "inputSchema" : [
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Int32",
        "formattedType" : "MultiArray (Int32 1 × 1)",
        "shortDescription" : "",
        "shape" : "[1, 1]",
        "name" : "targets",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Int32",
        "formattedType" : "MultiArray (Int32 1)",
        "shortDescription" : "",
        "shape" : "[1]",
        "name" : "target_length",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
        "shortDescription" : "",
        "shape" : "[1, 1, 640]",
        "name" : "h_in",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
        "shortDescription" : "",
        "shape" : "[1, 1, 640]",
        "name" : "c_in",
        "type" : "MultiArray"
      }
    ],
    "userDefinedMetadata" : {
      "com.github.apple.coremltools.version" : "8.3.0",
      "com.github.apple.coremltools.source" : "torch==2.4.0",
      "com.github.apple.coremltools.source_dialect" : "TorchScript"
    },
    "generatedClassName" : "parakeet_eou_decoder",
    "method" : "predict"
  }
]
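
As a quick sanity check of the schema above, the compiled .mlmodelc can be loaded directly with coremltools on macOS. A minimal sketch, assuming a recent coremltools release (CompiledMLModel is available from version 7 on); the token id 0 is an arbitrary stand-in, not a meaningful label:

import coremltools as ct
import numpy as np

decoder = ct.models.CompiledMLModel("1280ms/decoder.mlmodelc")
out = decoder.predict({
    "targets": np.array([[0]], dtype=np.int32),        # any valid token id
    "target_length": np.array([1], dtype=np.int32),
    "h_in": np.zeros((1, 1, 640), dtype=np.float32),   # zero LSTM state
    "c_in": np.zeros((1, 1, 640), dtype=np.float32),
})
print(out["decoder"].shape)  # expect (1, 640, 1) per the schema above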
1280ms/decoder.mlmodelc/model.mil ADDED
@@ -0,0 +1,45 @@
program(1.0)
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
{
    func main<ios17>(tensor<fp32, [1, 1, 640]> c_in, tensor<fp32, [1, 1, 640]> h_in, tensor<int32, [1]> target_length, tensor<int32, [1, 1]> targets) {
        tensor<int32, []> y_axis_0 = const()[name = tensor<string, []>("y_axis_0"), val = tensor<int32, []>(0)];
        tensor<int32, []> y_batch_dims_0 = const()[name = tensor<string, []>("y_batch_dims_0"), val = tensor<int32, []>(0)];
        tensor<bool, []> y_validate_indices_0 = const()[name = tensor<string, []>("y_validate_indices_0"), val = tensor<bool, []>(false)];
        tensor<fp16, [1027, 640]> module_prediction_embed_weight_to_fp16 = const()[name = tensor<string, []>("module_prediction_embed_weight_to_fp16"), val = tensor<fp16, [1027, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
        tensor<string, []> targets_to_int16_dtype_0 = const()[name = tensor<string, []>("targets_to_int16_dtype_0"), val = tensor<string, []>("int16")];
        tensor<int16, [1, 1]> targets_to_int16 = cast(dtype = targets_to_int16_dtype_0, x = targets)[name = tensor<string, []>("cast_8")];
        tensor<fp16, [1, 1, 640]> y_cast_fp16_cast_uint16 = gather(axis = y_axis_0, batch_dims = y_batch_dims_0, indices = targets_to_int16, validate_indices = y_validate_indices_0, x = module_prediction_embed_weight_to_fp16)[name = tensor<string, []>("y_cast_fp16_cast_uint16")];
        tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
        tensor<int32, [1]> input_lstm_h0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_h0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
        tensor<string, []> h_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("h_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
        tensor<fp16, [1, 1, 640]> h_in_to_fp16 = cast(dtype = h_in_to_fp16_dtype_0, x = h_in)[name = tensor<string, []>("cast_7")];
        tensor<fp16, [1, 640]> input_lstm_h0_squeeze_cast_fp16 = squeeze(axes = input_lstm_h0_squeeze_axes_0, x = h_in_to_fp16)[name = tensor<string, []>("input_lstm_h0_squeeze_cast_fp16")];
        tensor<int32, [1]> input_lstm_c0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_c0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
        tensor<string, []> c_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("c_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
        tensor<fp16, [1, 1, 640]> c_in_to_fp16 = cast(dtype = c_in_to_fp16_dtype_0, x = c_in)[name = tensor<string, []>("cast_6")];
        tensor<fp16, [1, 640]> input_lstm_c0_squeeze_cast_fp16 = squeeze(axes = input_lstm_c0_squeeze_axes_0, x = c_in_to_fp16)[name = tensor<string, []>("input_lstm_c0_squeeze_cast_fp16")];
        tensor<string, []> input_direction_0 = const()[name = tensor<string, []>("input_direction_0"), val = tensor<string, []>("forward")];
        tensor<bool, []> input_output_sequence_0 = const()[name = tensor<string, []>("input_output_sequence_0"), val = tensor<bool, []>(true)];
        tensor<string, []> input_recurrent_activation_0 = const()[name = tensor<string, []>("input_recurrent_activation_0"), val = tensor<string, []>("sigmoid")];
        tensor<string, []> input_cell_activation_0 = const()[name = tensor<string, []>("input_cell_activation_0"), val = tensor<string, []>("tanh")];
        tensor<string, []> input_activation_0 = const()[name = tensor<string, []>("input_activation_0"), val = tensor<string, []>("tanh")];
        tensor<fp16, [2560, 640]> concat_1_to_fp16 = const()[name = tensor<string, []>("concat_1_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1314688)))];
        tensor<fp16, [2560, 640]> concat_2_to_fp16 = const()[name = tensor<string, []>("concat_2_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4591552)))];
        tensor<fp16, [2560]> concat_0_to_fp16 = const()[name = tensor<string, []>("concat_0_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7868416)))];
        tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = y_cast_fp16_cast_uint16)[name = tensor<string, []>("transpose_2")];
        tensor<fp16, [1, 1, 640]> input_cast_fp16_0, tensor<fp16, [1, 640]> input_cast_fp16_1, tensor<fp16, [1, 640]> input_cast_fp16_2 = lstm(activation = input_activation_0, bias = concat_0_to_fp16, cell_activation = input_cell_activation_0, direction = input_direction_0, initial_c = input_lstm_c0_squeeze_cast_fp16, initial_h = input_lstm_h0_squeeze_cast_fp16, output_sequence = input_output_sequence_0, recurrent_activation = input_recurrent_activation_0, weight_hh = concat_2_to_fp16, weight_ih = concat_1_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
        tensor<int32, [1]> obj_3_axes_0 = const()[name = tensor<string, []>("obj_3_axes_0"), val = tensor<int32, [1]>([0])];
        tensor<fp16, [1, 1, 640]> obj_3_cast_fp16 = expand_dims(axes = obj_3_axes_0, x = input_cast_fp16_1)[name = tensor<string, []>("obj_3_cast_fp16")];
        tensor<string, []> obj_3_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_3_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
        tensor<int32, [1]> obj_axes_0 = const()[name = tensor<string, []>("obj_axes_0"), val = tensor<int32, [1]>([0])];
        tensor<fp16, [1, 1, 640]> obj_cast_fp16 = expand_dims(axes = obj_axes_0, x = input_cast_fp16_2)[name = tensor<string, []>("obj_cast_fp16")];
        tensor<string, []> obj_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
        tensor<int32, [3]> transpose_0_perm_0 = const()[name = tensor<string, []>("transpose_0_perm_0"), val = tensor<int32, [3]>([1, 2, 0])];
        tensor<string, []> transpose_0_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("transpose_0_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
        tensor<fp16, [1, 640, 1]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = input_cast_fp16_0)[name = tensor<string, []>("transpose_1")];
        tensor<fp32, [1, 640, 1]> decoder = cast(dtype = transpose_0_cast_fp16_to_fp32_dtype_0, x = transpose_0_cast_fp16)[name = tensor<string, []>("cast_3")];
        tensor<fp32, [1, 1, 640]> c_out = cast(dtype = obj_cast_fp16_to_fp32_dtype_0, x = obj_cast_fp16)[name = tensor<string, []>("cast_4")];
        tensor<fp32, [1, 1, 640]> h_out = cast(dtype = obj_3_cast_fp16_to_fp32_dtype_0, x = obj_3_cast_fp16)[name = tensor<string, []>("cast_5")];
        tensor<int32, [1]> target_length_tmp = identity(x = target_length)[name = tensor<string, []>("target_length_tmp")];
    } -> (decoder, h_out, c_out);
}
1280ms/decoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0b4cacecdcd9df79ab1e56de67230baf5a8664d2afe0bb8f3408eefa972cb2f4
size 7873600
1280ms/individual_components.py ADDED
@@ -0,0 +1,250 @@
1
+ #!/usr/bin/env python3
2
+ """Export Parakeet Realtime EOU 120M RNNT components into CoreML."""
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Optional, Tuple
8
+
9
+ import coremltools as ct
10
+ import torch
11
+
12
+
13
+ @dataclass
14
+ class ExportSettings:
15
+ output_dir: Path
16
+ compute_units: ct.ComputeUnit
17
+ deployment_target: Optional[ct.target]
18
+ compute_precision: Optional[ct.precision]
19
+ max_audio_seconds: float
20
+ max_symbol_steps: int
21
+
22
+
23
+ class PreprocessorWrapper(torch.nn.Module):
24
+ """Wrapper for the audio preprocessor (mel spectrogram extraction)."""
25
+
26
+ def __init__(self, module: torch.nn.Module) -> None:
27
+ super().__init__()
28
+ self.module = module
29
+
30
+ def forward(
31
+ self, audio_signal: torch.Tensor, length: torch.Tensor
32
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
33
+ mel, mel_length = self.module(
34
+ input_signal=audio_signal, length=length.to(dtype=torch.long)
35
+ )
36
+ return mel, mel_length
37
+
38
+
39
+ class EncoderWrapper(torch.nn.Module):
40
+ """Wrapper for the cache-aware FastConformer encoder.
41
+
42
+ Note: For the realtime EOU model, the encoder is cache-aware which means
43
+ it can operate in a streaming fashion. For CoreML export, we export
44
+ without cache state for simplicity (full-context mode).
45
+ """
46
+
47
+ def __init__(self, module: torch.nn.Module) -> None:
48
+ super().__init__()
49
+ self.module = module
50
+
51
+ def forward(
52
+ self, features: torch.Tensor, length: torch.Tensor
53
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
54
+ encoded, encoded_lengths = self.module(
55
+ audio_signal=features, length=length.to(dtype=torch.long)
56
+ )
57
+ # Synthesize per-frame timestamps (seconds) using the 80 ms encoder stride.
58
+ # Shape: [B, T_enc]
59
+ frame_times = (
60
+ torch.arange(encoded.shape[-1], device=encoded.device, dtype=torch.float32)
61
+ * 0.08
62
+ )
63
+ return encoded, encoded_lengths, frame_times
64
+
65
+
66
+ class DecoderWrapper(torch.nn.Module):
67
+ """Wrapper for the RNNT prediction network (decoder)."""
68
+
69
+ def __init__(self, module: torch.nn.Module) -> None:
70
+ super().__init__()
71
+ self.module = module
72
+
73
+ def forward(
74
+ self,
75
+ targets: torch.Tensor,
76
+ target_lengths: torch.Tensor,
77
+ h_in: torch.Tensor,
78
+ c_in: torch.Tensor,
79
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
80
+ state = [h_in, c_in]
81
+ decoder_output, _, new_state = self.module(
82
+ targets=targets.to(dtype=torch.long),
83
+ target_length=target_lengths.to(dtype=torch.long),
84
+ states=state,
85
+ )
86
+ return decoder_output, new_state[0], new_state[1]
87
+
88
+
89
+ class JointWrapper(torch.nn.Module):
90
+ """Wrapper for the RNNT joint network.
91
+
92
+ Note: Unlike Parakeet TDT v3, the realtime EOU model does NOT have
93
+ duration outputs (num_extra_outputs). The joint network outputs only
94
+ token logits over the vocabulary + blank.
95
+ """
96
+
97
+ def __init__(self, module: torch.nn.Module) -> None:
98
+ super().__init__()
99
+ self.module = module
100
+
101
+ def forward(
102
+ self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor
103
+ ) -> torch.Tensor:
104
+ # Input: encoder_outputs [B, D, T], decoder_outputs [B, D, U]
105
+ # Transpose to match what projection layers expect
106
+ encoder_outputs = encoder_outputs.transpose(1, 2) # [B, T, D]
107
+ decoder_outputs = decoder_outputs.transpose(1, 2) # [B, U, D]
108
+
109
+ # Apply projections
110
+ enc_proj = self.module.enc(encoder_outputs) # [B, T, joint_hidden]
111
+ dec_proj = self.module.pred(decoder_outputs) # [B, U, joint_hidden]
112
+
113
+ # Explicit broadcasting along T and U to avoid converter ambiguity
114
+ x = enc_proj.unsqueeze(2) + dec_proj.unsqueeze(1) # [B, T, U, joint_hidden]
115
+ x = self.module.joint_net[0](x) # ReLU
116
+ x = self.module.joint_net[1](x) # Dropout (no-op in eval)
117
+ out = self.module.joint_net[2](x) # Linear -> logits [B, T, U, vocab+blank]
118
+ return out
119
+
120
+
121
+ class MelEncoderWrapper(torch.nn.Module):
122
+ """Fused wrapper: waveform -> mel -> encoder.
123
+
124
+ Inputs:
125
+ - audio_signal: [B, S]
126
+ - audio_length: [B]
127
+
128
+ Outputs:
129
+ - encoder: [B, D, T_enc]
130
+ - encoder_length: [B]
131
+ - frame_times: [T_enc]
132
+ """
133
+
134
+ def __init__(
135
+ self, preprocessor: PreprocessorWrapper, encoder: EncoderWrapper
136
+ ) -> None:
137
+ super().__init__()
138
+ self.preprocessor = preprocessor
139
+ self.encoder = encoder
140
+
141
+ def forward(
142
+ self, audio_signal: torch.Tensor, audio_length: torch.Tensor
143
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
144
+ mel, mel_length = self.preprocessor(audio_signal, audio_length)
145
+ encoded, enc_len, frame_times = self.encoder(mel, mel_length.to(dtype=torch.int32))
146
+ return encoded, enc_len, frame_times
147
+
148
+
+ class JointDecisionWrapper(torch.nn.Module):
+     """Joint + decision head: outputs label id and label prob.
+
+     Unlike Parakeet TDT v3, this model does NOT have duration outputs.
+
+     Inputs:
+         - encoder_outputs: [B, D, T]
+         - decoder_outputs: [B, D, U]
+
+     Returns:
+         - token_id: [B, T, U] int32
+         - token_prob: [B, T, U] float32
+     """
+
+     def __init__(self, joint: JointWrapper, vocab_size: int) -> None:
+         super().__init__()
+         self.joint = joint
+         self.vocab_with_blank = int(vocab_size) + 1
+
+     def forward(self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor):
+         logits = self.joint(encoder_outputs, decoder_outputs)
+
+         # Token selection
+         token_ids = torch.argmax(logits, dim=-1).to(dtype=torch.int32)
+         token_probs_all = torch.softmax(logits, dim=-1)
+         # gather expects int64 (long) indices; cast only for the gather
+         token_prob = torch.gather(
+             token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
+         ).squeeze(-1)
+
+         return token_ids, token_prob
+
+
+ class JointDecisionSingleStep(torch.nn.Module):
+     """Single-step variant for streaming: encoder_step -> token decision.
+
+     Inputs:
+         - encoder_step: [B=1, D, T=1]
+         - decoder_step: [B=1, D, U=1]
+
+     Returns:
+         - token_id: [1, 1, 1] int32
+         - token_prob: [1, 1, 1] float32
+         - top_k_ids: [1, 1, 1, K] int32
+         - top_k_logits: [1, 1, 1, K] float32
+     """
+
+     def __init__(self, joint: JointWrapper, vocab_size: int, top_k: int = 64) -> None:
+         super().__init__()
+         self.joint = joint
+         self.vocab_with_blank = int(vocab_size) + 1
+         self.top_k = int(top_k)
+
+     def forward(self, encoder_step: torch.Tensor, decoder_step: torch.Tensor):
+         # Reuse JointWrapper, which expects [B, D, T] and [B, D, U]
+         logits = self.joint(encoder_step, decoder_step)  # [1, 1, 1, V+blank]
+
+         token_ids = torch.argmax(logits, dim=-1, keepdim=False).to(dtype=torch.int32)
+         token_probs_all = torch.softmax(logits, dim=-1)
+         token_prob = torch.gather(
+             token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
+         ).squeeze(-1)
+
+         # Also expose top-K candidates for host-side processing
+         topk_logits, topk_ids_long = torch.topk(
+             logits, k=min(self.top_k, logits.shape[-1]), dim=-1
+         )
+         topk_ids = topk_ids_long.to(dtype=torch.int32)
+         return token_ids, token_prob, topk_ids, topk_logits
+
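On the host side, a greedy RNNT loop calls this single-step head once per encoder frame and only advances the decoder state on non-blank emissions. A minimal sketch under assumptions: `decoder` and `joint_step` are the DecoderWrapper / JointDecisionSingleStep modules above, `blank_id` is the last logit index by the usual RNNT convention, and `h`/`c` carry the prediction-network LSTM state:

    import torch

    def greedy_decode(encoder_out, decoder, joint_step, blank_id, h, c, start_token):
        emitted = []
        dec_out, h_next, c_next = decoder(
            torch.tensor([[start_token]]), torch.tensor([1]), h, c
        )
        for t in range(encoder_out.shape[-1]):           # encoder_out: [1, D, T]
            enc_step = encoder_out[:, :, t : t + 1]      # [1, D, 1]
            for _ in range(10):                          # cap symbols per frame
                token_id, _, _, _ = joint_step(enc_step, dec_out[:, :, :1])
                tok = int(token_id)
                if tok == blank_id:
                    break                                # blank: keep state, next frame
                emitted.append(tok)
                h, c = h_next, c_next                    # commit state on a real token
                dec_out, h_next, c_next = decoder(
                    torch.tensor([[tok]]), torch.tensor([1]), h, c
                )
        return emitted, h, c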
+
+ def _coreml_convert(
+     traced: torch.jit.ScriptModule,
+     inputs,
+     outputs,
+     settings: ExportSettings,
+     compute_units_override: Optional[ct.ComputeUnit] = None,
+     compute_precision: Optional[ct.precision] = None,
+ ) -> ct.models.MLModel:
+     cu = (
+         compute_units_override
+         if compute_units_override is not None
+         else settings.compute_units
+     )
+     kwargs = {
+         "convert_to": "mlprogram",
+         "inputs": inputs,
+         "outputs": outputs,
+         "compute_units": cu,
+     }
+     if settings.deployment_target is not None:
+         kwargs["minimum_deployment_target"] = settings.deployment_target
+
+     # Precision priority: explicit argument > settings
+     if compute_precision is not None:
+         kwargs["compute_precision"] = compute_precision
+     elif settings.compute_precision is not None:
+         kwargs["compute_precision"] = settings.compute_precision
+
+     # Log only after kwargs is fully assembled so the printout matches the call
+     print("Converting:", traced.__class__.__name__)
+     print("Conversion kwargs:", kwargs)
+     return ct.convert(traced, **kwargs)
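An illustrative call (hypothetical: the ExportSettings construction is assumed from the fields the function reads, while the tensor names and shapes match the joint_decision metadata emitted below):

    import coremltools as ct

    settings = ExportSettings(
        compute_units=ct.ComputeUnit.CPU_AND_NE,
        deployment_target=ct.target.iOS17,
        compute_precision=ct.precision.FLOAT16,
    )
    mlmodel = _coreml_convert(
        traced_joint_step,  # torch.jit.trace of JointDecisionSingleStep
        inputs=[
            ct.TensorType(name="encoder_step", shape=(1, 512, 1)),
            ct.TensorType(name="decoder_step", shape=(1, 640, 1)),
        ],
        outputs=[
            ct.TensorType(name="token_id"),
            ct.TensorType(name="token_prob"),
            ct.TensorType(name="top_k_ids"),
            ct.TensorType(name="top_k_logits"),
        ],
        settings=settings,
    )
    mlmodel.save("joint_decision.mlpackage")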
1280ms/joint_decision.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5bca32ad130dcad6605cc00044c752aa5b45ef57d14c17f2d1a2fa49d6cf55b5
+ size 243
1280ms/joint_decision.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22d4abc4625b935ee035b5f8ce7cb28d1041b9b01c12173e287bf4b5f5d99625
+ size 493
1280ms/joint_decision.mlmodelc/metadata.json ADDED
@@ -0,0 +1,112 @@
+ [
+   {
+     "metadataOutputVersion" : "3.0",
+     "shortDescription" : "Parakeet EOU single-step joint decision",
+     "outputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1 × 1 × 1)",
+         "shortDescription" : "",
+         "shape" : "[1, 1, 1]",
+         "name" : "token_id",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 1 × 1)",
+         "shortDescription" : "",
+         "shape" : "[1, 1, 1]",
+         "name" : "token_prob",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1 × 1 × 1 × 64)",
+         "shortDescription" : "",
+         "shape" : "[1, 1, 1, 64]",
+         "name" : "top_k_ids",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 1 × 1 × 64)",
+         "shortDescription" : "",
+         "shape" : "[1, 1, 1, 64]",
+         "name" : "top_k_logits",
+         "type" : "MultiArray"
+       }
+     ],
+     "storagePrecision" : "Float16",
+     "modelParameters" : [
+
+     ],
+     "author" : "Fluid Inference",
+     "specificationVersion" : 8,
+     "mlProgramOperationTypeHistogram" : {
+       "Ios17.reduceArgmax" : 1,
+       "Ios17.squeeze" : 1,
+       "Ios17.cast" : 6,
+       "Ios17.linear" : 3,
+       "Ios17.transpose" : 2,
+       "Ios17.add" : 1,
+       "Ios16.relu" : 1,
+       "Ios16.softmax" : 1,
+       "Ios17.gatherAlongAxis" : 1,
+       "Ios17.topk" : 1,
+       "Ios17.expandDims" : 3
+     },
+     "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
+     "isUpdatable" : "0",
+     "stateSchema" : [
+
+     ],
+     "availability" : {
+       "macOS" : "14.0",
+       "tvOS" : "17.0",
+       "visionOS" : "1.0",
+       "watchOS" : "10.0",
+       "iOS" : "17.0",
+       "macCatalyst" : "17.0"
+     },
+     "modelType" : {
+       "name" : "MLModelType_mlProgram"
+     },
+     "inputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 512 × 1)",
+         "shortDescription" : "",
+         "shape" : "[1, 512, 1]",
+         "name" : "encoder_step",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
+         "shortDescription" : "",
+         "shape" : "[1, 640, 1]",
+         "name" : "decoder_step",
+         "type" : "MultiArray"
+       }
+     ],
+     "userDefinedMetadata" : {
+       "com.github.apple.coremltools.source_dialect" : "TorchScript",
+       "com.github.apple.coremltools.version" : "8.3.0",
+       "com.github.apple.coremltools.source" : "torch==2.4.0"
+     },
+     "generatedClassName" : "parakeet_eou_joint_decision_single_step",
+     "method" : "predict"
+   }
+ ]
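The schema above maps directly onto a coremltools predict call (a sketch; CompiledMLModel loads .mlmodelc bundles in recent coremltools, while the .mlpackage variants load via ct.models.MLModel):

    import coremltools as ct
    import numpy as np

    model = ct.models.CompiledMLModel("1280ms/joint_decision.mlmodelc")
    out = model.predict({
        "encoder_step": np.zeros((1, 512, 1), dtype=np.float32),
        "decoder_step": np.zeros((1, 640, 1), dtype=np.float32),
    })
    print(out["token_id"].shape, out["top_k_logits"].shape)  # (1, 1, 1) and (1, 1, 1, 64)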
1280ms/joint_decision.mlmodelc/model.mil ADDED
@@ -0,0 +1,57 @@
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, 640, 1]> decoder_step, tensor<fp32, [1, 512, 1]> encoder_step) {
5
+ tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
6
+ tensor<string, []> encoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
7
+ tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
8
+ tensor<string, []> decoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("decoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
9
+ tensor<fp16, [640, 512]> joint_module_enc_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_weight_to_fp16"), val = tensor<fp16, [640, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
10
+ tensor<fp16, [640]> joint_module_enc_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(655488)))];
11
+ tensor<fp16, [1, 512, 1]> encoder_step_to_fp16 = cast(dtype = encoder_step_to_fp16_dtype_0, x = encoder_step)[name = tensor<string, []>("cast_8")];
12
+ tensor<fp16, [1, 1, 512]> input_1_cast_fp16 = transpose(perm = input_1_perm_0, x = encoder_step_to_fp16)[name = tensor<string, []>("transpose_1")];
13
+ tensor<fp16, [1, 1, 640]> linear_0_cast_fp16 = linear(bias = joint_module_enc_bias_to_fp16, weight = joint_module_enc_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("linear_0_cast_fp16")];
14
+ tensor<fp16, [640, 640]> joint_module_pred_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_weight_to_fp16"), val = tensor<fp16, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(656832)))];
15
+ tensor<fp16, [640]> joint_module_pred_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1476096)))];
16
+ tensor<fp16, [1, 640, 1]> decoder_step_to_fp16 = cast(dtype = decoder_step_to_fp16_dtype_0, x = decoder_step)[name = tensor<string, []>("cast_7")];
17
+ tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = decoder_step_to_fp16)[name = tensor<string, []>("transpose_0")];
18
+ tensor<fp16, [1, 1, 640]> linear_1_cast_fp16 = linear(bias = joint_module_pred_bias_to_fp16, weight = joint_module_pred_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("linear_1_cast_fp16")];
19
+ tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
20
+ tensor<fp16, [1, 1, 1, 640]> var_23_cast_fp16 = expand_dims(axes = var_23_axes_0, x = linear_0_cast_fp16)[name = tensor<string, []>("op_23_cast_fp16")];
21
+ tensor<int32, [1]> var_24_axes_0 = const()[name = tensor<string, []>("op_24_axes_0"), val = tensor<int32, [1]>([1])];
22
+ tensor<fp16, [1, 1, 1, 640]> var_24_cast_fp16 = expand_dims(axes = var_24_axes_0, x = linear_1_cast_fp16)[name = tensor<string, []>("op_24_cast_fp16")];
23
+ tensor<fp16, [1, 1, 1, 640]> input_5_cast_fp16 = add(x = var_23_cast_fp16, y = var_24_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
24
+ tensor<fp16, [1, 1, 1, 640]> input_7_cast_fp16 = relu(x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
25
+ tensor<fp16, [1027, 640]> joint_module_joint_net_2_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_weight_to_fp16"), val = tensor<fp16, [1027, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1477440)))];
26
+ tensor<fp16, [1027]> joint_module_joint_net_2_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_bias_to_fp16"), val = tensor<fp16, [1027]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2792064)))];
27
+ tensor<fp16, [1, 1, 1, 1027]> linear_2_cast_fp16 = linear(bias = joint_module_joint_net_2_bias_to_fp16, weight = joint_module_joint_net_2_weight_to_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("linear_2_cast_fp16")];
28
+ tensor<int32, []> var_38_axis_0 = const()[name = tensor<string, []>("op_38_axis_0"), val = tensor<int32, []>(-1)];
29
+ tensor<bool, []> var_38_keep_dims_0 = const()[name = tensor<string, []>("op_38_keep_dims_0"), val = tensor<bool, []>(false)];
30
+ tensor<string, []> var_38_output_dtype_0 = const()[name = tensor<string, []>("op_38_output_dtype_0"), val = tensor<string, []>("int32")];
31
+ tensor<int32, [1, 1, 1]> token_id = reduce_argmax(axis = var_38_axis_0, keep_dims = var_38_keep_dims_0, output_dtype = var_38_output_dtype_0, x = linear_2_cast_fp16)[name = tensor<string, []>("op_38_cast_fp16")];
32
+ tensor<int32, []> var_44 = const()[name = tensor<string, []>("op_44"), val = tensor<int32, []>(-1)];
33
+ tensor<fp16, [1, 1, 1, 1027]> token_probs_all_cast_fp16 = softmax(axis = var_44, x = linear_2_cast_fp16)[name = tensor<string, []>("token_probs_all_cast_fp16")];
34
+ tensor<int32, [1]> var_53_axes_0 = const()[name = tensor<string, []>("op_53_axes_0"), val = tensor<int32, [1]>([-1])];
35
+ tensor<int32, [1, 1, 1, 1]> var_53 = expand_dims(axes = var_53_axes_0, x = token_id)[name = tensor<string, []>("op_53")];
36
+ tensor<int32, []> var_54 = const()[name = tensor<string, []>("op_54"), val = tensor<int32, []>(-1)];
37
+ tensor<bool, []> var_56_validate_indices_0 = const()[name = tensor<string, []>("op_56_validate_indices_0"), val = tensor<bool, []>(false)];
38
+ tensor<string, []> var_53_to_int16_dtype_0 = const()[name = tensor<string, []>("op_53_to_int16_dtype_0"), val = tensor<string, []>("int16")];
39
+ tensor<int16, [1, 1, 1, 1]> var_53_to_int16 = cast(dtype = var_53_to_int16_dtype_0, x = var_53)[name = tensor<string, []>("cast_6")];
40
+ tensor<fp16, [1, 1, 1, 1]> var_56_cast_fp16_cast_int16 = gather_along_axis(axis = var_54, indices = var_53_to_int16, validate_indices = var_56_validate_indices_0, x = token_probs_all_cast_fp16)[name = tensor<string, []>("op_56_cast_fp16_cast_int16")];
41
+ tensor<int32, [1]> var_58_axes_0 = const()[name = tensor<string, []>("op_58_axes_0"), val = tensor<int32, [1]>([-1])];
42
+ tensor<fp16, [1, 1, 1]> var_58_cast_fp16 = squeeze(axes = var_58_axes_0, x = var_56_cast_fp16_cast_int16)[name = tensor<string, []>("op_58_cast_fp16")];
43
+ tensor<string, []> var_58_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_58_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
44
+ tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(64)];
45
+ tensor<int32, []> var_63_axis_0 = const()[name = tensor<string, []>("op_63_axis_0"), val = tensor<int32, []>(-1)];
46
+ tensor<bool, []> var_63_ascending_0 = const()[name = tensor<string, []>("op_63_ascending_0"), val = tensor<bool, []>(false)];
47
+ tensor<bool, []> var_63_sort_0 = const()[name = tensor<string, []>("op_63_sort_0"), val = tensor<bool, []>(true)];
48
+ tensor<bool, []> var_63_return_indices_0 = const()[name = tensor<string, []>("op_63_return_indices_0"), val = tensor<bool, []>(true)];
49
+ tensor<string, []> var_63_cast_fp16_cast_int16_output_indices_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_cast_int16_output_indices_dtype_0"), val = tensor<string, []>("uint16")];
50
+ tensor<fp16, [1, 1, 1, 64]> var_63_cast_fp16_cast_int16_0, tensor<uint16, [1, 1, 1, 64]> var_63_cast_fp16_cast_int16_1 = topk(ascending = var_63_ascending_0, axis = var_63_axis_0, k = var_59, output_indices_dtype = var_63_cast_fp16_cast_int16_output_indices_dtype_0, return_indices = var_63_return_indices_0, sort = var_63_sort_0, x = linear_2_cast_fp16)[name = tensor<string, []>("op_63_cast_fp16_cast_int16")];
51
+ tensor<string, []> var_63_cast_fp16_cast_int16_1_to_int32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_cast_int16_1_to_int32_dtype_0"), val = tensor<string, []>("int32")];
52
+ tensor<string, []> var_63_cast_fp16_0_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_0_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
53
+ tensor<fp32, [1, 1, 1, 64]> top_k_logits = cast(dtype = var_63_cast_fp16_0_to_fp32_dtype_0, x = var_63_cast_fp16_cast_int16_0)[name = tensor<string, []>("cast_3")];
54
+ tensor<int32, [1, 1, 1, 64]> top_k_ids = cast(dtype = var_63_cast_fp16_cast_int16_1_to_int32_dtype_0, x = var_63_cast_fp16_cast_int16_1)[name = tensor<string, []>("cast_4")];
55
+ tensor<fp32, [1, 1, 1]> token_prob = cast(dtype = var_58_cast_fp16_to_fp32_dtype_0, x = var_58_cast_fp16)[name = tensor<string, []>("cast_5")];
56
+ } -> (token_id, token_prob, top_k_ids, top_k_logits);
57
+ }
1280ms/joint_decision.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7039b2010a269153f5a96edf28637f921a86ef8822f248f2d6712f7a6bce84b4
+ size 2794182
1280ms/parakeet_eou_preprocessor.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4ada8b0b99ac1d2ba7acbffacfbbf1a06cb69d30e9410d237ee0aa4c2b0ad63
+ size 243
1280ms/parakeet_eou_preprocessor.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dc7252fa47622fe39577361233627062019a3bb740fdbb5366a7bae09df0ec5e
+ size 422
1280ms/parakeet_eou_preprocessor.mlmodelc/metadata.json ADDED
@@ -0,0 +1,105 @@
+ [
+   {
+     "metadataOutputVersion" : "3.0",
+     "shortDescription" : "Parakeet EOU preprocessor",
+     "outputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32)",
+         "shortDescription" : "",
+         "shape" : "[]",
+         "name" : "mel",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1)",
+         "shortDescription" : "",
+         "shape" : "[1]",
+         "name" : "mel_length",
+         "type" : "MultiArray"
+       }
+     ],
+     "storagePrecision" : "Float32",
+     "modelParameters" : [
+
+     ],
+     "author" : "Fluid Inference",
+     "specificationVersion" : 8,
+     "mlProgramOperationTypeHistogram" : {
+       "Range1d" : 1,
+       "Ios17.reshape" : 2,
+       "Identity" : 1,
+       "Ios17.matmul" : 1,
+       "Ios17.expandDims" : 5,
+       "Select" : 1,
+       "Ios17.add" : 3,
+       "Ios17.sliceByIndex" : 3,
+       "Ios16.reduceSum" : 1,
+       "Shape" : 1,
+       "Ios17.gather" : 1,
+       "Pad" : 1,
+       "Ios17.log" : 1,
+       "Ios17.conv" : 2,
+       "Ios17.sub" : 2,
+       "Ios17.pow" : 1,
+       "Ios17.cast" : 2,
+       "Stack" : 1,
+       "Ios17.concat" : 1,
+       "Ios17.floorDiv" : 1,
+       "Ios17.greaterEqual" : 1,
+       "Ios17.mul" : 1
+     },
+     "computePrecision" : "Mixed (Float32, Int32)",
+     "isUpdatable" : "0",
+     "stateSchema" : [
+
+     ],
+     "availability" : {
+       "macOS" : "14.0",
+       "tvOS" : "17.0",
+       "visionOS" : "1.0",
+       "watchOS" : "10.0",
+       "iOS" : "17.0",
+       "macCatalyst" : "17.0"
+     },
+     "modelType" : {
+       "name" : "MLModelType_mlProgram"
+     },
+     "inputSchema" : [
+       {
+         "dataType" : "Float32",
+         "hasShapeFlexibility" : "1",
+         "isOptional" : "0",
+         "shapeFlexibility" : "1 × 1...32000",
+         "shapeRange" : "[[1, 1], [1, 32000]]",
+         "formattedType" : "MultiArray (Float32 1 × 1)",
+         "type" : "MultiArray",
+         "shape" : "[1, 1]",
+         "name" : "audio_signal",
+         "shortDescription" : ""
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1)",
+         "shortDescription" : "",
+         "shape" : "[1]",
+         "name" : "audio_length",
+         "type" : "MultiArray"
+       }
+     ],
+     "userDefinedMetadata" : {
+       "com.github.apple.coremltools.source_dialect" : "TorchScript",
+       "com.github.apple.coremltools.source" : "torch==2.4.0",
+       "com.github.apple.coremltools.version" : "8.3.0"
+     },
+     "generatedClassName" : "parakeet_eou_preprocessor",
+     "method" : "predict"
+   }
+ ]
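The flexible audio_signal shape accepts up to 2 s of 16 kHz audio; with the 160-sample hop visible in the MIL below, 20480 samples should yield 20480 / 160 + 1 = 129 mel frames, matching the streaming encoder's [1, 128, 129] input. A sketch:

    import coremltools as ct
    import numpy as np

    pre = ct.models.CompiledMLModel("1280ms/parakeet_eou_preprocessor.mlmodelc")
    samples = 20480                                  # 1.28 s at 16 kHz, within 1...32000
    out = pre.predict({
        "audio_signal": np.random.randn(1, samples).astype(np.float32),
        "audio_length": np.array([samples], dtype=np.int32),
    })
    print(out["mel"].shape, out["mel_length"])       # expect (1, 128, 129) and [129]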
1280ms/parakeet_eou_preprocessor.mlmodelc/model.mil ADDED
@@ -0,0 +1,96 @@
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<int32, [1]> audio_length, tensor<fp32, [1, ?]> audio_signal) [FlexibleShapeInformation = tuple<tuple<tensor<string, []>, dict<tensor<string, []>, tensor<int32, [?]>>>, tuple<tensor<string, []>, dict<tensor<string, []>, list<tensor<int32, [2]>, ?>>>>((("DefaultShapes", {{"audio_signal", [1, 1]}}), ("RangeDims", {{"audio_signal", [[1, 1], [1, 32000]]}})))] {
5
+ tensor<int32, []> var_9 = const()[name = tensor<string, []>("op_9"), val = tensor<int32, []>(1)];
6
+ tensor<int32, []> var_10 = const()[name = tensor<string, []>("op_10"), val = tensor<int32, []>(160)];
7
+ tensor<int32, []> var_32 = const()[name = tensor<string, []>("op_32"), val = tensor<int32, []>(512)];
8
+ tensor<int32, [1]> var_33 = add(x = audio_length, y = var_32)[name = tensor<string, []>("op_33")];
9
+ tensor<int32, []> var_34 = const()[name = tensor<string, []>("op_34"), val = tensor<int32, []>(512)];
10
+ tensor<int32, [1]> var_35 = sub(x = var_33, y = var_34)[name = tensor<string, []>("op_35")];
11
+ tensor<int32, [1]> floor_div_0 = floor_div(x = var_35, y = var_10)[name = tensor<string, []>("floor_div_0")];
12
+ tensor<string, []> var_36_dtype_0 = const()[name = tensor<string, []>("op_36_dtype_0"), val = tensor<string, []>("fp32")];
13
+ tensor<fp32, []> var_37_promoted = const()[name = tensor<string, []>("op_37_promoted"), val = tensor<fp32, []>(0x1p+0)];
14
+ tensor<fp32, [1]> var_36 = cast(dtype = var_36_dtype_0, x = floor_div_0)[name = tensor<string, []>("cast_11")];
15
+ tensor<fp32, [1]> seq_len_1 = add(x = var_36, y = var_37_promoted)[name = tensor<string, []>("seq_len_1")];
16
+ tensor<string, []> cast_2_dtype_0 = const()[name = tensor<string, []>("cast_2_dtype_0"), val = tensor<string, []>("int32")];
17
+ tensor<int32, [2]> var_41_begin_0 = const()[name = tensor<string, []>("op_41_begin_0"), val = tensor<int32, [2]>([0, 0])];
18
+ tensor<int32, [2]> var_41_end_0 = const()[name = tensor<string, []>("op_41_end_0"), val = tensor<int32, [2]>([1, 1])];
19
+ tensor<bool, [2]> var_41_end_mask_0 = const()[name = tensor<string, []>("op_41_end_mask_0"), val = tensor<bool, [2]>([true, false])];
20
+ tensor<bool, [2]> var_41_squeeze_mask_0 = const()[name = tensor<string, []>("op_41_squeeze_mask_0"), val = tensor<bool, [2]>([false, true])];
21
+ tensor<fp32, [1]> var_41 = slice_by_index(begin = var_41_begin_0, end = var_41_end_0, end_mask = var_41_end_mask_0, squeeze_mask = var_41_squeeze_mask_0, x = audio_signal)[name = tensor<string, []>("op_41")];
22
+ tensor<int32, [1]> var_42_axes_0 = const()[name = tensor<string, []>("op_42_axes_0"), val = tensor<int32, [1]>([1])];
23
+ tensor<fp32, [1, 1]> var_42 = expand_dims(axes = var_42_axes_0, x = var_41)[name = tensor<string, []>("op_42")];
24
+ tensor<int32, [2]> var_44_begin_0 = const()[name = tensor<string, []>("op_44_begin_0"), val = tensor<int32, [2]>([0, 1])];
25
+ tensor<int32, [2]> var_44_end_0 = const()[name = tensor<string, []>("op_44_end_0"), val = tensor<int32, [2]>([1, 0])];
26
+ tensor<bool, [2]> var_44_end_mask_0 = const()[name = tensor<string, []>("op_44_end_mask_0"), val = tensor<bool, [2]>([true, true])];
27
+ tensor<fp32, [1, ?]> var_44 = slice_by_index(begin = var_44_begin_0, end = var_44_end_0, end_mask = var_44_end_mask_0, x = audio_signal)[name = tensor<string, []>("op_44")];
28
+ tensor<int32, [2]> var_46_begin_0 = const()[name = tensor<string, []>("op_46_begin_0"), val = tensor<int32, [2]>([0, 0])];
29
+ tensor<int32, [2]> var_46_end_0 = const()[name = tensor<string, []>("op_46_end_0"), val = tensor<int32, [2]>([1, -1])];
30
+ tensor<bool, [2]> var_46_end_mask_0 = const()[name = tensor<string, []>("op_46_end_mask_0"), val = tensor<bool, [2]>([true, false])];
31
+ tensor<fp32, [1, ?]> var_46 = slice_by_index(begin = var_46_begin_0, end = var_46_end_0, end_mask = var_46_end_mask_0, x = audio_signal)[name = tensor<string, []>("op_46")];
32
+ tensor<fp32, []> var_47 = const()[name = tensor<string, []>("op_47"), val = tensor<fp32, []>(0x1.f0a3d8p-1)];
33
+ tensor<fp32, [1, ?]> var_48 = mul(x = var_46, y = var_47)[name = tensor<string, []>("op_48")];
34
+ tensor<fp32, [1, ?]> var_49 = sub(x = var_44, y = var_48)[name = tensor<string, []>("op_49")];
35
+ tensor<bool, []> input_1_interleave_0 = const()[name = tensor<string, []>("input_1_interleave_0"), val = tensor<bool, []>(false)];
36
+ tensor<fp32, [1, ?]> input_1 = concat(axis = var_9, interleave = input_1_interleave_0, values = (var_42, var_49))[name = tensor<string, []>("input_1")];
37
+ tensor<int32, [3]> concat_0x = const()[name = tensor<string, []>("concat_0x"), val = tensor<int32, [3]>([1, 1, -1])];
38
+ tensor<fp32, [1, 1, ?]> input_3 = reshape(shape = concat_0x, x = input_1)[name = tensor<string, []>("input_3")];
39
+ tensor<fp32, []> const_1 = const()[name = tensor<string, []>("const_1"), val = tensor<fp32, []>(0x0p+0)];
40
+ tensor<int32, [6]> input_5_pad_0 = const()[name = tensor<string, []>("input_5_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 256, 256])];
41
+ tensor<string, []> input_5_mode_0 = const()[name = tensor<string, []>("input_5_mode_0"), val = tensor<string, []>("reflect")];
42
+ tensor<fp32, [1, 1, ?]> input_5 = pad(constant_val = const_1, mode = input_5_mode_0, pad = input_5_pad_0, x = input_3)[name = tensor<string, []>("input_5")];
43
+ tensor<int32, [2]> concat_1x = const()[name = tensor<string, []>("concat_1x"), val = tensor<int32, [2]>([1, -1])];
44
+ tensor<fp32, [1, ?]> input = reshape(shape = concat_1x, x = input_5)[name = tensor<string, []>("input")];
45
+ tensor<fp32, [257, 1, 512]> expand_dims_1 = const()[name = tensor<string, []>("expand_dims_1"), val = tensor<fp32, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
46
+ tensor<fp32, [257, 1, 512]> expand_dims_2 = const()[name = tensor<string, []>("expand_dims_2"), val = tensor<fp32, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(526464)))];
47
+ tensor<int32, [1]> expand_dims_3 = const()[name = tensor<string, []>("expand_dims_3"), val = tensor<int32, [1]>([160])];
48
+ tensor<int32, [1]> expand_dims_4_axes_0 = const()[name = tensor<string, []>("expand_dims_4_axes_0"), val = tensor<int32, [1]>([1])];
49
+ tensor<fp32, [1, 1, ?]> expand_dims_4 = expand_dims(axes = expand_dims_4_axes_0, x = input)[name = tensor<string, []>("expand_dims_4")];
50
+ tensor<string, []> conv_0_pad_type_0 = const()[name = tensor<string, []>("conv_0_pad_type_0"), val = tensor<string, []>("valid")];
51
+ tensor<int32, [2]> conv_0_pad_0 = const()[name = tensor<string, []>("conv_0_pad_0"), val = tensor<int32, [2]>([0, 0])];
52
+ tensor<int32, [1]> conv_0_dilations_0 = const()[name = tensor<string, []>("conv_0_dilations_0"), val = tensor<int32, [1]>([1])];
53
+ tensor<int32, []> conv_0_groups_0 = const()[name = tensor<string, []>("conv_0_groups_0"), val = tensor<int32, []>(1)];
54
+ tensor<fp32, [1, 257, ?]> conv_0 = conv(dilations = conv_0_dilations_0, groups = conv_0_groups_0, pad = conv_0_pad_0, pad_type = conv_0_pad_type_0, strides = expand_dims_3, weight = expand_dims_1, x = expand_dims_4)[name = tensor<string, []>("conv_0")];
55
+ tensor<string, []> conv_1_pad_type_0 = const()[name = tensor<string, []>("conv_1_pad_type_0"), val = tensor<string, []>("valid")];
56
+ tensor<int32, [2]> conv_1_pad_0 = const()[name = tensor<string, []>("conv_1_pad_0"), val = tensor<int32, [2]>([0, 0])];
57
+ tensor<int32, [1]> conv_1_dilations_0 = const()[name = tensor<string, []>("conv_1_dilations_0"), val = tensor<int32, [1]>([1])];
58
+ tensor<int32, []> conv_1_groups_0 = const()[name = tensor<string, []>("conv_1_groups_0"), val = tensor<int32, []>(1)];
59
+ tensor<fp32, [1, 257, ?]> conv_1 = conv(dilations = conv_1_dilations_0, groups = conv_1_groups_0, pad = conv_1_pad_0, pad_type = conv_1_pad_type_0, strides = expand_dims_3, weight = expand_dims_2, x = expand_dims_4)[name = tensor<string, []>("conv_1")];
60
+ tensor<int32, []> stack_0_axis_0 = const()[name = tensor<string, []>("stack_0_axis_0"), val = tensor<int32, []>(-1)];
61
+ tensor<fp32, [1, 257, ?, 2]> stack_0 = stack(axis = stack_0_axis_0, values = (conv_0, conv_1))[name = tensor<string, []>("stack_0")];
62
+ tensor<fp32, []> var_17_promoted = const()[name = tensor<string, []>("op_17_promoted"), val = tensor<fp32, []>(0x1p+1)];
63
+ tensor<fp32, [1, 257, ?, 2]> var_65 = pow(x = stack_0, y = var_17_promoted)[name = tensor<string, []>("op_65")];
64
+ tensor<int32, [1]> var_67_axes_0 = const()[name = tensor<string, []>("op_67_axes_0"), val = tensor<int32, [1]>([-1])];
65
+ tensor<bool, []> var_67_keep_dims_0 = const()[name = tensor<string, []>("op_67_keep_dims_0"), val = tensor<bool, []>(false)];
66
+ tensor<fp32, [1, 257, ?]> var_67 = reduce_sum(axes = var_67_axes_0, keep_dims = var_67_keep_dims_0, x = var_65)[name = tensor<string, []>("op_67")];
67
+ tensor<fp32, [1, 257, ?]> x_9 = identity(x = var_67)[name = tensor<string, []>("x_9")];
68
+ tensor<fp32, [1, 128, 257]> const_2 = const()[name = tensor<string, []>("const_2"), val = tensor<fp32, [1, 128, 257]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1052864)))];
69
+ tensor<bool, []> x_11_transpose_x_0 = const()[name = tensor<string, []>("x_11_transpose_x_0"), val = tensor<bool, []>(false)];
70
+ tensor<bool, []> x_11_transpose_y_0 = const()[name = tensor<string, []>("x_11_transpose_y_0"), val = tensor<bool, []>(false)];
71
+ tensor<fp32, [1, 128, ?]> x_11 = matmul(transpose_x = x_11_transpose_x_0, transpose_y = x_11_transpose_y_0, x = const_2, y = x_9)[name = tensor<string, []>("x_11")];
72
+ tensor<fp32, []> var_74 = const()[name = tensor<string, []>("op_74"), val = tensor<fp32, []>(0x1p-24)];
73
+ tensor<fp32, [1, 128, ?]> var_75 = add(x = x_11, y = var_74)[name = tensor<string, []>("op_75")];
74
+ tensor<fp32, []> x_epsilon_0 = const()[name = tensor<string, []>("x_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
75
+ tensor<fp32, [1, 128, ?]> x = log(epsilon = x_epsilon_0, x = var_75)[name = tensor<string, []>("x")];
76
+ tensor<int32, [3]> var_77_shape = shape(x = x)[name = tensor<string, []>("op_77_shape")];
77
+ tensor<int32, []> select_4 = const()[name = tensor<string, []>("select_4"), val = tensor<int32, []>(2)];
78
+ tensor<int32, []> gather_4_axis_0 = const()[name = tensor<string, []>("gather_4_axis_0"), val = tensor<int32, []>(0)];
79
+ tensor<int32, []> gather_4_batch_dims_0 = const()[name = tensor<string, []>("gather_4_batch_dims_0"), val = tensor<int32, []>(0)];
80
+ tensor<bool, []> gather_4_validate_indices_0 = const()[name = tensor<string, []>("gather_4_validate_indices_0"), val = tensor<bool, []>(false)];
81
+ tensor<int32, []> gather_4 = gather(axis = gather_4_axis_0, batch_dims = gather_4_batch_dims_0, indices = select_4, validate_indices = gather_4_validate_indices_0, x = var_77_shape)[name = tensor<string, []>("gather_4")];
82
+ tensor<int32, []> const_3 = const()[name = tensor<string, []>("const_3"), val = tensor<int32, []>(0)];
83
+ tensor<int32, []> const_4 = const()[name = tensor<string, []>("const_4"), val = tensor<int32, []>(1)];
84
+ tensor<int32, [?]> mask_1 = range_1d(end = gather_4, start = const_3, step = const_4)[name = tensor<string, []>("mask_1")];
85
+ tensor<int32, [1]> expand_dims_0_axes_0 = const()[name = tensor<string, []>("expand_dims_0_axes_0"), val = tensor<int32, [1]>([0])];
86
+ tensor<int32, [1, ?]> expand_dims_0 = expand_dims(axes = expand_dims_0_axes_0, x = mask_1)[name = tensor<string, []>("expand_dims_0")];
87
+ tensor<int32, [1]> var_82_axes_0 = const()[name = tensor<string, []>("op_82_axes_0"), val = tensor<int32, [1]>([1])];
88
+ tensor<int32, [1]> mel_length = cast(dtype = cast_2_dtype_0, x = seq_len_1)[name = tensor<string, []>("cast_10")];
89
+ tensor<int32, [1, 1]> var_82 = expand_dims(axes = var_82_axes_0, x = mel_length)[name = tensor<string, []>("op_82")];
90
+ tensor<bool, [1, ?]> mask = greater_equal(x = expand_dims_0, y = var_82)[name = tensor<string, []>("mask")];
91
+ tensor<int32, [1]> var_84_axes_0 = const()[name = tensor<string, []>("op_84_axes_0"), val = tensor<int32, [1]>([1])];
92
+ tensor<bool, [1, 1, ?]> var_84 = expand_dims(axes = var_84_axes_0, x = mask)[name = tensor<string, []>("op_84")];
93
+ tensor<fp32, []> cast_7 = const()[name = tensor<string, []>("cast_7"), val = tensor<fp32, []>(0x0p+0)];
94
+ tensor<fp32, [1, 128, ?]> mel = select(a = cast_7, b = x, cond = var_84)[name = tensor<string, []>("processed_signal")];
95
+ } -> (mel, mel_length);
96
+ }
1280ms/parakeet_eou_preprocessor.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:009bba4fde82dc55db9b55d77cf3ba5f791ce366c49f079285fe25a3b6e2291d
+ size 1184512
1280ms/streaming_encoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0a3c84022a9d2dc769d38cf8f45e93423e20734d092e3c16db11fbf6dca4004
+ size 243
1280ms/streaming_encoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:41ce3f96c3d6b3333796fc4ed82cb0c9b4ea99396b88f8eec3ba24394ba2bb78
+ size 671
1280ms/streaming_encoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,187 @@
+ [
+   {
+     "metadataOutputVersion" : "3.0",
+     "storagePrecision" : "Float16",
+     "outputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 512 × 17)",
+         "shortDescription" : "",
+         "shape" : "[1, 512, 17]",
+         "name" : "encoded_output",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1)",
+         "shortDescription" : "",
+         "shape" : "[1]",
+         "name" : "encoded_length",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 128 × 16)",
+         "shortDescription" : "",
+         "shape" : "[1, 128, 16]",
+         "name" : "new_pre_cache",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 17 × 1 × 70 × 512)",
+         "shortDescription" : "",
+         "shape" : "[17, 1, 70, 512]",
+         "name" : "new_cache_last_channel",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 17 × 1 × 512 × 8)",
+         "shortDescription" : "",
+         "shape" : "[17, 1, 512, 8]",
+         "name" : "new_cache_last_time",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1)",
+         "shortDescription" : "",
+         "shape" : "[1]",
+         "name" : "new_cache_last_channel_len",
+         "type" : "MultiArray"
+       }
+     ],
+     "modelParameters" : [
+
+     ],
+     "specificationVersion" : 8,
+     "mlProgramOperationTypeHistogram" : {
+       "Ios17.floor" : 3,
+       "Ios17.logicalAnd" : 3,
+       "Ios17.reshape" : 103,
+       "Ios16.softmax" : 17,
+       "Ios17.matmul" : 51,
+       "Ios17.transpose" : 157,
+       "Split" : 17,
+       "Ios17.expandDims" : 6,
+       "Select" : 51,
+       "Ios17.add" : 126,
+       "Tile" : 1,
+       "Ios17.sliceByIndex" : 106,
+       "Ios16.sigmoid" : 17,
+       "Pad" : 20,
+       "Ios17.logicalNot" : 2,
+       "Ios17.layerNorm" : 102,
+       "Ios17.less" : 1,
+       "Ios17.sub" : 1,
+       "Ios17.conv" : 56,
+       "Ios17.clip" : 2,
+       "Ios16.relu" : 3,
+       "Ios17.linear" : 137,
+       "Ios17.concat" : 52,
+       "Ios17.greaterEqual" : 1,
+       "Ios17.cast" : 14,
+       "Ios16.silu" : 51,
+       "Stack" : 2,
+       "Ios17.mul" : 72
+     },
+     "computePrecision" : "Mixed (Float16, Float32, Int32)",
+     "isUpdatable" : "0",
+     "stateSchema" : [
+
+     ],
+     "availability" : {
+       "macOS" : "14.0",
+       "tvOS" : "17.0",
+       "visionOS" : "1.0",
+       "watchOS" : "10.0",
+       "iOS" : "17.0",
+       "macCatalyst" : "17.0"
+     },
+     "modelType" : {
+       "name" : "MLModelType_mlProgram"
+     },
+     "userDefinedMetadata" : {
+       "com.github.apple.coremltools.source_dialect" : "TorchScript",
+       "com.github.apple.coremltools.source" : "torch==2.4.0",
+       "com.github.apple.coremltools.version" : "8.3.0"
+     },
+     "inputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 128 × 129)",
+         "shortDescription" : "",
+         "shape" : "[1, 128, 129]",
+         "name" : "audio_signal",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1)",
+         "shortDescription" : "",
+         "shape" : "[1]",
+         "name" : "audio_length",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 128 × 16)",
+         "shortDescription" : "",
+         "shape" : "[1, 128, 16]",
+         "name" : "pre_cache",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 17 × 1 × 70 × 512)",
+         "shortDescription" : "",
+         "shape" : "[17, 1, 70, 512]",
+         "name" : "cache_last_channel",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 17 × 1 × 512 × 8)",
+         "shortDescription" : "",
+         "shape" : "[17, 1, 512, 8]",
+         "name" : "cache_last_time",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1)",
+         "shortDescription" : "",
+         "shape" : "[1]",
+         "name" : "cache_last_channel_len",
+         "type" : "MultiArray"
+       }
+     ],
+     "generatedClassName" : "streaming_encoder",
+     "method" : "predict"
+   }
+ ]
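The cache tensors chain across calls: each chunk's new_* outputs become the next chunk's inputs. A minimal sketch of that loop (zero-initialized caches as the cold start are an assumption here, and `mel_chunks` is a hypothetical iterable of (1, 128, 129) float32 arrays from the preprocessor):

    import coremltools as ct
    import numpy as np

    enc = ct.models.CompiledMLModel("1280ms/streaming_encoder.mlmodelc")
    state = {
        "pre_cache": np.zeros((1, 128, 16), dtype=np.float32),
        "cache_last_channel": np.zeros((17, 1, 70, 512), dtype=np.float32),
        "cache_last_time": np.zeros((17, 1, 512, 8), dtype=np.float32),
        "cache_last_channel_len": np.zeros((1,), dtype=np.int32),
    }
    for mel_chunk in mel_chunks:
        out = enc.predict({
            "audio_signal": mel_chunk,
            "audio_length": np.array([mel_chunk.shape[-1]], dtype=np.int32),
            **state,
        })
        encoded = out["encoded_output"]              # (1, 512, 17) frames per chunk
        state = {
            "pre_cache": out["new_pre_cache"],
            "cache_last_channel": out["new_cache_last_channel"],
            "cache_last_time": out["new_cache_last_time"],
            "cache_last_channel_len": out["new_cache_last_channel_len"],
        }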
1280ms/streaming_encoder.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
1280ms/streaming_encoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c71acb590ceb2af449de5c7e3516e76057eaf4589d1f16edba774831db74b17
+ size 213179200
1280ms/streaming_encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9181d223091aa592cb656d49346e640a38ec2426de5ec2d06edbc14e92b8968b
+ size 508252
1280ms/streaming_encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c71acb590ceb2af449de5c7e3516e76057eaf4589d1f16edba774831db74b17
+ size 213179200
1280ms/streaming_encoder.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "fileFormatVersion": "1.0.0",
+   "itemInfoEntries": {
+     "468B5E19-6BA9-478C-8D2A-23953ACBD5E3": {
+       "author": "com.apple.CoreML",
+       "description": "CoreML Model Specification",
+       "name": "model.mlmodel",
+       "path": "com.apple.CoreML/model.mlmodel"
+     },
+     "F22DE286-FE1A-4BBD-A7ED-B0130595DAF3": {
+       "author": "com.apple.CoreML",
+       "description": "CoreML Model Weights",
+       "name": "weights",
+       "path": "com.apple.CoreML/weights"
+     }
+   },
+   "rootModelIdentifier": "468B5E19-6BA9-478C-8D2A-23953ACBD5E3"
+ }
1280ms/vocab.json ADDED
@@ -0,0 +1,1028 @@
1
+ {
2
+ "0": "<unk>",
3
+ "1": "▁t",
4
+ "2": "▁th",
5
+ "3": "▁a",
6
+ "4": "▁i",
7
+ "5": "▁the",
8
+ "6": "▁s",
9
+ "7": "re",
10
+ "8": "▁w",
11
+ "9": "▁o",
12
+ "10": "in",
13
+ "11": "at",
14
+ "12": "er",
15
+ "13": "nd",
16
+ "14": "ou",
17
+ "15": "▁c",
18
+ "16": "▁b",
19
+ "17": "▁h",
20
+ "18": "en",
21
+ "19": "on",
22
+ "20": "▁m",
23
+ "21": "▁f",
24
+ "22": "ing",
25
+ "23": "▁p",
26
+ "24": "▁to",
27
+ "25": "▁and",
28
+ "26": "▁d",
29
+ "27": "an",
30
+ "28": "or",
31
+ "29": "es",
32
+ "30": "▁y",
33
+ "31": "▁l",
34
+ "32": "▁of",
35
+ "33": "ll",
36
+ "34": "▁in",
37
+ "35": "ed",
38
+ "36": "it",
39
+ "37": "▁g",
40
+ "38": "is",
41
+ "39": "▁you",
42
+ "40": "▁n",
43
+ "41": "ar",
44
+ "42": "om",
45
+ "43": "as",
46
+ "44": "ve",
47
+ "45": "▁e",
48
+ "46": "ic",
49
+ "47": "▁it",
50
+ "48": "al",
51
+ "49": "us",
52
+ "50": "▁wh",
53
+ "51": "▁we",
54
+ "52": "▁be",
55
+ "53": "ion",
56
+ "54": "ow",
57
+ "55": "le",
58
+ "56": "▁is",
59
+ "57": "et",
60
+ "58": "ent",
61
+ "59": "ot",
62
+ "60": "ut",
63
+ "61": "▁re",
64
+ "62": "▁on",
65
+ "63": "ay",
66
+ "64": "▁ha",
67
+ "65": "ig",
68
+ "66": "▁so",
69
+ "67": "ct",
70
+ "68": "▁he",
71
+ "69": "▁for",
72
+ "70": "ver",
73
+ "71": "ke",
74
+ "72": "ro",
75
+ "73": "▁st",
76
+ "74": "id",
77
+ "75": "▁go",
78
+ "76": "all",
79
+ "77": "se",
80
+ "78": "ly",
81
+ "79": "▁u",
82
+ "80": "ch",
83
+ "81": "st",
84
+ "82": "ld",
85
+ "83": "▁k",
86
+ "84": "ce",
87
+ "85": "ur",
88
+ "86": "▁li",
89
+ "87": "am",
90
+ "88": "▁r",
91
+ "89": "ht",
92
+ "90": "▁j",
93
+ "91": "ith",
94
+ "92": "▁se",
95
+ "93": "ir",
96
+ "94": "▁as",
97
+ "95": "▁an",
98
+ "96": "im",
99
+ "97": "▁do",
100
+ "98": "ad",
101
+ "99": "▁was",
102
+ "100": "ight",
103
+ "101": "th",
104
+ "102": "▁are",
105
+ "103": "▁but",
106
+ "104": "▁sh",
107
+ "105": "ust",
108
+ "106": "ally",
109
+ "107": "▁not",
110
+ "108": "▁or",
111
+ "109": "▁com",
112
+ "110": "▁can",
113
+ "111": "▁me",
114
+ "112": "op",
115
+ "113": "▁mo",
116
+ "114": "▁at",
117
+ "115": "ill",
118
+ "116": "▁ch",
119
+ "117": "▁ne",
120
+ "118": "ant",
121
+ "119": "▁de",
122
+ "120": "▁kn",
123
+ "121": "▁one",
124
+ "122": "il",
125
+ "123": "ol",
126
+ "124": "▁con",
127
+ "125": "ter",
128
+ "126": "▁ab",
129
+ "127": "▁fr",
130
+ "128": "ere",
131
+ "129": "ck",
132
+ "130": "▁al",
133
+ "131": "▁all",
134
+ "132": "qu",
135
+ "133": "▁pro",
136
+ "134": "▁som",
137
+ "135": "ould",
138
+ "136": "▁tw",
139
+ "137": "ul",
140
+ "138": "ra",
141
+ "139": "od",
142
+ "140": "ers",
143
+ "141": "▁su",
144
+ "142": "ive",
145
+ "143": "▁v",
146
+ "144": "use",
147
+ "145": "ate",
148
+ "146": "ge",
149
+ "147": "if",
150
+ "148": "▁ex",
151
+ "149": "ess",
152
+ "150": "pp",
153
+ "151": "▁lo",
154
+ "152": "out",
155
+ "153": "▁if",
156
+ "154": "est",
157
+ "155": "ain",
158
+ "156": "ist",
159
+ "157": "and",
160
+ "158": "ea",
161
+ "159": "very",
162
+ "160": "art",
163
+ "161": "▁wor",
164
+ "162": "▁my",
165
+ "163": "ab",
166
+ "164": "ment",
167
+ "165": "▁bec",
168
+ "166": "un",
169
+ "167": "ity",
170
+ "168": "ri",
171
+ "169": "pe",
172
+ "170": "ions",
173
+ "171": "▁by",
174
+ "172": "ok",
175
+ "173": "our",
176
+ "174": "ort",
177
+ "175": "ind",
178
+ "176": "ink",
179
+ "177": "nt",
180
+ "178": "▁up",
181
+ "179": "um",
182
+ "180": "▁don",
183
+ "181": "▁get",
184
+ "182": "red",
185
+ "183": "▁out",
186
+ "184": "el",
187
+ "185": "ause",
188
+ "186": "res",
189
+ "187": "▁ma",
190
+ "188": "ich",
191
+ "189": "▁us",
192
+ "190": "rou",
193
+ "191": "▁int",
194
+ "192": "em",
195
+ "193": "os",
196
+ "194": "ies",
197
+ "195": "ie",
198
+ "196": "▁pl",
199
+ "197": "▁tr",
200
+ "198": "ven",
201
+ "199": "ous",
202
+ "200": "▁le",
203
+ "201": "▁two",
204
+ "202": "ard",
205
+ "203": "ine",
206
+ "204": "▁co",
207
+ "205": "een",
208
+ "206": "▁now",
209
+ "207": "ty",
210
+ "208": "her",
211
+ "209": "ack",
212
+ "210": "▁pe",
213
+ "211": "ame",
214
+ "212": "▁how",
215
+ "213": "▁who",
216
+ "214": "▁see",
217
+ "215": "▁tim",
218
+ "216": "ect",
219
+ "217": "ast",
220
+ "218": "▁our",
221
+ "219": "ci",
222
+ "220": "ree",
223
+ "221": "ople",
224
+ "222": "gh",
225
+ "223": "▁no",
226
+ "224": "▁had",
227
+ "225": "▁man",
228
+ "226": "▁qu",
229
+ "227": "▁en",
230
+ "228": "ide",
231
+ "229": "ure",
232
+ "230": "ud",
233
+ "231": "so",
234
+ "232": "▁his",
235
+ "233": "▁sa",
236
+ "234": "▁sp",
237
+ "235": "▁say",
238
+ "236": "ose",
239
+ "237": "ther",
240
+ "238": "▁act",
241
+ "239": "▁ta",
242
+ "240": "▁cl",
243
+ "241": "ings",
244
+ "242": "pt",
245
+ "243": "king",
246
+ "244": "▁any",
247
+ "245": "▁has",
248
+ "246": "▁un",
249
+ "247": "iv",
250
+ "248": "▁im",
251
+ "249": "▁ag",
252
+ "250": "▁te",
253
+ "251": "▁fe",
254
+ "252": "one",
255
+ "253": "per",
256
+ "254": "ong",
257
+ "255": "▁po",
258
+ "256": "▁ad",
259
+ "257": "ff",
260
+ "258": "ore",
261
+ "259": "itt",
262
+ "260": "ans",
263
+ "261": "iz",
264
+ "262": "eah",
265
+ "263": "reat",
266
+ "264": "act",
267
+ "265": "own",
268
+ "266": "hing",
269
+ "267": "enty",
270
+ "268": "age",
271
+ "269": "ber",
272
+ "270": "ice",
273
+ "271": "▁am",
274
+ "272": "ple",
275
+ "273": "are",
276
+ "274": "▁per",
277
+ "275": "und",
278
+ "276": "ite",
279
+ "277": "ix",
280
+ "278": "pl",
281
+ "279": "▁way",
282
+ "280": "▁did",
283
+ "281": "▁pr",
284
+ "282": "▁got",
285
+ "283": "ars",
286
+ "284": "▁she",
287
+ "285": "▁let",
288
+ "286": "ag",
289
+ "287": "▁ac",
290
+ "288": "int",
291
+ "289": "▁ar",
292
+ "290": "ry",
293
+ "291": "ign",
294
+ "292": "ish",
295
+ "293": "▁fir",
296
+ "294": "ace",
297
+ "295": "ble",
298
+ "296": "og",
299
+ "297": "ue",
300
+ "298": "▁ye",
301
+ "299": "ap",
302
+ "300": "iff",
303
+ "301": "▁ro",
304
+ "302": "▁her",
305
+ "303": "nder",
306
+ "304": "▁ok",
307
+ "305": "▁res",
308
+ "306": "▁gu",
309
+ "307": "ence",
310
+ "308": "▁may",
311
+ "309": "ated",
312
+ "310": "ip",
313
+ "311": "▁bo",
314
+ "312": "▁him",
315
+ "313": "way",
316
+ "314": "ac",
317
+ "315": "ical",
318
+ "316": "ass",
319
+ "317": "ase",
320
+ "318": "▁dis",
321
+ "319": "able",
322
+ "320": "ick",
323
+ "321": "▁app",
324
+ "322": "ance",
325
+ "323": "▁pre",
326
+ "324": "▁six",
327
+ "325": "▁off",
328
+ "326": "▁new",
329
+ "327": "ia",
330
+ "328": "orm",
331
+ "329": "ank",
332
+ "330": "▁lot",
333
+ "331": "ach",
334
+ "332": "▁fo",
335
+ "333": "inet",
336
+ "334": "ire",
337
+ "335": "ary",
338
+ "336": "ult",
339
+ "337": "▁tal",
340
+ "338": "▁mu",
341
+ "339": "▁bl",
342
+ "340": "ount",
343
+ "341": "sel",
344
+ "342": "vel",
345
+ "343": "▁br",
346
+ "344": "▁imp",
347
+ "345": "ep",
348
+ "346": "cess",
349
+ "347": "ord",
350
+ "348": "▁sc",
351
+ "349": "▁inc",
352
+ "350": "ound",
353
+ "351": "ang",
354
+ "352": "be",
355
+ "353": "ress",
356
+ "354": "uct",
357
+ "355": "▁ind",
358
+ "356": "▁af",
359
+ "357": "ving",
360
+ "358": "▁oh",
361
+ "359": "▁bet",
362
+ "360": "▁use",
363
+ "361": "ome",
364
+ "362": "ens",
365
+ "363": "ys",
366
+ "364": "▁bu",
367
+ "365": "co",
368
+ "366": "ory",
369
+ "367": "ater",
370
+ "368": "ild",
371
+ "369": "ght",
372
+ "370": "ial",
373
+ "371": "▁day",
374
+ "372": "ning",
375
+ "373": "na",
376
+ "374": "ile",
377
+ "375": "▁spe",
378
+ "376": "▁mar",
379
+ "377": "ody",
380
+ "378": "ough",
381
+ "379": "ade",
382
+ "380": "vers",
383
+ "381": "xt",
384
+ "382": "▁fl",
385
+ "383": "▁ke",
386
+ "384": "ian",
387
+ "385": "▁sy",
388
+ "386": "▁put",
389
+ "387": "fore",
390
+ "388": "ub",
391
+ "389": "▁ph",
392
+ "390": "fe",
393
+ "391": "▁em",
394
+ "392": "▁ser",
395
+ "393": "form",
396
+ "394": "ting",
397
+ "395": "te",
398
+ "396": "av",
399
+ "397": "ious",
400
+ "398": "▁rec",
401
+ "399": "ks",
402
+ "400": "▁gr",
403
+ "401": "ces",
404
+ "402": "wn",
405
+ "403": "ors",
406
+ "404": "▁jo",
407
+ "405": "ents",
408
+ "406": "▁des",
409
+ "407": "▁try",
410
+ "408": "▁equ",
411
+ "409": "▁z",
412
+ "410": "▁rem",
413
+ "411": "▁str",
414
+ "412": "self",
415
+ "413": "▁bit",
416
+ "414": "ph",
417
+ "415": "ved",
418
+ "416": "▁why",
419
+ "417": "▁bas",
420
+ "418": "▁hel",
421
+ "419": "▁rel",
422
+ "420": "ath",
423
+ "421": "ject",
424
+ "422": "ail",
425
+ "423": "▁la",
426
+ "424": "ual",
427
+ "425": "▁god",
428
+ "426": "▁nat",
429
+ "427": "erm",
430
+ "428": "day",
431
+ "429": "▁id",
432
+ "430": "ft",
433
+ "431": "▁wr",
434
+ "432": "▁min",
435
+ "433": "ates",
436
+ "434": "▁gen",
437
+ "435": "tain",
438
+ "436": "▁ob",
439
+ "437": "ull",
440
+ "438": "ict",
441
+ "439": "▁tra",
442
+ "440": "▁end",
443
+ "441": "▁hig",
444
+ "442": "▁fif",
445
+ "443": "oth",
446
+ "444": "tern",
447
+ "445": "▁its",
448
+ "446": "vent",
449
+ "447": "▁sm",
450
+ "448": "ons",
451
+ "449": "▁add",
452
+ "450": "iss",
453
+ "451": "▁bel",
454
+ "452": "ful",
455
+ "453": "get",
456
+ "454": "▁ele",
457
+ "455": "▁rep",
458
+ "456": "ak",
459
+ "457": "▁ho",
460
+ "458": "▁pos",
461
+ "459": "▁num",
462
+ "460": "ange",
463
+ "461": "ves",
464
+ "462": "ific",
465
+ "463": "urn",
466
+ "464": "ise",
467
+ "465": "▁cr",
468
+ "466": "▁um",
469
+ "467": "ward",
470
+ "468": "▁reg",
471
+ "469": "ady",
472
+ "470": "ower",
473
+ "471": "uc",
474
+ "472": "▁dec",
475
+ "473": "lic",
476
+ "474": "▁set",
477
+ "475": "▁gon",
478
+ "476": "▁op",
479
+ "477": "▁ear",
480
+ "478": "▁sub",
481
+ "479": "▁sl",
482
+ "480": "les",
483
+ "481": "stem",
484
+ "482": "cial",
485
+ "483": "olog",
486
+ "484": "atch",
487
+ "485": "ily",
488
+ "486": "body",
489
+ "487": "nds",
490
+ "488": "ular",
491
+ "489": "ren",
492
+ "490": "▁own",
493
+ "491": "▁too",
494
+ "492": "cent",
495
+ "493": "ible",
496
+ "494": "pect",
497
+ "495": "ered",
498
+ "496": "ways",
499
+ "497": "teen",
500
+ "498": "▁uh",
501
+ "499": "▁big",
502
+ "500": "▁mod",
503
+ "501": "▁att",
504
+ "502": "▁car",
505
+ "503": "gr",
506
+ "504": "▁acc",
507
+ "505": "ied",
508
+ "506": "mun",
509
+ "507": "ib",
510
+ "508": "▁mon",
511
+ "509": "▁sch",
512
+ "510": "▁pol",
513
+ "511": "▁dat",
514
+ "512": "▁fin",
515
+ "513": "▁sim",
516
+ "514": "▁inv",
517
+ "515": "▁def",
518
+ "516": "ked",
519
+ "517": "▁ent",
520
+ "518": "▁yes",
521
+ "519": "ows",
522
+ "520": "ics",
523
+ "521": "ited",
524
+ "522": "ute",
525
+ "523": "ism",
526
+ "524": "ps",
527
+ "525": "▁ed",
528
+ "526": "▁el",
529
+ "527": "ably",
530
+ "528": "ppen",
531
+ "529": "als",
532
+ "530": "▁ten",
533
+ "531": "ract",
534
+ "532": "ss",
535
+ "533": "▁ass",
536
+ "534": "▁met",
537
+ "535": "gan",
538
+ "536": "▁eng",
539
+ "537": "▁stu",
540
+ "538": "ween",
541
+ "539": "arch",
542
+ "540": "▁gl",
543
+ "541": "▁cor",
544
+ "542": "▁dr",
545
+ "543": "vern",
546
+ "544": "▁ty",
547
+ "545": "▁run",
548
+ "546": "hip",
549
+ "547": "cus",
550
+ "548": "cond",
551
+ "549": "▁ins",
552
+ "550": "irty",
553
+ "551": "▁pub",
554
+ "552": "lud",
555
+ "553": "llow",
556
+ "554": "▁cou",
557
+ "555": "ew",
558
+ "556": "iew",
559
+ "557": "▁sur",
560
+ "558": "ero",
561
+ "559": "ood",
562
+ "560": "ness",
563
+ "561": "▁fun",
564
+ "562": "▁eff",
565
+ "563": "cept",
566
+ "564": "▁ca",
567
+ "565": "▁exp",
568
+ "566": "duct",
569
+ "567": "▁sw",
570
+ "568": "ize",
571
+ "569": "ope",
572
+ "570": "▁par",
573
+ "571": "kes",
574
+ "572": "cy",
575
+ "573": "▁ev",
576
+ "574": "▁ref",
577
+ "575": "ell",
578
+ "576": "▁bus",
579
+ "577": "ug",
580
+ "578": "rib",
581
+ "579": "▁cur",
582
+ "580": "mo",
583
+ "581": "ock",
584
+ "582": "ures",
585
+ "583": "air",
586
+ "584": "▁war",
587
+ "585": "str",
588
+ "586": "▁med",
589
+ "587": "▁wa",
590
+ "588": "▁val",
591
+ "589": "▁sin",
592
+ "590": "blem",
593
+ "591": "▁fam",
594
+ "592": "li",
595
+ "593": "▁far",
596
+ "594": "▁cle",
597
+ "595": "▁col",
598
+ "596": "mon",
599
+ "597": "▁gra",
600
+ "598": "led",
601
+ "599": "ense",
602
+ "600": "tin",
603
+ "601": "ues",
604
+ "602": "its",
605
+ "603": "▁mem",
606
+ "604": "▁inf",
607
+ "605": "▁eas",
608
+ "606": "ideo",
609
+ "607": "▁top",
610
+ "608": "io",
611
+ "609": "pan",
612
+ "610": "▁hum",
613
+ "611": "▁old",
614
+ "612": "ead",
615
+ "613": "▁ord",
616
+ "614": "ric",
617
+ "615": "ants",
618
+ "616": "oy",
619
+ "617": "esn",
620
+ "618": "uck",
621
+ "619": "ason",
622
+ "620": "ced",
623
+ "621": "ool",
624
+ "622": "rat",
625
+ "623": "ouse",
626
+ "624": "▁lar",
627
+ "625": "▁art",
628
+ "626": "▁wee",
629
+ "627": "▁cer",
630
+ "628": "ized",
631
+ "629": "▁mat",
632
+ "630": "con",
633
+ "631": "erg",
634
+ "632": "land",
635
+ "633": "ines",
636
+ "634": "▁chr",
637
+ "635": "▁aut",
638
+ "636": "▁lea",
639
+ "637": "▁sou",
640
+ "638": "oney",
641
+ "639": "tty",
642
+ "640": "▁ple",
643
+ "641": "ulat",
644
+ "642": "oks",
645
+ "643": "▁few",
646
+ "644": "▁sol",
647
+ "645": "▁che",
648
+ "646": "chn",
649
+ "647": "ird",
650
+ "648": "▁bre",
651
+ "649": "▁dur",
652
+ "650": "▁wom",
653
+ "651": "me",
654
+ "652": "izat",
655
+ "653": "eric",
656
+ "654": "ote",
657
+ "655": "▁uni",
658
+ "656": "eren",
659
+ "657": "arn",
660
+ "658": "ross",
661
+ "659": "ices",
662
+ "660": "ten",
663
+ "661": "eral",
664
+ "662": "ever",
665
+ "663": "ieve",
666
+ "664": "lish",
667
+ "665": "ash",
668
+ "666": "▁opp",
669
+ "667": "alth",
670
+ "668": "ger",
671
+ "669": "▁sk",
672
+ "670": "▁red",
673
+ "671": "peri",
674
+ "672": "▁det",
675
+ "673": "▁ext",
676
+ "674": "ner",
677
+ "675": "ah",
678
+ "676": "▁var",
679
+ "677": "▁loc",
680
+ "678": "gram",
681
+ "679": "ists",
682
+ "680": "ives",
683
+ "681": "▁es",
684
+ "682": "▁nor",
685
+ "683": "tro",
686
+ "684": "ale",
687
+ "685": "▁iss",
688
+ "686": "▁pri",
689
+ "687": "gin",
690
+ "688": "az",
691
+ "689": "oc",
692
+ "690": "▁pop",
693
+ "691": "ern",
694
+ "692": "▁sit",
695
+ "693": "ket",
696
+ "694": "▁pa",
697
+ "695": "▁law",
698
+ "696": "ages",
699
+ "697": "br",
700
+ "698": "▁cam",
701
+ "699": "▁mom",
702
+ "700": "osed",
703
+ "701": "▁bro",
704
+ "702": "ne",
705
+ "703": "bs",
706
+ "704": "▁cre",
707
+ "705": "erat",
708
+ "706": "▁sec",
709
+ "707": "▁cap",
710
+ "708": "▁vis",
711
+ "709": "▁pat",
712
+ "710": "ield",
713
+ "711": "iet",
714
+ "712": "▁tri",
715
+ "713": "up",
716
+ "714": "▁bra",
717
+ "715": "ts",
718
+ "716": "▁mot",
719
+ "717": "▁unt",
720
+ "718": "put",
721
+ "719": "bo",
722
+ "720": "ork",
723
+ "721": "mer",
724
+ "722": "ital",
725
+ "723": "▁air",
726
+ "724": "ined",
727
+ "725": "▁beh",
728
+ "726": "▁adv",
729
+ "727": "▁ret",
730
+ "728": "imes",
731
+ "729": "▁tea",
732
+ "730": "ural",
733
+ "731": "sid",
734
+ "732": "ters",
735
+ "733": "▁pur",
736
+ "734": "▁sci",
737
+ "735": "bers",
738
+ "736": "ient",
739
+ "737": "ier",
740
+ "738": "cc",
741
+ "739": "sw",
742
+ "740": "▁av",
743
+ "741": "reen",
744
+ "742": "ode",
745
+ "743": "ont",
746
+ "744": "▁dra",
747
+ "745": "ann",
748
+ "746": "nect",
749
+ "747": "▁x",
750
+ "748": "▁eu",
751
+ "749": "ton",
752
+ "750": "inat",
753
+ "751": "ene",
754
+ "752": "ared",
755
+ "753": "els",
756
+ "754": "▁mor",
757
+ "755": "▁rat",
758
+ "756": "cri",
759
+ "757": "▁men",
760
+ "758": "▁ah",
761
+ "759": "ames",
762
+ "760": "▁arm",
763
+ "761": "eak",
764
+ "762": "▁pay",
765
+ "763": "▁hal",
766
+ "764": "ins",
767
+ "765": "ilit",
768
+ "766": "stit",
769
+ "767": "▁ra",
770
+ "768": "▁leg",
771
+ "769": "cl",
772
+ "770": "pr",
773
+ "771": "▁wal",
774
+ "772": "▁bad",
775
+ "773": "▁ge",
776
+ "774": "roup",
777
+ "775": "▁mus",
778
+ "776": "man",
779
+ "777": "▁gi",
780
+ "778": "eds",
781
+ "779": "▁aw",
782
+ "780": "po",
783
+ "781": "ark",
784
+ "782": "row",
785
+ "783": "▁dep",
786
+ "784": "ully",
787
+ "785": "ral",
788
+ "786": "lect",
789
+ "787": "pend",
790
+ "788": "▁sev",
791
+ "789": "ime",
792
+ "790": "gest",
793
+ "791": "here",
794
+ "792": "▁yet",
795
+ "793": "ted",
796
+ "794": "▁rev",
797
+ "795": "ds",
798
+ "796": "▁ask",
799
+ "797": "less",
800
+ "798": "▁di",
801
+ "799": "ets",
802
+ "800": "line",
803
+ "801": "▁aff",
804
+ "802": "ired",
805
+ "803": "▁est",
806
+ "804": "ken",
807
+ "805": "vid",
808
+ "806": "most",
809
+ "807": "ivid",
810
+ "808": "unch",
811
+ "809": "par",
812
+ "810": "med",
813
+ "811": "rop",
814
+ "812": "ased",
815
+ "813": "eone",
816
+ "814": "▁ve",
817
+ "815": "▁abs",
818
+ "816": "ergy",
819
+ "817": "ret",
820
+ "818": "▁saw",
821
+ "819": "▁ey",
822
+ "820": "▁cal",
823
+ "821": "uat",
824
+ "822": "▁mid",
825
+ "823": "vat",
826
+ "824": "ream",
827
+ "825": "vice",
828
+ "826": "ians",
829
+ "827": "rent",
830
+ "828": "ctor",
831
+ "829": "err",
832
+ "830": "ush",
833
+ "831": "ases",
834
+ "832": "▁suc",
835
+ "833": "erms",
836
+ "834": "ave",
837
+ "835": "angu",
838
+ "836": "ries",
839
+ "837": "▁wo",
840
+ "838": "arts",
841
+ "839": "▁fil",
842
+ "840": "▁fat",
843
+ "841": "▁cho",
844
+ "842": "orts",
845
+ "843": "▁fre",
846
+ "844": "ee",
847
+ "845": "ught",
848
+ "846": "eng",
849
+ "847": "ump",
850
+ "848": "▁bar",
851
+ "849": "ying",
852
+ "850": "ane",
853
+ "851": "▁tem",
854
+ "852": "anks",
855
+ "853": "ury",
856
+ "854": "iat",
857
+ "855": "mit",
858
+ "856": "trol",
859
+ "857": "▁net",
860
+ "858": "▁maj",
861
+ "859": "▁cra",
862
+ "860": "ling",
863
+ "861": "▁fig",
864
+ "862": "orn",
865
+ "863": "icat",
866
+ "864": "pany",
867
+ "865": "▁occ",
868
+ "866": "ott",
869
+ "867": "ands",
870
+ "868": "▁exc",
871
+ "869": "▁mr",
872
+ "870": "ency",
873
+ "871": "rope",
874
+ "872": "itch",
875
+ "873": "▁lit",
876
+ "874": "abil",
877
+ "875": "not",
878
+ "876": "ma",
879
+ "877": "▁typ",
880
+ "878": "▁opt",
881
+ "879": "ob",
882
+ "880": "ser",
883
+ "881": "ety",
884
+ "882": "ms",
885
+ "883": "peci",
886
+ "884": "aces",
887
+ "885": "aut",
888
+ "886": "▁hon",
889
+ "887": "cuss",
890
+ "888": "▁sal",
891
+ "889": "▁sor",
892
+ "890": "att",
893
+ "891": "▁lab",
894
+ "892": "▁har",
895
+ "893": "urch",
896
+ "894": "nded",
897
+ "895": "uce",
898
+ "896": "ids",
899
+ "897": "▁hy",
900
+ "898": "▁fut",
901
+ "899": "▁ste",
902
+ "900": "ours",
903
+ "901": "ems",
904
+ "902": "utes",
905
+ "903": "ng",
906
+ "904": "ta",
907
+ "905": "▁won",
908
+ "906": "▁fa",
909
+ "907": "▁env",
910
+ "908": "ards",
911
+ "909": "▁job",
912
+ "910": "ium",
913
+ "911": "▁dot",
914
+ "912": "▁obv",
915
+ "913": "ina",
916
+ "914": "side",
917
+ "915": "elve",
918
+ "916": "cu",
919
+ "917": "▁jes",
920
+ "918": "▁pot",
921
+ "919": "▁pie",
922
+ "920": "▁tre",
923
+ "921": "▁hey",
924
+ "922": "▁mag",
925
+ "923": "ron",
926
+ "924": "▁key",
927
+ "925": "swer",
928
+ "926": "▁win",
929
+ "927": "ucat",
930
+ "928": "work",
931
+ "929": "ides",
932
+ "930": "▁low",
933
+ "931": "▁vol",
934
+ "932": "▁oth",
935
+ "933": "atic",
936
+ "934": "lf",
937
+ "935": "ads",
938
+ "936": "inds",
939
+ "937": "com",
940
+ "938": "ths",
941
+ "939": "▁ver",
942
+ "940": "ised",
943
+ "941": "lo",
944
+ "942": "▁squ",
945
+ "943": "▁cut",
946
+ "944": "oked",
947
+ "945": "irit",
948
+ "946": "ateg",
949
+ "947": "ppy",
950
+ "948": "mitt",
951
+ "949": "come",
952
+ "950": "hn",
953
+ "951": "igin",
954
+ "952": "mand",
955
+ "953": "▁dam",
956
+ "954": "ho",
957
+ "955": "▁da",
958
+ "956": "▁fur",
959
+ "957": "iron",
960
+ "958": "ilar",
961
+ "959": "▁fac",
962
+ "960": "▁neg",
963
+ "961": "▁ago",
964
+ "962": "ged",
965
+ "963": "miss",
966
+ "964": "enth",
967
+ "965": "▁dou",
968
+ "966": "▁hit",
969
+ "967": "▁guy",
970
+ "968": "▁bi",
971
+ "969": "ove",
972
+ "970": "fess",
973
+ "971": "ples",
974
+ "972": "owed",
975
+ "973": "ured",
976
+ "974": "▁ris",
977
+ "975": "ints",
978
+ "976": "rew",
979
+ "977": "▁sum",
980
+ "978": "▁hu",
981
+ "979": "ploy",
982
+ "980": "ude",
983
+ "981": "ried",
984
+ "982": "▁cir",
985
+ "983": "▁dev",
986
+ "984": "ear",
987
+ "985": "▁tot",
988
+ "986": "▁ann",
989
+ "987": "duc",
990
+ "988": "ik",
991
+ "989": "pon",
992
+ "990": "sted",
993
+ "991": "▁ide",
994
+ "992": "▁'",
995
+ "993": "ipp",
996
+ "994": "▁eat",
997
+ "995": "▁dom",
998
+ "996": "▁",
999
+ "997": "e",
1000
+ "998": "t",
1001
+ "999": "o",
1002
+ "1000": "a",
1003
+ "1001": "i",
1004
+ "1002": "n",
1005
+ "1003": "s",
1006
+ "1004": "r",
1007
+ "1005": "h",
1008
+ "1006": "l",
1009
+ "1007": "d",
1010
+ "1008": "u",
1011
+ "1009": "c",
1012
+ "1010": "m",
1013
+ "1011": "y",
1014
+ "1012": "g",
1015
+ "1013": "w",
1016
+ "1014": "f",
1017
+ "1015": "p",
1018
+ "1016": "b",
1019
+ "1017": "v",
1020
+ "1018": "k",
1021
+ "1019": "'",
1022
+ "1020": "j",
1023
+ "1021": "x",
1024
+ "1022": "q",
1025
+ "1023": "z",
1026
+ "1024": "<EOU>",
1027
+ "1025": "<EOB>"
1028
+ }
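The table above is the tail of the BPE vocabulary: string token ids map to SentencePiece pieces, `▁` marks a word boundary, single characters close out the inventory, and `<EOU>` / `<EOB>` are the trailing special tokens. A minimal detokenization sketch over this file layout (the path and helper names are illustrative, not part of the export scripts):

import json

def load_vocab(path: str) -> dict:
    # vocab.json maps string ids ("0", "1", ...) to token strings.
    with open(path) as f:
        return {int(k): v for k, v in json.load(f).items()}

def detokenize(token_ids, id_to_token) -> str:
    # Skip the trailing special tokens, then join the SentencePiece
    # pieces; "▁" begins a new word.
    pieces = [id_to_token[i] for i in token_ids
              if id_to_token[i] not in ("<EOU>", "<EOB>")]
    return "".join(pieces).replace("▁", " ").strip()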
160ms/.DS_Store ADDED
Binary file (8.2 kB). View file
 
160ms/convert_parakeet_eou.py ADDED
@@ -0,0 +1,740 @@
1
+ #!/usr/bin/env python3
2
+ """CLI for exporting Parakeet Realtime EOU 120M components to CoreML.
3
+
4
+ This model is a cache-aware streaming FastConformer-RNNT model optimized for
5
+ low-latency speech recognition with end-of-utterance detection.
6
+
7
+ Key differences from Parakeet TDT v3:
8
+ - Smaller model (120M vs 600M params)
9
+ - No duration outputs (standard RNNT, not TDT)
10
+ - Cache-aware streaming encoder (17 layers, attention context [70,1])
11
+ - Special <EOU> token for end-of-utterance detection
12
+ - Optimized for 80-160ms latency
13
+
14
+ Reference: https://huggingface.co/nvidia/parakeet_realtime_eou_120m-v1
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ from dataclasses import asdict
20
+ from pathlib import Path
21
+ from typing import Dict, Optional, Tuple
22
+
23
+ import coremltools as ct
24
+ import numpy as np
25
+ import soundfile as sf
26
+ import torch
27
+ import typer
28
+
29
+ import nemo.collections.asr as nemo_asr
30
+
31
+ from individual_components import (
32
+ DecoderWrapper,
33
+ EncoderWrapper,
34
+ ExportSettings,
35
+ JointWrapper,
36
+ JointDecisionWrapper,
37
+ JointDecisionSingleStep,
38
+ PreprocessorWrapper,
39
+ MelEncoderWrapper,
40
+ _coreml_convert,
41
+ )
42
+
43
+ def apply_stft_patch():
44
+ # Monkey patch coremltools.stft to handle extra arguments from newer torch versions
45
+ try:
46
+ import coremltools.converters.mil.frontend.torch.ops as torch_ops
47
+ _original_stft = torch_ops.stft
48
+
49
+ def patched_stft(context, node):
50
+ if len(node.inputs) > 8:
51
+ node.inputs = node.inputs[:8]
52
+ return _original_stft(context, node)
53
+
54
+ torch_ops.stft = patched_stft
55
+ if "stft" in torch_ops._TORCH_OPS_REGISTRY:
56
+ torch_ops._TORCH_OPS_REGISTRY["stft"] = patched_stft
57
+ print("Monkey patched coremltools.stft for compatibility.")
58
+ except Exception as e:
59
+ print(f"Warning: Could not monkey patch stft: {e}")
60
+
61
+ DEFAULT_MODEL_ID = "nvidia/parakeet_realtime_eou_120m-v1"
62
+ AUTHOR = "Fluid Inference"
63
+
64
+
65
+ def _compute_length(seconds: float, sample_rate: int) -> int:
66
+ return int(round(seconds * sample_rate))
67
+
68
+
69
+ def _prepare_audio(
70
+ validation_audio: Optional[Path],
71
+ sample_rate: int,
72
+ max_samples: int,
73
+ seed: Optional[int],
74
+ ) -> torch.Tensor:
75
+ if validation_audio is None:
76
+ if seed is not None:
77
+ torch.manual_seed(seed)
78
+ audio = torch.randn(1, max_samples, dtype=torch.float32)
79
+ return audio
80
+
81
+ data, sr = sf.read(str(validation_audio), dtype="float32")
82
+ if sr != sample_rate:
83
+ raise typer.BadParameter(
84
+ f"Validation audio sample rate {sr} does not match model rate {sample_rate}"
85
+ )
86
+
87
+ if data.ndim > 1:
88
+ data = data[:, 0]
89
+
90
+ if data.size == 0:
91
+ raise typer.BadParameter("Validation audio is empty")
92
+
93
+ if data.size < max_samples:
94
+ pad_width = max_samples - data.size
95
+ data = np.pad(data, (0, pad_width))
96
+ elif data.size > max_samples:
97
+ data = data[:max_samples]
98
+
99
+ audio = torch.from_numpy(data).unsqueeze(0).to(dtype=torch.float32)
100
+ return audio
101
+
102
+
103
+ def _save_mlpackage(model: ct.models.MLModel, path: Path, description: str) -> None:
104
+ try:
105
+ model.minimum_deployment_target = ct.target.iOS17
106
+ except Exception:
107
+ pass
108
+ model.short_description = description
109
+ model.author = AUTHOR
110
+ path.parent.mkdir(parents=True, exist_ok=True)
111
+ model.save(str(path))
112
+
113
+
114
+ def _tensor_shape(tensor: torch.Tensor) -> Tuple[int, ...]:
115
+ return tuple(int(dim) for dim in tensor.shape)
116
+
117
+
118
+ def _parse_compute_units(name: str) -> ct.ComputeUnit:
119
+ """Parse a human-friendly compute units string into ct.ComputeUnit."""
120
+ normalized = str(name).strip().upper()
121
+ mapping = {
122
+ "ALL": ct.ComputeUnit.ALL,
123
+ "CPU_ONLY": ct.ComputeUnit.CPU_ONLY,
124
+ "CPU_AND_GPU": ct.ComputeUnit.CPU_AND_GPU,
125
+ "CPU_AND_NE": ct.ComputeUnit.CPU_AND_NE,
126
+ "CPU_AND_NEURALENGINE": ct.ComputeUnit.CPU_AND_NE,
127
+ }
128
+ if normalized not in mapping:
129
+ raise typer.BadParameter(
130
+ f"Unknown compute units '{name}'. Choose from: " + ", ".join(mapping.keys())
131
+ )
132
+ return mapping[normalized]
133
+
134
+
135
+ def _parse_compute_precision(name: Optional[str]) -> Optional[ct.precision]:
136
+ """Parse compute precision string into ct.precision or None."""
137
+ if name is None:
138
+ return None
139
+ normalized = str(name).strip().upper()
140
+ if normalized == "":
141
+ return None
142
+ mapping = {
143
+ "FLOAT32": ct.precision.FLOAT32,
144
+ "FLOAT16": ct.precision.FLOAT16,
145
+ }
146
+ if normalized not in mapping:
147
+ raise typer.BadParameter(
148
+ f"Unknown compute precision '{name}'. Choose from: "
149
+ + ", ".join(mapping.keys())
150
+ )
151
+ return mapping[normalized]
152
+
153
+
154
+ app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
155
+
156
+
157
+ @app.command()
158
+ def convert(
159
+ nemo_path: Optional[Path] = typer.Option(
160
+ None,
161
+ "--nemo-path",
162
+ exists=True,
163
+ resolve_path=True,
164
+ help="Path to parakeet_realtime_eou_120m-v1.nemo checkpoint (skip to auto-download)",
165
+ ),
166
+ model_id: str = typer.Option(
167
+ DEFAULT_MODEL_ID,
168
+ "--model-id",
169
+ help="Model identifier to download when --nemo-path is omitted",
170
+ ),
171
+ output_dir: Path = typer.Option(
172
+ Path("parakeet_eou_coreml"),
173
+ help="Directory where mlpackages and metadata will be written",
174
+ ),
175
+ preprocessor_cu: str = typer.Option(
176
+ "CPU_ONLY",
177
+ "--preprocessor-cu",
178
+ help="Compute units for preprocessor (default CPU_ONLY)",
179
+ ),
180
+ mel_encoder_cu: str = typer.Option(
181
+ "CPU_ONLY",
182
+ "--mel-encoder-cu",
183
+ help="Compute units for fused mel+encoder (default CPU_ONLY)",
184
+ ),
185
+ compute_precision: Optional[str] = typer.Option(
186
+ None,
187
+ "--compute-precision",
188
+ help="Export precision: FLOAT32 (default) or FLOAT16 to shrink non-quantized weights.",
189
+ ),
190
+ max_audio_seconds: float = typer.Option(
191
+ 15.0,
192
+ "--max-audio-seconds",
193
+ help="Maximum audio duration in seconds for the fixed window export",
194
+ ),
195
+ validation_audio: Optional[Path] = typer.Option(
196
+ None,
197
+ "--validation-audio",
198
+ exists=True,
199
+ resolve_path=True,
200
+ help="Path to a 16kHz WAV file for tracing (uses random if not provided)",
201
+ ),
202
+ ) -> None:
203
+ """Export all Parakeet Realtime EOU sub-modules to CoreML.
204
+
205
+ This exports the cache-aware streaming FastConformer-RNNT model for
206
+ low-latency speech recognition with end-of-utterance detection.
207
+ """
208
+ export_settings = ExportSettings(
209
+ output_dir=output_dir,
210
+ compute_units=ct.ComputeUnit.CPU_ONLY,
211
+ deployment_target=ct.target.iOS17,
212
+ compute_precision=_parse_compute_precision(compute_precision),
213
+ max_audio_seconds=max_audio_seconds,
214
+ max_symbol_steps=1,
215
+ )
216
+
217
+ typer.echo("Export configuration:")
218
+ typer.echo(asdict(export_settings))
219
+
220
+ output_dir.mkdir(parents=True, exist_ok=True)
221
+ pre_cu = _parse_compute_units(preprocessor_cu)
222
+ melenc_cu = _parse_compute_units(mel_encoder_cu)
223
+
224
+ if nemo_path is not None:
225
+ typer.echo(f"Loading NeMo model from {nemo_path}…")
226
+ # Try loading as generic ASRModel first, then specific class
227
+ try:
228
+ asr_model = nemo_asr.models.ASRModel.restore_from(
229
+ str(nemo_path), map_location="cpu"
230
+ )
231
+ except Exception:
232
+ # Fallback to EncDecRNNTBPEModel
233
+ asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
234
+ str(nemo_path), map_location="cpu"
235
+ )
236
+ checkpoint_meta = {
237
+ "type": "file",
238
+ "path": str(nemo_path),
239
+ }
240
+ else:
241
+ typer.echo(f"Downloading NeMo model via {model_id}…")
242
+ # Use ASRModel.from_pretrained as recommended for this model
243
+ try:
244
+ asr_model = nemo_asr.models.ASRModel.from_pretrained(
245
+ model_id, map_location="cpu"
246
+ )
247
+ except Exception:
248
+ # Fallback to EncDecRNNTBPEModel
249
+ asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
250
+ model_id, map_location="cpu"
251
+ )
252
+ checkpoint_meta = {
253
+ "type": "pretrained",
254
+ "model_id": model_id,
255
+ }
256
+ asr_model.eval()
257
+
258
+ # Print model info
259
+ typer.echo(f"Model class: {type(asr_model).__name__}")
260
+ typer.echo(f"Encoder class: {type(asr_model.encoder).__name__}")
261
+
262
+ sample_rate = int(asr_model.cfg.preprocessor.sample_rate)
263
+ max_samples = _compute_length(export_settings.max_audio_seconds, sample_rate)
264
+
265
+ # Prepare audio for tracing
266
+ if validation_audio is not None:
267
+ typer.echo(f"Using validation audio: {validation_audio}")
268
+ audio_tensor = _prepare_audio(validation_audio, sample_rate, max_samples, seed=None)
269
+ else:
270
+ typer.echo("Using random audio for tracing (seed=42)")
271
+ audio_tensor = _prepare_audio(None, sample_rate, max_samples, seed=42)
272
+
273
+ audio_length = torch.tensor([max_samples], dtype=torch.int32)
274
+
275
+ preprocessor = PreprocessorWrapper(asr_model.preprocessor.eval())
276
+ encoder = EncoderWrapper(asr_model.encoder.eval())
277
+ decoder = DecoderWrapper(asr_model.decoder.eval())
278
+ joint = JointWrapper(asr_model.joint.eval())
279
+
280
+ decoder_export_flag = getattr(asr_model.decoder, "_rnnt_export", False)
281
+ asr_model.decoder._rnnt_export = True
282
+
283
+ try:
284
+ with torch.no_grad():
285
+ mel_ref, mel_length_ref = preprocessor(audio_tensor, audio_length)
286
+ mel_length_ref = mel_length_ref.to(dtype=torch.int32)
287
+ encoder_ref, encoder_length_ref, frame_times_ref = encoder(
288
+ mel_ref, mel_length_ref
289
+ )
290
+ encoder_length_ref = encoder_length_ref.to(dtype=torch.int32)
291
+
292
+ # Clone tensors to drop inference flags
293
+ mel_ref = mel_ref.clone().detach()
294
+ mel_length_ref = mel_length_ref.clone().detach()
295
+ encoder_ref = encoder_ref.clone().detach()
296
+ encoder_length_ref = encoder_length_ref.clone().detach()
297
+ frame_times_ref = frame_times_ref.clone().detach()
298
+
299
+ vocab_size = int(asr_model.tokenizer.vocab_size)
300
+ decoder_hidden = int(asr_model.decoder.pred_hidden)
301
+ decoder_layers = int(asr_model.decoder.pred_rnn_layers)
302
+
303
+ # Check if model has extra outputs (TDT-style duration)
304
+ num_extra = getattr(asr_model.joint, "num_extra_outputs", 0)
305
+ typer.echo(f"Vocab size: {vocab_size}, num_extra_outputs: {num_extra}")
306
+
307
+ targets = torch.full(
308
+ (1, export_settings.max_symbol_steps),
309
+ fill_value=asr_model.decoder.blank_idx,
310
+ dtype=torch.int32,
311
+ )
312
+ target_lengths = torch.tensor(
313
+ [export_settings.max_symbol_steps], dtype=torch.int32
314
+ )
315
+ zero_state = torch.zeros(
316
+ decoder_layers,
317
+ 1,
318
+ decoder_hidden,
319
+ dtype=torch.float32,
320
+ )
321
+
322
+ with torch.no_grad():
323
+ decoder_ref, h_ref, c_ref = decoder(
324
+ targets, target_lengths, zero_state, zero_state
325
+ )
326
+ joint_ref = joint(encoder_ref, decoder_ref)
327
+
328
+ decoder_ref = decoder_ref.clone()
329
+ h_ref = h_ref.clone()
330
+ c_ref = c_ref.clone()
331
+ joint_ref = joint_ref.clone()
332
+
333
+ typer.echo(f"Encoder output shape: {encoder_ref.shape}")
334
+ typer.echo(f"Decoder output shape: {decoder_ref.shape}")
335
+ typer.echo(f"Joint output shape: {joint_ref.shape}")
336
+
337
+ # === Export Preprocessor ===
338
+ typer.echo("Tracing and converting preprocessor…")
339
+ preprocessor = preprocessor.cpu()
340
+ audio_tensor = audio_tensor.cpu()
341
+ audio_length = audio_length.cpu()
342
+ traced_preprocessor = torch.jit.trace(
343
+ preprocessor, (audio_tensor, audio_length), strict=False
344
+ )
345
+ traced_preprocessor.eval()
346
+ preprocessor_inputs = [
347
+ ct.TensorType(
348
+ name="audio_signal",
349
+ shape=(1, ct.RangeDim(1, max_samples)),
350
+ dtype=np.float32,
351
+ ),
352
+ ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
353
+ ]
354
+ preprocessor_outputs = [
355
+ ct.TensorType(name="mel", dtype=np.float32),
356
+ ct.TensorType(name="mel_length", dtype=np.int32),
357
+ ]
358
+ preprocessor_model = _coreml_convert(
359
+ traced_preprocessor,
360
+ preprocessor_inputs,
361
+ preprocessor_outputs,
362
+ export_settings,
363
+ compute_units_override=pre_cu,
364
+ )
365
+ preprocessor_path = output_dir / "parakeet_eou_preprocessor.mlpackage"
366
+ _save_mlpackage(
367
+ preprocessor_model,
368
+ preprocessor_path,
369
+ f"Parakeet EOU preprocessor ({max_audio_seconds}s window)",
370
+ )
371
+
372
+ # === Export Encoder ===
373
+ typer.echo("Tracing and converting encoder…")
374
+ traced_encoder = torch.jit.trace(
375
+ encoder, (mel_ref, mel_length_ref), strict=False
376
+ )
377
+ traced_encoder.eval()
378
+ encoder_inputs = [
379
+ ct.TensorType(
380
+ name="mel", shape=_tensor_shape(mel_ref), dtype=np.float32
381
+ ),
382
+ ct.TensorType(name="mel_length", shape=(1,), dtype=np.int32),
383
+ ]
384
+ encoder_outputs = [
385
+ ct.TensorType(name="encoder", dtype=np.float32),
386
+ ct.TensorType(name="encoder_length", dtype=np.int32),
387
+ ct.TensorType(name="frame_times", dtype=np.float32),
388
+ ]
389
+ encoder_model = _coreml_convert(
390
+ traced_encoder,
391
+ encoder_inputs,
392
+ encoder_outputs,
393
+ export_settings,
394
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
395
+ )
396
+ encoder_path = output_dir / "parakeet_eou_encoder.mlpackage"
397
+ _save_mlpackage(
398
+ encoder_model,
399
+ encoder_path,
400
+ f"Parakeet EOU encoder ({max_audio_seconds}s window)",
401
+ )
402
+
403
+ # === Export Fused Mel+Encoder ===
404
+ typer.echo("Tracing and converting fused mel+encoder…")
405
+ mel_encoder = MelEncoderWrapper(preprocessor, encoder)
406
+ traced_mel_encoder = torch.jit.trace(
407
+ mel_encoder, (audio_tensor, audio_length), strict=False
408
+ )
409
+ traced_mel_encoder.eval()
410
+ mel_encoder_inputs = [
411
+ ct.TensorType(
412
+ name="audio_signal", shape=(1, max_samples), dtype=np.float32
413
+ ),
414
+ ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
415
+ ]
416
+ mel_encoder_outputs = [
417
+ ct.TensorType(name="encoder", dtype=np.float32),
418
+ ct.TensorType(name="encoder_length", dtype=np.int32),
419
+ ct.TensorType(name="frame_times", dtype=np.float32),
420
+ ]
421
+ mel_encoder_model = _coreml_convert(
422
+ traced_mel_encoder,
423
+ mel_encoder_inputs,
424
+ mel_encoder_outputs,
425
+ export_settings,
426
+ compute_units_override=melenc_cu,
427
+ )
428
+ mel_encoder_path = output_dir / "parakeet_eou_mel_encoder.mlpackage"
429
+ _save_mlpackage(
430
+ mel_encoder_model,
431
+ mel_encoder_path,
432
+ f"Parakeet EOU fused Mel+Encoder ({max_audio_seconds}s window)",
433
+ )
434
+
435
+ # === Export Decoder ===
436
+ typer.echo("Tracing and converting decoder…")
437
+ traced_decoder = torch.jit.trace(
438
+ decoder,
439
+ (targets, target_lengths, zero_state, zero_state),
440
+ strict=False,
441
+ )
442
+ traced_decoder.eval()
443
+ decoder_inputs = [
444
+ ct.TensorType(
445
+ name="targets", shape=_tensor_shape(targets), dtype=np.int32
446
+ ),
447
+ ct.TensorType(name="target_length", shape=(1,), dtype=np.int32),
448
+ ct.TensorType(
449
+ name="h_in", shape=_tensor_shape(zero_state), dtype=np.float32
450
+ ),
451
+ ct.TensorType(
452
+ name="c_in", shape=_tensor_shape(zero_state), dtype=np.float32
453
+ ),
454
+ ]
455
+ decoder_outputs = [
456
+ ct.TensorType(name="decoder", dtype=np.float32),
457
+ ct.TensorType(name="h_out", dtype=np.float32),
458
+ ct.TensorType(name="c_out", dtype=np.float32),
459
+ ]
460
+ decoder_model = _coreml_convert(
461
+ traced_decoder,
462
+ decoder_inputs,
463
+ decoder_outputs,
464
+ export_settings,
465
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
466
+ )
467
+ decoder_path = output_dir / "parakeet_eou_decoder.mlpackage"
468
+ _save_mlpackage(
469
+ decoder_model,
470
+ decoder_path,
471
+ "Parakeet EOU decoder (RNNT prediction network)",
472
+ )
473
+
474
+ # === Export Joint ===
475
+ typer.echo("Tracing and converting joint…")
476
+ traced_joint = torch.jit.trace(
477
+ joint,
478
+ (encoder_ref, decoder_ref),
479
+ strict=False,
480
+ )
481
+ traced_joint.eval()
482
+ joint_inputs = [
483
+ ct.TensorType(
484
+ name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32
485
+ ),
486
+ ct.TensorType(
487
+ name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32
488
+ ),
489
+ ]
490
+ joint_outputs = [
491
+ ct.TensorType(name="logits", dtype=np.float32),
492
+ ]
493
+ joint_model = _coreml_convert(
494
+ traced_joint,
495
+ joint_inputs,
496
+ joint_outputs,
497
+ export_settings,
498
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
499
+ )
500
+ joint_path = output_dir / "parakeet_eou_joint.mlpackage"
501
+ _save_mlpackage(
502
+ joint_model,
503
+ joint_path,
504
+ "Parakeet EOU joint network (RNNT)",
505
+ )
506
+
507
+ # === Export Joint Decision Head ===
508
+ typer.echo("Tracing and converting joint decision head…")
509
+ joint_decision = JointDecisionWrapper(joint, vocab_size=vocab_size)
510
+ traced_joint_decision = torch.jit.trace(
511
+ joint_decision,
512
+ (encoder_ref, decoder_ref),
513
+ strict=False,
514
+ )
515
+ traced_joint_decision.eval()
516
+ joint_decision_inputs = [
517
+ ct.TensorType(
518
+ name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32
519
+ ),
520
+ ct.TensorType(
521
+ name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32
522
+ ),
523
+ ]
524
+ joint_decision_outputs = [
525
+ ct.TensorType(name="token_id", dtype=np.int32),
526
+ ct.TensorType(name="token_prob", dtype=np.float32),
527
+ ]
528
+ joint_decision_model = _coreml_convert(
529
+ traced_joint_decision,
530
+ joint_decision_inputs,
531
+ joint_decision_outputs,
532
+ export_settings,
533
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
534
+ )
535
+ joint_decision_path = output_dir / "parakeet_eou_joint_decision.mlpackage"
536
+ _save_mlpackage(
537
+ joint_decision_model,
538
+ joint_decision_path,
539
+ "Parakeet EOU joint + decision head (softmax, argmax)",
540
+ )
541
+
542
+ # === Export Single-Step Joint Decision ===
543
+ typer.echo("Tracing and converting single-step joint decision…")
544
+ jd_single = JointDecisionSingleStep(joint, vocab_size=vocab_size)
545
+ # Create single-step slices from refs
546
+ enc_step = encoder_ref[:, :, :1].contiguous()
547
+ dec_step = decoder_ref[:, :, :1].contiguous()
548
+ traced_jd_single = torch.jit.trace(
549
+ jd_single,
550
+ (enc_step, dec_step),
551
+ strict=False,
552
+ )
553
+ traced_jd_single.eval()
554
+ jd_single_inputs = [
555
+ ct.TensorType(
556
+ name="encoder_step",
557
+ shape=(1, enc_step.shape[1], 1),
558
+ dtype=np.float32,
559
+ ),
560
+ ct.TensorType(
561
+ name="decoder_step",
562
+ shape=(1, dec_step.shape[1], 1),
563
+ dtype=np.float32,
564
+ ),
565
+ ]
566
+ jd_single_outputs = [
567
+ ct.TensorType(name="token_id", dtype=np.int32),
568
+ ct.TensorType(name="token_prob", dtype=np.float32),
569
+ ct.TensorType(name="top_k_ids", dtype=np.int32),
570
+ ct.TensorType(name="top_k_logits", dtype=np.float32),
571
+ ]
572
+ jd_single_model = _coreml_convert(
573
+ traced_jd_single,
574
+ jd_single_inputs,
575
+ jd_single_outputs,
576
+ export_settings,
577
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
578
+ )
579
+ jd_single_path = output_dir / "parakeet_eou_joint_decision_single_step.mlpackage"
580
+ _save_mlpackage(
581
+ jd_single_model,
582
+ jd_single_path,
583
+ "Parakeet EOU single-step joint decision (current frame)",
584
+ )
585
+
586
+ # === Save Metadata ===
587
+ metadata: Dict[str, object] = {
588
+ "model_id": model_id,
589
+ "model_name": "parakeet_realtime_eou_120m-v1",
590
+ "model_class": type(asr_model).__name__,
591
+ "encoder_class": type(asr_model.encoder).__name__,
592
+ "sample_rate": sample_rate,
593
+ "max_audio_seconds": export_settings.max_audio_seconds,
594
+ "max_audio_samples": max_samples,
595
+ "max_symbol_steps": export_settings.max_symbol_steps,
596
+ "vocab_size": vocab_size,
597
+ "vocab_with_blank": vocab_size + 1,
598
+ "decoder_hidden": decoder_hidden,
599
+ "decoder_layers": decoder_layers,
600
+ "num_extra_outputs": num_extra,
601
+ "has_eou_token": True,
602
+ "checkpoint": checkpoint_meta,
603
+ "coreml": {
604
+ "compute_units": export_settings.compute_units.name,
605
+ "compute_precision": (
606
+ export_settings.compute_precision.name
607
+ if export_settings.compute_precision is not None
608
+ else "FLOAT32"
609
+ ),
610
+ },
611
+ "components": {
612
+ "preprocessor": {
613
+ "inputs": {
614
+ "audio_signal": [1, max_samples],
615
+ "audio_length": [1],
616
+ },
617
+ "outputs": {
618
+ "mel": list(_tensor_shape(mel_ref)),
619
+ "mel_length": [1],
620
+ },
621
+ "path": preprocessor_path.name,
622
+ },
623
+ "encoder": {
624
+ "inputs": {
625
+ "mel": list(_tensor_shape(mel_ref)),
626
+ "mel_length": [1],
627
+ },
628
+ "outputs": {
629
+ "encoder": list(_tensor_shape(encoder_ref)),
630
+ "encoder_length": [1],
631
+ "frame_times": [1, _tensor_shape(encoder_ref)[2]],
632
+ },
633
+ "path": encoder_path.name,
634
+ },
635
+ "mel_encoder": {
636
+ "inputs": {
637
+ "audio_signal": [1, max_samples],
638
+ "audio_length": [1],
639
+ },
640
+ "outputs": {
641
+ "encoder": list(_tensor_shape(encoder_ref)),
642
+ "encoder_length": [1],
643
+ "frame_times": [1, _tensor_shape(encoder_ref)[2]],
644
+ },
645
+ "path": mel_encoder_path.name,
646
+ },
647
+ "decoder": {
648
+ "inputs": {
649
+ "targets": list(_tensor_shape(targets)),
650
+ "target_length": [1],
651
+ "h_in": list(_tensor_shape(zero_state)),
652
+ "c_in": list(_tensor_shape(zero_state)),
653
+ },
654
+ "outputs": {
655
+ "decoder": list(_tensor_shape(decoder_ref)),
656
+ "h_out": list(_tensor_shape(h_ref)),
657
+ "c_out": list(_tensor_shape(c_ref)),
658
+ },
659
+ "path": decoder_path.name,
660
+ },
661
+ "joint": {
662
+ "inputs": {
663
+ "encoder": list(_tensor_shape(encoder_ref)),
664
+ "decoder": list(_tensor_shape(decoder_ref)),
665
+ },
666
+ "outputs": {
667
+ "logits": list(_tensor_shape(joint_ref)),
668
+ },
669
+ "path": joint_path.name,
670
+ },
671
+ "joint_decision": {
672
+ "inputs": {
673
+ "encoder": list(_tensor_shape(encoder_ref)),
674
+ "decoder": list(_tensor_shape(decoder_ref)),
675
+ },
676
+ "outputs": {
677
+ "token_id": [
678
+ _tensor_shape(encoder_ref)[0],
679
+ _tensor_shape(encoder_ref)[2],
680
+ _tensor_shape(decoder_ref)[2],
681
+ ],
682
+ "token_prob": [
683
+ _tensor_shape(encoder_ref)[0],
684
+ _tensor_shape(encoder_ref)[2],
685
+ _tensor_shape(decoder_ref)[2],
686
+ ],
687
+ },
688
+ "path": joint_decision_path.name,
689
+ },
690
+ "joint_decision_single_step": {
691
+ "inputs": {
692
+ "encoder_step": [1, _tensor_shape(encoder_ref)[1], 1],
693
+ "decoder_step": [1, _tensor_shape(decoder_ref)[1], 1],
694
+ },
695
+ "outputs": {
696
+ "token_id": [1, 1, 1],
697
+ "token_prob": [1, 1, 1],
698
+ "top_k_ids": [1, 1, 1, 64],
699
+ "top_k_logits": [1, 1, 1, 64],
700
+ },
701
+ "path": jd_single_path.name,
702
+ },
703
+ },
704
+ }
705
+
706
+ # Export tokenizer vocab if available
707
+ try:
708
+ tokenizer = asr_model.tokenizer
709
+ vocab = {
710
+ "blank_id": int(asr_model.decoder.blank_idx),
711
+ "vocab_size": vocab_size,
712
+ }
713
+ # Try to get special tokens
714
+ if hasattr(tokenizer, "tokenizer"):
715
+ inner_tokenizer = tokenizer.tokenizer
716
+ if hasattr(inner_tokenizer, "get_vocab"):
717
+ full_vocab = inner_tokenizer.get_vocab()
718
+ # Find EOU token
719
+ eou_token = None
720
+ for token, idx in full_vocab.items():
721
+ if "<EOU>" in token.upper() or "eou" in token.lower():
722
+ eou_token = {"token": token, "id": idx}
723
+ break
724
+ if eou_token:
725
+ vocab["eou_token"] = eou_token
726
+ metadata["tokenizer"] = vocab
727
+ except Exception as e:
728
+ typer.echo(f"Warning: Could not export tokenizer info: {e}")
729
+
730
+ metadata_path = output_dir / "metadata.json"
731
+ metadata_path.write_text(json.dumps(metadata, indent=2))
732
+ typer.echo(f"\nExport complete. Metadata written to {metadata_path}")
733
+ typer.echo(f"Output directory: {output_dir}")
734
+
735
+ finally:
736
+ asr_model.decoder._rnnt_export = decoder_export_flag
737
+
738
+
739
+ if __name__ == "__main__":
740
+ app()
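The exported packages are meant to be driven by a host-side greedy RNNT loop: run the decoder once per emitted token, feeding its LSTM state back in, and query the single-step joint decision per encoder frame. Below is a minimal sketch with coremltools, assuming the default output directory and the package/metadata names written above; it is illustrative host code, not part of this repository.

import json
import numpy as np
import coremltools as ct

meta = json.loads(open("parakeet_eou_coreml/metadata.json").read())
blank_id = meta["tokenizer"]["blank_id"]  # written by the tokenizer-export step above
decoder = ct.models.MLModel("parakeet_eou_coreml/parakeet_eou_decoder.mlpackage")
joint = ct.models.MLModel(
    "parakeet_eou_coreml/parakeet_eou_joint_decision_single_step.mlpackage"
)

def greedy_decode(encoder_out: np.ndarray, encoder_len: int, max_symbols: int = 5):
    # encoder_out: [1, D, T] from the mel_encoder package.
    h = np.zeros((1, 1, 640), dtype=np.float32)
    c = np.zeros((1, 1, 640), dtype=np.float32)
    last_token, tokens = blank_id, []
    for t in range(encoder_len):
        enc_step = encoder_out[:, :, t:t + 1].astype(np.float32)
        for _ in range(max_symbols):  # cap symbols emitted per frame
            dec = decoder.predict({
                "targets": np.array([[last_token]], dtype=np.int32),
                "target_length": np.array([1], dtype=np.int32),
                "h_in": h,
                "c_in": c,
            })
            jd = joint.predict({
                "encoder_step": enc_step,
                "decoder_step": dec["decoder"].astype(np.float32),
            })
            token = int(jd["token_id"].ravel()[0])
            if token == blank_id:
                break  # blank: advance to the next encoder frame
            tokens.append(token)
            last_token = token
            h, c = dec["h_out"], dec["c_out"]  # update state only on emission
    return tokens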
160ms/convert_streaming_encoder.py ADDED
@@ -0,0 +1,193 @@
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ import coremltools as ct
5
+ import numpy as np
6
+ from pathlib import Path
+ from typing import Tuple
+ import argparse
16
+ from nemo.collections.asr.models import EncDecRNNTBPEModel
17
+
+
20
+ class LoopbackEncoderWrapper(nn.Module):
21
+ """
22
+ Wraps the entire Parakeet Encoder (PreEncode + Conformer) for CoreML Loopback Streaming.
23
+
24
+ Inputs:
25
+ - audio_signal: [B, D, T] (Mel spectrogram chunk)
26
+ - audio_length: [B]
27
+ - pre_cache: [B, D, pre_cache_size] (Previous audio context)
28
+ - cache_last_channel: [layers, B, cache_size, hidden]
29
+ - cache_last_time: [layers, B, hidden, time_cache]
30
+ - cache_last_channel_len: [B]
31
+
32
+ Outputs:
33
+ - encoded_output: [B, D_out, T_out]
34
+ - encoded_length: [B]
35
+ - new_pre_cache: [B, D, pre_cache_size]
36
+ - new_cache_last_channel
37
+ - new_cache_last_time
38
+ - new_cache_last_channel_len
39
+ """
40
+ def __init__(self, encoder, pre_cache_size=16):
41
+ super().__init__()
42
+ self.encoder = encoder
43
+ self.pre_cache_size = pre_cache_size
44
+
45
+ def forward(
46
+ self,
47
+ audio_signal: torch.Tensor,
48
+ audio_length: torch.Tensor,
49
+ pre_cache: torch.Tensor,
50
+ cache_last_channel: torch.Tensor,
51
+ cache_last_time: torch.Tensor,
52
+ cache_last_channel_len: torch.Tensor
53
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
54
+
55
+ # 1. Prepend pre_cache to audio_signal
56
+ # audio_signal: [B, D, T]
57
+ # pre_cache: [B, D, T_cache]
58
+ full_input = torch.cat([pre_cache, audio_signal], dim=2)
59
+ full_length = audio_length + self.pre_cache_size
60
+
61
+ # 2. Extract NEW pre_cache (last N frames of full_input)
62
+ # Note: We do this BEFORE processing because we want the raw audio context
63
+ new_pre_cache = full_input[:, :, -self.pre_cache_size:]
64
+
65
+ # 3. Process with Encoder
66
+ # Reconstruct NeMo cache object
67
+ current_cache = [cache_last_channel, cache_last_time, cache_last_channel_len]
68
+
69
+ encoded, encoded_len, new_cache_channel, new_cache_time, new_cache_len = self.encoder.cache_aware_stream_step(
70
+ processed_signal=full_input,
71
+ processed_signal_length=full_length,
72
+ cache_last_channel=cache_last_channel,
73
+ cache_last_time=cache_last_time,
74
+ cache_last_channel_len=cache_last_channel_len
75
+ )
76
+
77
+ # 4. Output framing: NeMo's cache_aware_stream_step emits only the valid
+ # output frames for the chunk. Because this wrapper is stateless across
+ # calls, the past audio context must be supplied explicitly, so passing
+ # (pre_cache + chunk) as the input window is the correct framing.
86
+
87
+ # Cast lengths to Int32 for CoreML
88
+ encoded_len_32 = encoded_len.to(dtype=torch.int32)
89
+ new_channel_len_32 = new_cache_len.to(dtype=torch.int32)
90
+
91
+ return encoded, encoded_len_32, new_pre_cache, new_cache_channel, new_cache_time, new_channel_len_32
92
+
93
+ def _coreml_convert(
94
+ traced_model,
95
+ inputs,
96
+ outputs,
97
+ compute_units=ct.ComputeUnit.CPU_ONLY
98
+ ):
99
+ return ct.convert(
100
+ traced_model,
101
+ inputs=inputs,
102
+ outputs=outputs,
103
+ compute_units=compute_units,
104
+ minimum_deployment_target=ct.target.macOS14,
105
+ )
106
+
107
+ def main():
108
+ model_id: str = "nvidia/parakeet_realtime_eou_120m-v1"
+ output_dir: str = "temp_swift_models/StreamingLoopback"
+ output_path = Path(output_dir)
+ output_path.mkdir(parents=True, exist_ok=True)
+
+ # Parse CLI arguments before the (potentially slow) model download.
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--chunk-frames", type=int, default=17, help="Number of frames in the input chunk (e.g. 17 for 160ms, 129 for 1.28s)")
+ args = parser.parse_args()
+
+ print(f"Loading model: {model_id}...")
+ asr_model = EncDecRNNTBPEModel.from_pretrained(model_name=model_id)
+ asr_model.eval()
120
+
121
+ encoder = asr_model.encoder
122
+
123
+ # --- Configuration ---
124
+ # 160ms chunk = 16 frames (but preprocessor produces 17 with padding/centering)
125
+ # 1.28s chunk = 128 frames (preprocessor produces 129)
126
+ chunk_size_in = args.chunk_frames
127
+ mel_dim = 128
128
+ hidden_dim = encoder.d_model # 512
129
+ num_layers = len(encoder.layers) # 17
130
+
131
+ # Cache sizes
132
+ cache_channel_size = 70
133
+ cache_time_size = 8
134
+ pre_cache_size = 16
135
+
136
+ print(f"Config: Chunk={chunk_size_in}, Mel={mel_dim}, Hidden={hidden_dim}, Layers={num_layers}")
137
+ print(f"Cache: Channel={cache_channel_size}, Time={cache_time_size}, Pre={pre_cache_size}")
138
+
139
+ # --- Wrapper ---
140
+ wrapper = LoopbackEncoderWrapper(encoder, pre_cache_size=pre_cache_size)
141
+ wrapper.eval()
142
+
143
+ # --- Test Inputs (for Tracing) ---
144
+ batch_size = 1
145
+ test_mel = torch.randn(batch_size, mel_dim, chunk_size_in)
146
+ test_mel_len = torch.tensor([chunk_size_in], dtype=torch.int32)
147
+ test_pre_cache = torch.zeros(batch_size, mel_dim, pre_cache_size)
148
+
149
+ # Initial Cache (Zeros)
150
+ test_cache_channel = torch.zeros(num_layers, batch_size, cache_channel_size, hidden_dim)
151
+ test_cache_time = torch.zeros(num_layers, batch_size, hidden_dim, cache_time_size)
152
+ test_cache_len = torch.zeros(batch_size, dtype=torch.int32)
153
+
154
+ print("Tracing model...")
155
+ traced_model = torch.jit.trace(
156
+ wrapper,
157
+ (test_mel, test_mel_len, test_pre_cache, test_cache_channel, test_cache_time, test_cache_len),
158
+ strict=False
159
+ )
160
+
161
+ # --- CoreML Conversion ---
162
+ print("Converting to CoreML...")
163
+
164
+ inputs = [
165
+ ct.TensorType(name="audio_signal", shape=(1, 128, chunk_size_in), dtype=np.float32),
166
+ ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
167
+ ct.TensorType(name="pre_cache", shape=(1, 128, pre_cache_size), dtype=np.float32),
168
+ ct.TensorType(name="cache_last_channel", shape=(num_layers, 1, cache_channel_size, hidden_dim), dtype=np.float32),
169
+ ct.TensorType(name="cache_last_time", shape=(num_layers, 1, hidden_dim, cache_time_size), dtype=np.float32),
170
+ ct.TensorType(name="cache_last_channel_len", shape=(1,), dtype=np.int32),
171
+ ]
172
+
173
+ outputs = [
174
+ ct.TensorType(name="encoded_output", dtype=np.float32),
175
+ ct.TensorType(name="encoded_length", dtype=np.int32),
176
+ ct.TensorType(name="new_pre_cache", dtype=np.float32),
177
+ ct.TensorType(name="new_cache_last_channel", dtype=np.float32),
178
+ ct.TensorType(name="new_cache_last_time", dtype=np.float32),
179
+ ct.TensorType(name="new_cache_last_channel_len", dtype=np.int32),
180
+ ]
181
+
182
+ mlmodel = _coreml_convert(traced_model, inputs, outputs)
183
+
184
+ save_path = output_path / "streaming_encoder.mlpackage"
185
+ mlmodel.save(str(save_path))
186
+ print(f"Saved: {save_path}")
187
+
188
+ # The preprocessor, decoder, and joint are exported separately
+ # (see convert_parakeet_eou.py); this script covers only the encoder loopback.
191
+
192
+ if __name__ == "__main__":
193
+ main()
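At runtime the loopback encoder is stateless: the host owns pre_cache and the two conformer caches, and feeds each output straight back in with the next mel chunk. A minimal driver sketch for the 160ms configuration above; mel_chunks is a hypothetical iterable of [1, 128, 17] mel chunks, and the package path matches this script's default.

import numpy as np
import coremltools as ct

enc = ct.models.MLModel("temp_swift_models/StreamingLoopback/streaming_encoder.mlpackage")

num_layers, hidden, chunk = 17, 512, 17
pre_cache = np.zeros((1, 128, 16), dtype=np.float32)
cache_channel = np.zeros((num_layers, 1, 70, hidden), dtype=np.float32)
cache_time = np.zeros((num_layers, 1, hidden, 8), dtype=np.float32)
cache_len = np.zeros((1,), dtype=np.int32)

for mel_chunk in mel_chunks:  # hypothetical mel source, each [1, 128, 17]
    out = enc.predict({
        "audio_signal": mel_chunk.astype(np.float32),
        "audio_length": np.array([chunk], dtype=np.int32),
        "pre_cache": pre_cache,
        "cache_last_channel": cache_channel,
        "cache_last_time": cache_time,
        "cache_last_channel_len": cache_len,
    })
    encoded = out["encoded_output"]  # [1, D_out, T_out] for this chunk
    # Loop every cache output back in for the next call.
    pre_cache = out["new_pre_cache"]
    cache_channel = out["new_cache_last_channel"]
    cache_time = out["new_cache_last_time"]
    cache_len = out["new_cache_last_channel_len"].astype(np.int32)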
160ms/decoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3996975a8cbc1949159c55605b3132b39b2484f51acbd55d796d93c70de02b49
3
+ size 243
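The binary artifacts in this commit are stored as Git LFS pointers: three-line stubs recording the spec version, the blob's sha256 oid, and its byte size, while the actual bytes live in LFS storage. A quick integrity check of a fetched blob against its pointer fields (hypothetical helper, standard library only):

import hashlib

def verify_lfs_blob(blob_path: str, expected_oid: str, expected_size: int) -> None:
    # Compare the downloaded blob against the oid/size in the LFS pointer.
    data = open(blob_path, "rb").read()
    assert len(data) == expected_size, "size mismatch"
    assert hashlib.sha256(data).hexdigest() == expected_oid, "oid mismatch"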
160ms/decoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3ccbff963d8cf07e2be2bd56ea3384a89ea49628922c6bd95ff62e2ae57dc34
3
+ size 497
160ms/decoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,118 @@
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "shortDescription" : "Parakeet EOU decoder (RNNT prediction network)",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Float32",
10
+ "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
11
+ "shortDescription" : "",
12
+ "shape" : "[1, 640, 1]",
13
+ "name" : "decoder",
14
+ "type" : "MultiArray"
15
+ },
16
+ {
17
+ "hasShapeFlexibility" : "0",
18
+ "isOptional" : "0",
19
+ "dataType" : "Float32",
20
+ "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
21
+ "shortDescription" : "",
22
+ "shape" : "[1, 1, 640]",
23
+ "name" : "h_out",
24
+ "type" : "MultiArray"
25
+ },
26
+ {
27
+ "hasShapeFlexibility" : "0",
28
+ "isOptional" : "0",
29
+ "dataType" : "Float32",
30
+ "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
31
+ "shortDescription" : "",
32
+ "shape" : "[1, 1, 640]",
33
+ "name" : "c_out",
34
+ "type" : "MultiArray"
35
+ }
36
+ ],
37
+ "storagePrecision" : "Float16",
38
+ "modelParameters" : [
39
+
40
+ ],
41
+ "author" : "Fluid Inference",
42
+ "specificationVersion" : 8,
43
+ "mlProgramOperationTypeHistogram" : {
44
+ "Ios17.squeeze" : 2,
45
+ "Ios17.gather" : 1,
46
+ "Ios17.cast" : 6,
47
+ "Ios17.lstm" : 1,
48
+ "Ios17.transpose" : 2,
49
+ "Identity" : 1,
50
+ "Ios17.expandDims" : 2
51
+ },
52
+ "computePrecision" : "Mixed (Float16, Float32, Int16, Int32)",
53
+ "isUpdatable" : "0",
54
+ "stateSchema" : [
55
+
56
+ ],
57
+ "availability" : {
58
+ "macOS" : "14.0",
59
+ "tvOS" : "17.0",
60
+ "visionOS" : "1.0",
61
+ "watchOS" : "10.0",
62
+ "iOS" : "17.0",
63
+ "macCatalyst" : "17.0"
64
+ },
65
+ "modelType" : {
66
+ "name" : "MLModelType_mlProgram"
67
+ },
68
+ "inputSchema" : [
69
+ {
70
+ "hasShapeFlexibility" : "0",
71
+ "isOptional" : "0",
72
+ "dataType" : "Int32",
73
+ "formattedType" : "MultiArray (Int32 1 × 1)",
74
+ "shortDescription" : "",
75
+ "shape" : "[1, 1]",
76
+ "name" : "targets",
77
+ "type" : "MultiArray"
78
+ },
79
+ {
80
+ "hasShapeFlexibility" : "0",
81
+ "isOptional" : "0",
82
+ "dataType" : "Int32",
83
+ "formattedType" : "MultiArray (Int32 1)",
84
+ "shortDescription" : "",
85
+ "shape" : "[1]",
86
+ "name" : "target_length",
87
+ "type" : "MultiArray"
88
+ },
89
+ {
90
+ "hasShapeFlexibility" : "0",
91
+ "isOptional" : "0",
92
+ "dataType" : "Float32",
93
+ "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
94
+ "shortDescription" : "",
95
+ "shape" : "[1, 1, 640]",
96
+ "name" : "h_in",
97
+ "type" : "MultiArray"
98
+ },
99
+ {
100
+ "hasShapeFlexibility" : "0",
101
+ "isOptional" : "0",
102
+ "dataType" : "Float32",
103
+ "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
104
+ "shortDescription" : "",
105
+ "shape" : "[1, 1, 640]",
106
+ "name" : "c_in",
107
+ "type" : "MultiArray"
108
+ }
109
+ ],
110
+ "userDefinedMetadata" : {
111
+ "com.github.apple.coremltools.version" : "8.3.0",
112
+ "com.github.apple.coremltools.source" : "torch==2.4.0",
113
+ "com.github.apple.coremltools.source_dialect" : "TorchScript"
114
+ },
115
+ "generatedClassName" : "parakeet_eou_decoder",
116
+ "method" : "predict"
117
+ }
118
+ ]
160ms/decoder.mlmodelc/model.mil ADDED
@@ -0,0 +1,45 @@
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, 1, 640]> c_in, tensor<fp32, [1, 1, 640]> h_in, tensor<int32, [1]> target_length, tensor<int32, [1, 1]> targets) {
5
+ tensor<int32, []> y_axis_0 = const()[name = tensor<string, []>("y_axis_0"), val = tensor<int32, []>(0)];
6
+ tensor<int32, []> y_batch_dims_0 = const()[name = tensor<string, []>("y_batch_dims_0"), val = tensor<int32, []>(0)];
7
+ tensor<bool, []> y_validate_indices_0 = const()[name = tensor<string, []>("y_validate_indices_0"), val = tensor<bool, []>(false)];
8
+ tensor<fp16, [1027, 640]> module_prediction_embed_weight_to_fp16 = const()[name = tensor<string, []>("module_prediction_embed_weight_to_fp16"), val = tensor<fp16, [1027, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
9
+ tensor<string, []> targets_to_int16_dtype_0 = const()[name = tensor<string, []>("targets_to_int16_dtype_0"), val = tensor<string, []>("int16")];
10
+ tensor<int16, [1, 1]> targets_to_int16 = cast(dtype = targets_to_int16_dtype_0, x = targets)[name = tensor<string, []>("cast_8")];
11
+ tensor<fp16, [1, 1, 640]> y_cast_fp16_cast_uint16 = gather(axis = y_axis_0, batch_dims = y_batch_dims_0, indices = targets_to_int16, validate_indices = y_validate_indices_0, x = module_prediction_embed_weight_to_fp16)[name = tensor<string, []>("y_cast_fp16_cast_uint16")];
12
+ tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
13
+ tensor<int32, [1]> input_lstm_h0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_h0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
14
+ tensor<string, []> h_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("h_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
15
+ tensor<fp16, [1, 1, 640]> h_in_to_fp16 = cast(dtype = h_in_to_fp16_dtype_0, x = h_in)[name = tensor<string, []>("cast_7")];
16
+ tensor<fp16, [1, 640]> input_lstm_h0_squeeze_cast_fp16 = squeeze(axes = input_lstm_h0_squeeze_axes_0, x = h_in_to_fp16)[name = tensor<string, []>("input_lstm_h0_squeeze_cast_fp16")];
17
+ tensor<int32, [1]> input_lstm_c0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_c0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
18
+ tensor<string, []> c_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("c_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
19
+ tensor<fp16, [1, 1, 640]> c_in_to_fp16 = cast(dtype = c_in_to_fp16_dtype_0, x = c_in)[name = tensor<string, []>("cast_6")];
20
+ tensor<fp16, [1, 640]> input_lstm_c0_squeeze_cast_fp16 = squeeze(axes = input_lstm_c0_squeeze_axes_0, x = c_in_to_fp16)[name = tensor<string, []>("input_lstm_c0_squeeze_cast_fp16")];
21
+ tensor<string, []> input_direction_0 = const()[name = tensor<string, []>("input_direction_0"), val = tensor<string, []>("forward")];
22
+ tensor<bool, []> input_output_sequence_0 = const()[name = tensor<string, []>("input_output_sequence_0"), val = tensor<bool, []>(true)];
23
+ tensor<string, []> input_recurrent_activation_0 = const()[name = tensor<string, []>("input_recurrent_activation_0"), val = tensor<string, []>("sigmoid")];
24
+ tensor<string, []> input_cell_activation_0 = const()[name = tensor<string, []>("input_cell_activation_0"), val = tensor<string, []>("tanh")];
25
+ tensor<string, []> input_activation_0 = const()[name = tensor<string, []>("input_activation_0"), val = tensor<string, []>("tanh")];
26
+ tensor<fp16, [2560, 640]> concat_1_to_fp16 = const()[name = tensor<string, []>("concat_1_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1314688)))];
27
+ tensor<fp16, [2560, 640]> concat_2_to_fp16 = const()[name = tensor<string, []>("concat_2_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4591552)))];
28
+ tensor<fp16, [2560]> concat_0_to_fp16 = const()[name = tensor<string, []>("concat_0_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7868416)))];
29
+ tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = y_cast_fp16_cast_uint16)[name = tensor<string, []>("transpose_2")];
30
+ tensor<fp16, [1, 1, 640]> input_cast_fp16_0, tensor<fp16, [1, 640]> input_cast_fp16_1, tensor<fp16, [1, 640]> input_cast_fp16_2 = lstm(activation = input_activation_0, bias = concat_0_to_fp16, cell_activation = input_cell_activation_0, direction = input_direction_0, initial_c = input_lstm_c0_squeeze_cast_fp16, initial_h = input_lstm_h0_squeeze_cast_fp16, output_sequence = input_output_sequence_0, recurrent_activation = input_recurrent_activation_0, weight_hh = concat_2_to_fp16, weight_ih = concat_1_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
31
+ tensor<int32, [1]> obj_3_axes_0 = const()[name = tensor<string, []>("obj_3_axes_0"), val = tensor<int32, [1]>([0])];
32
+ tensor<fp16, [1, 1, 640]> obj_3_cast_fp16 = expand_dims(axes = obj_3_axes_0, x = input_cast_fp16_1)[name = tensor<string, []>("obj_3_cast_fp16")];
33
+ tensor<string, []> obj_3_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_3_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
34
+ tensor<int32, [1]> obj_axes_0 = const()[name = tensor<string, []>("obj_axes_0"), val = tensor<int32, [1]>([0])];
35
+ tensor<fp16, [1, 1, 640]> obj_cast_fp16 = expand_dims(axes = obj_axes_0, x = input_cast_fp16_2)[name = tensor<string, []>("obj_cast_fp16")];
36
+ tensor<string, []> obj_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
37
+ tensor<int32, [3]> transpose_0_perm_0 = const()[name = tensor<string, []>("transpose_0_perm_0"), val = tensor<int32, [3]>([1, 2, 0])];
38
+ tensor<string, []> transpose_0_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("transpose_0_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
39
+ tensor<fp16, [1, 640, 1]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = input_cast_fp16_0)[name = tensor<string, []>("transpose_1")];
40
+ tensor<fp32, [1, 640, 1]> decoder = cast(dtype = transpose_0_cast_fp16_to_fp32_dtype_0, x = transpose_0_cast_fp16)[name = tensor<string, []>("cast_3")];
41
+ tensor<fp32, [1, 1, 640]> c_out = cast(dtype = obj_cast_fp16_to_fp32_dtype_0, x = obj_cast_fp16)[name = tensor<string, []>("cast_4")];
42
+ tensor<fp32, [1, 1, 640]> h_out = cast(dtype = obj_3_cast_fp16_to_fp32_dtype_0, x = obj_3_cast_fp16)[name = tensor<string, []>("cast_5")];
43
+ tensor<int32, [1]> target_length_tmp = identity(x = target_length)[name = tensor<string, []>("target_length_tmp")];
44
+ } -> (decoder, h_out, c_out);
45
+ }
160ms/decoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b4cacecdcd9df79ab1e56de67230baf5a8664d2afe0bb8f3408eefa972cb2f4
3
+ size 7873600
160ms/decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09f2dbd1f6a06faa6995f71d4b25d7c446996b6059cfac5ecc889853bdc7c6e5
3
+ size 6728
160ms/decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b4cacecdcd9df79ab1e56de67230baf5a8664d2afe0bb8f3408eefa972cb2f4
3
+ size 7873600
160ms/decoder.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "8201D73A-2B5D-488C-9C2B-7E2E75DF700D": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "F8EEBE8D-F17D-4556-B8DF-9BC11701B36D": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "F8EEBE8D-F17D-4556-B8DF-9BC11701B36D"
18
+ }
160ms/individual_components.py ADDED
@@ -0,0 +1,250 @@
1
+ #!/usr/bin/env python3
2
+ """Export Parakeet Realtime EOU 120M RNNT components into CoreML."""
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Optional, Tuple
8
+
9
+ import coremltools as ct
10
+ import torch
11
+
12
+
13
+ @dataclass
14
+ class ExportSettings:
15
+ output_dir: Path
16
+ compute_units: ct.ComputeUnit
17
+ deployment_target: Optional[ct.target]
18
+ compute_precision: Optional[ct.precision]
19
+ max_audio_seconds: float
20
+ max_symbol_steps: int
21
+
22
+
23
+ class PreprocessorWrapper(torch.nn.Module):
24
+ """Wrapper for the audio preprocessor (mel spectrogram extraction)."""
25
+
26
+ def __init__(self, module: torch.nn.Module) -> None:
27
+ super().__init__()
28
+ self.module = module
29
+
30
+ def forward(
31
+ self, audio_signal: torch.Tensor, length: torch.Tensor
32
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
33
+ mel, mel_length = self.module(
34
+ input_signal=audio_signal, length=length.to(dtype=torch.long)
35
+ )
36
+ return mel, mel_length
37
+
38
+
39
+ class EncoderWrapper(torch.nn.Module):
40
+ """Wrapper for the cache-aware FastConformer encoder.
41
+
42
+ Note: For the realtime EOU model, the encoder is cache-aware which means
43
+ it can operate in a streaming fashion. For CoreML export, we export
44
+ without cache state for simplicity (full-context mode).
45
+ """
46
+
47
+ def __init__(self, module: torch.nn.Module) -> None:
48
+ super().__init__()
49
+ self.module = module
50
+
51
+ def forward(
52
+ self, features: torch.Tensor, length: torch.Tensor
53
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
54
+ encoded, encoded_lengths = self.module(
55
+ audio_signal=features, length=length.to(dtype=torch.long)
56
+ )
57
+ # Synthesize per-frame timestamps (seconds) using the 80 ms encoder stride.
58
+ # Shape: [T_enc] (one timestamp per encoder frame)
59
+ frame_times = (
60
+ torch.arange(encoded.shape[-1], device=encoded.device, dtype=torch.float32)
61
+ * 0.08
62
+ )
63
+ return encoded, encoded_lengths, frame_times
64
+
65
+
66
+ class DecoderWrapper(torch.nn.Module):
67
+ """Wrapper for the RNNT prediction network (decoder)."""
68
+
69
+ def __init__(self, module: torch.nn.Module) -> None:
70
+ super().__init__()
71
+ self.module = module
72
+
73
+ def forward(
74
+ self,
75
+ targets: torch.Tensor,
76
+ target_lengths: torch.Tensor,
77
+ h_in: torch.Tensor,
78
+ c_in: torch.Tensor,
79
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
80
+ state = [h_in, c_in]
81
+ decoder_output, _, new_state = self.module(
82
+ targets=targets.to(dtype=torch.long),
83
+ target_length=target_lengths.to(dtype=torch.long),
84
+ states=state,
85
+ )
86
+ return decoder_output, new_state[0], new_state[1]
87
+
88
+
89
+ class JointWrapper(torch.nn.Module):
+     """Wrapper for the RNNT joint network.
+
+     Note: Unlike Parakeet TDT v3, the realtime EOU model does NOT have
+     duration outputs (num_extra_outputs). The joint network outputs only
+     token logits over the vocabulary + blank.
+     """
+
+     def __init__(self, module: torch.nn.Module) -> None:
+         super().__init__()
+         self.module = module
+
+     def forward(
+         self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor
+     ) -> torch.Tensor:
+         # Inputs: encoder_outputs [B, D, T], decoder_outputs [B, D, U].
+         # Transpose to match what the projection layers expect.
+         encoder_outputs = encoder_outputs.transpose(1, 2)  # [B, T, D]
+         decoder_outputs = decoder_outputs.transpose(1, 2)  # [B, U, D]
+
+         # Apply projections
+         enc_proj = self.module.enc(encoder_outputs)  # [B, T, joint_hidden]
+         dec_proj = self.module.pred(decoder_outputs)  # [B, U, joint_hidden]
+
+         # Explicit broadcasting along T and U to avoid converter ambiguity
+         x = enc_proj.unsqueeze(2) + dec_proj.unsqueeze(1)  # [B, T, U, joint_hidden]
+         x = self.module.joint_net[0](x)  # ReLU
+         x = self.module.joint_net[1](x)  # Dropout (no-op in eval)
+         out = self.module.joint_net[2](x)  # Linear -> logits [B, T, U, vocab+blank]
+         return out
+
+
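# Aside (not part of the committed file): shape check for the explicit
# unsqueeze-and-add broadcast, using joint_hidden = 640 as in the exported
# weights and arbitrary T/U.
import torch

B, T, U, H = 1, 16, 3, 640
enc_proj = torch.zeros(B, T, H)
dec_proj = torch.zeros(B, U, H)
x = enc_proj.unsqueeze(2) + dec_proj.unsqueeze(1)
print(x.shape)  # torch.Size([1, 16, 3, 640])
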
+ class MelEncoderWrapper(torch.nn.Module):
+     """Fused wrapper: waveform -> mel -> encoder.
+
+     Inputs:
+       - audio_signal: [B, S]
+       - audio_length: [B]
+
+     Outputs:
+       - encoder: [B, D, T_enc]
+       - encoder_length: [B]
+       - frame_times: [T_enc]
+     """
+
+     def __init__(
+         self, preprocessor: PreprocessorWrapper, encoder: EncoderWrapper
+     ) -> None:
+         super().__init__()
+         self.preprocessor = preprocessor
+         self.encoder = encoder
+
+     def forward(
+         self, audio_signal: torch.Tensor, audio_length: torch.Tensor
+     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+         mel, mel_length = self.preprocessor(audio_signal, audio_length)
+         encoded, enc_len, frame_times = self.encoder(
+             mel, mel_length.to(dtype=torch.int32)
+         )
+         return encoded, enc_len, frame_times
+
+
+ class JointDecisionWrapper(torch.nn.Module):
+     """Joint + decision head: outputs label id and label prob.
+
+     Unlike Parakeet TDT v3, this model does NOT have duration outputs.
+
+     Inputs:
+       - encoder_outputs: [B, D, T]
+       - decoder_outputs: [B, D, U]
+
+     Returns:
+       - token_id: [B, T, U] int32
+       - token_prob: [B, T, U] float32
+     """
+
+     def __init__(self, joint: JointWrapper, vocab_size: int) -> None:
+         super().__init__()
+         self.joint = joint
+         self.vocab_with_blank = int(vocab_size) + 1
+
+     def forward(self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor):
+         logits = self.joint(encoder_outputs, decoder_outputs)
+
+         # Token selection
+         token_ids = torch.argmax(logits, dim=-1).to(dtype=torch.int32)
+         token_probs_all = torch.softmax(logits, dim=-1)
+         # gather expects int64 (long) indices; cast only for the gather
+         token_prob = torch.gather(
+             token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
+         ).squeeze(-1)
+
+         return token_ids, token_prob
+
+
+ class JointDecisionSingleStep(torch.nn.Module):
+     """Single-step variant for streaming: encoder_step -> token decision.
+
+     Inputs:
+       - encoder_step: [B=1, D, T=1]
+       - decoder_step: [B=1, D, U=1]
+
+     Returns:
+       - token_id: [1, 1, 1] int32
+       - token_prob: [1, 1, 1] float32
+       - top_k_ids: [1, 1, 1, K] int32
+       - top_k_logits: [1, 1, 1, K] float32
+     """
+
+     def __init__(self, joint: JointWrapper, vocab_size: int, top_k: int = 64) -> None:
+         super().__init__()
+         self.joint = joint
+         self.vocab_with_blank = int(vocab_size) + 1
+         self.top_k = int(top_k)
+
+     def forward(self, encoder_step: torch.Tensor, decoder_step: torch.Tensor):
+         # Reuse JointWrapper, which expects [B, D, T] and [B, D, U]
+         logits = self.joint(encoder_step, decoder_step)  # [1, 1, 1, V+blank]
+
+         token_ids = torch.argmax(logits, dim=-1, keepdim=False).to(dtype=torch.int32)
+         token_probs_all = torch.softmax(logits, dim=-1)
+         token_prob = torch.gather(
+             token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
+         ).squeeze(-1)
+
+         # Also expose the top-K candidates for host-side processing
+         topk_logits, topk_ids_long = torch.topk(
+             logits, k=min(self.top_k, logits.shape[-1]), dim=-1
+         )
+         topk_ids = topk_ids_long.to(dtype=torch.int32)
+         return token_ids, token_prob, topk_ids, topk_logits
+
+
+ def _coreml_convert(
+     traced: torch.jit.ScriptModule,
+     inputs,
+     outputs,
+     settings: ExportSettings,
+     compute_units_override: Optional[ct.ComputeUnit] = None,
+     compute_precision: Optional[ct.precision] = None,
+ ) -> ct.models.MLModel:
+     cu = (
+         compute_units_override
+         if compute_units_override is not None
+         else settings.compute_units
+     )
+     kwargs = {
+         "convert_to": "mlprogram",
+         "inputs": inputs,
+         "outputs": outputs,
+         "compute_units": cu,
+     }
+     if settings.deployment_target is not None:
+         kwargs["minimum_deployment_target"] = settings.deployment_target
+
+     # Priority: explicit argument > settings
+     if compute_precision is not None:
+         kwargs["compute_precision"] = compute_precision
+     elif settings.compute_precision is not None:
+         kwargs["compute_precision"] = settings.compute_precision
+
+     # Log after kwargs is fully assembled so the printout matches the call.
+     print("Converting:", traced.__class__.__name__)
+     print("Conversion kwargs:", kwargs)
+     return ct.convert(traced, **kwargs)
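For orientation before the compiled artifacts below: a minimal sketch of how these wrappers could be driven to export the 160ms joint-decision model. It assumes `asr_model` is an already-loaded NeMo RNNT EOU checkpoint, and reads the dimensions (encoder 512, decoder 640, 1026 tokens + blank) off the metadata that follows; this driver code is not part of the commit.

# Hypothetical export driver (sketch; `asr_model` is assumed, not defined here).
import coremltools as ct
import torch
from pathlib import Path

settings = ExportSettings(
    output_dir=Path("160ms"),
    compute_units=ct.ComputeUnit.ALL,
    deployment_target=ct.target.iOS17,
    compute_precision=ct.precision.FLOAT16,
    max_audio_seconds=0.16,
    max_symbol_steps=10,
)

joint = JointWrapper(asr_model.joint).eval()
single_step = JointDecisionSingleStep(joint, vocab_size=1026).eval()
traced = torch.jit.trace(
    single_step, (torch.zeros(1, 512, 1), torch.zeros(1, 640, 1))
)

mlmodel = _coreml_convert(
    traced,
    inputs=[
        ct.TensorType(name="encoder_step", shape=(1, 512, 1)),
        ct.TensorType(name="decoder_step", shape=(1, 640, 1)),
    ],
    outputs=[
        ct.TensorType(name="token_id"),
        ct.TensorType(name="token_prob"),
        ct.TensorType(name="top_k_ids"),
        ct.TensorType(name="top_k_logits"),
    ],
    settings=settings,
)
mlmodel.save(str(settings.output_dir / "joint_decision.mlpackage"))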
160ms/joint_decision.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5bca32ad130dcad6605cc00044c752aa5b45ef57d14c17f2d1a2fa49d6cf55b5
+ size 243
160ms/joint_decision.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22d4abc4625b935ee035b5f8ce7cb28d1041b9b01c12173e287bf4b5f5d99625
+ size 493
160ms/joint_decision.mlmodelc/metadata.json ADDED
@@ -0,0 +1,112 @@
+ [
+   {
+     "metadataOutputVersion" : "3.0",
+     "shortDescription" : "Parakeet EOU single-step joint decision",
+     "outputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1 × 1 × 1)",
+         "shortDescription" : "",
+         "shape" : "[1, 1, 1]",
+         "name" : "token_id",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 1 × 1)",
+         "shortDescription" : "",
+         "shape" : "[1, 1, 1]",
+         "name" : "token_prob",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1 × 1 × 1 × 64)",
+         "shortDescription" : "",
+         "shape" : "[1, 1, 1, 64]",
+         "name" : "top_k_ids",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 1 × 1 × 64)",
+         "shortDescription" : "",
+         "shape" : "[1, 1, 1, 64]",
+         "name" : "top_k_logits",
+         "type" : "MultiArray"
+       }
+     ],
+     "storagePrecision" : "Float16",
+     "modelParameters" : [
+
+     ],
+     "author" : "Fluid Inference",
+     "specificationVersion" : 8,
+     "mlProgramOperationTypeHistogram" : {
+       "Ios17.reduceArgmax" : 1,
+       "Ios17.squeeze" : 1,
+       "Ios17.cast" : 6,
+       "Ios17.linear" : 3,
+       "Ios17.transpose" : 2,
+       "Ios17.add" : 1,
+       "Ios16.relu" : 1,
+       "Ios16.softmax" : 1,
+       "Ios17.gatherAlongAxis" : 1,
+       "Ios17.topk" : 1,
+       "Ios17.expandDims" : 3
+     },
+     "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
+     "isUpdatable" : "0",
+     "stateSchema" : [
+
+     ],
+     "availability" : {
+       "macOS" : "14.0",
+       "tvOS" : "17.0",
+       "visionOS" : "1.0",
+       "watchOS" : "10.0",
+       "iOS" : "17.0",
+       "macCatalyst" : "17.0"
+     },
+     "modelType" : {
+       "name" : "MLModelType_mlProgram"
+     },
+     "inputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 512 × 1)",
+         "shortDescription" : "",
+         "shape" : "[1, 512, 1]",
+         "name" : "encoder_step",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
+         "shortDescription" : "",
+         "shape" : "[1, 640, 1]",
+         "name" : "decoder_step",
+         "type" : "MultiArray"
+       }
+     ],
+     "userDefinedMetadata" : {
+       "com.github.apple.coremltools.source_dialect" : "TorchScript",
+       "com.github.apple.coremltools.version" : "8.3.0",
+       "com.github.apple.coremltools.source" : "torch==2.4.0"
+     },
+     "generatedClassName" : "parakeet_eou_joint_decision_single_step",
+     "method" : "predict"
+   }
+ ]
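
Given the fixed single-step shapes in the schema above, a quick smoke test of the packaged model (on macOS, assuming this repo is checked out with the LFS payloads materialized) could look like:

import coremltools as ct
import numpy as np

model = ct.models.MLModel("160ms/joint_decision.mlpackage")
out = model.predict({
    "encoder_step": np.zeros((1, 512, 1), dtype=np.float32),
    "decoder_step": np.zeros((1, 640, 1), dtype=np.float32),
})
print(int(out["token_id"].ravel()[0]), float(out["token_prob"].ravel()[0]))
print(out["top_k_ids"].shape, out["top_k_logits"].shape)  # (1, 1, 1, 64) each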
160ms/joint_decision.mlmodelc/model.mil ADDED
@@ -0,0 +1,57 @@
+ program(1.0)
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
+ {
+     func main<ios17>(tensor<fp32, [1, 640, 1]> decoder_step, tensor<fp32, [1, 512, 1]> encoder_step) {
+         tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+         tensor<string, []> encoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
+         tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+         tensor<string, []> decoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("decoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
+         tensor<fp16, [640, 512]> joint_module_enc_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_weight_to_fp16"), val = tensor<fp16, [640, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+         tensor<fp16, [640]> joint_module_enc_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(655488)))];
+         tensor<fp16, [1, 512, 1]> encoder_step_to_fp16 = cast(dtype = encoder_step_to_fp16_dtype_0, x = encoder_step)[name = tensor<string, []>("cast_8")];
+         tensor<fp16, [1, 1, 512]> input_1_cast_fp16 = transpose(perm = input_1_perm_0, x = encoder_step_to_fp16)[name = tensor<string, []>("transpose_1")];
+         tensor<fp16, [1, 1, 640]> linear_0_cast_fp16 = linear(bias = joint_module_enc_bias_to_fp16, weight = joint_module_enc_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("linear_0_cast_fp16")];
+         tensor<fp16, [640, 640]> joint_module_pred_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_weight_to_fp16"), val = tensor<fp16, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(656832)))];
+         tensor<fp16, [640]> joint_module_pred_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1476096)))];
+         tensor<fp16, [1, 640, 1]> decoder_step_to_fp16 = cast(dtype = decoder_step_to_fp16_dtype_0, x = decoder_step)[name = tensor<string, []>("cast_7")];
+         tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = decoder_step_to_fp16)[name = tensor<string, []>("transpose_0")];
+         tensor<fp16, [1, 1, 640]> linear_1_cast_fp16 = linear(bias = joint_module_pred_bias_to_fp16, weight = joint_module_pred_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("linear_1_cast_fp16")];
+         tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
+         tensor<fp16, [1, 1, 1, 640]> var_23_cast_fp16 = expand_dims(axes = var_23_axes_0, x = linear_0_cast_fp16)[name = tensor<string, []>("op_23_cast_fp16")];
+         tensor<int32, [1]> var_24_axes_0 = const()[name = tensor<string, []>("op_24_axes_0"), val = tensor<int32, [1]>([1])];
+         tensor<fp16, [1, 1, 1, 640]> var_24_cast_fp16 = expand_dims(axes = var_24_axes_0, x = linear_1_cast_fp16)[name = tensor<string, []>("op_24_cast_fp16")];
+         tensor<fp16, [1, 1, 1, 640]> input_5_cast_fp16 = add(x = var_23_cast_fp16, y = var_24_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+         tensor<fp16, [1, 1, 1, 640]> input_7_cast_fp16 = relu(x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+         tensor<fp16, [1027, 640]> joint_module_joint_net_2_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_weight_to_fp16"), val = tensor<fp16, [1027, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1477440)))];
+         tensor<fp16, [1027]> joint_module_joint_net_2_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_bias_to_fp16"), val = tensor<fp16, [1027]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2792064)))];
+         tensor<fp16, [1, 1, 1, 1027]> linear_2_cast_fp16 = linear(bias = joint_module_joint_net_2_bias_to_fp16, weight = joint_module_joint_net_2_weight_to_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("linear_2_cast_fp16")];
+         tensor<int32, []> var_38_axis_0 = const()[name = tensor<string, []>("op_38_axis_0"), val = tensor<int32, []>(-1)];
+         tensor<bool, []> var_38_keep_dims_0 = const()[name = tensor<string, []>("op_38_keep_dims_0"), val = tensor<bool, []>(false)];
+         tensor<string, []> var_38_output_dtype_0 = const()[name = tensor<string, []>("op_38_output_dtype_0"), val = tensor<string, []>("int32")];
+         tensor<int32, [1, 1, 1]> token_id = reduce_argmax(axis = var_38_axis_0, keep_dims = var_38_keep_dims_0, output_dtype = var_38_output_dtype_0, x = linear_2_cast_fp16)[name = tensor<string, []>("op_38_cast_fp16")];
+         tensor<int32, []> var_44 = const()[name = tensor<string, []>("op_44"), val = tensor<int32, []>(-1)];
+         tensor<fp16, [1, 1, 1, 1027]> token_probs_all_cast_fp16 = softmax(axis = var_44, x = linear_2_cast_fp16)[name = tensor<string, []>("token_probs_all_cast_fp16")];
+         tensor<int32, [1]> var_53_axes_0 = const()[name = tensor<string, []>("op_53_axes_0"), val = tensor<int32, [1]>([-1])];
+         tensor<int32, [1, 1, 1, 1]> var_53 = expand_dims(axes = var_53_axes_0, x = token_id)[name = tensor<string, []>("op_53")];
+         tensor<int32, []> var_54 = const()[name = tensor<string, []>("op_54"), val = tensor<int32, []>(-1)];
+         tensor<bool, []> var_56_validate_indices_0 = const()[name = tensor<string, []>("op_56_validate_indices_0"), val = tensor<bool, []>(false)];
+         tensor<string, []> var_53_to_int16_dtype_0 = const()[name = tensor<string, []>("op_53_to_int16_dtype_0"), val = tensor<string, []>("int16")];
+         tensor<int16, [1, 1, 1, 1]> var_53_to_int16 = cast(dtype = var_53_to_int16_dtype_0, x = var_53)[name = tensor<string, []>("cast_6")];
+         tensor<fp16, [1, 1, 1, 1]> var_56_cast_fp16_cast_int16 = gather_along_axis(axis = var_54, indices = var_53_to_int16, validate_indices = var_56_validate_indices_0, x = token_probs_all_cast_fp16)[name = tensor<string, []>("op_56_cast_fp16_cast_int16")];
+         tensor<int32, [1]> var_58_axes_0 = const()[name = tensor<string, []>("op_58_axes_0"), val = tensor<int32, [1]>([-1])];
+         tensor<fp16, [1, 1, 1]> var_58_cast_fp16 = squeeze(axes = var_58_axes_0, x = var_56_cast_fp16_cast_int16)[name = tensor<string, []>("op_58_cast_fp16")];
+         tensor<string, []> var_58_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_58_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
+         tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(64)];
+         tensor<int32, []> var_63_axis_0 = const()[name = tensor<string, []>("op_63_axis_0"), val = tensor<int32, []>(-1)];
+         tensor<bool, []> var_63_ascending_0 = const()[name = tensor<string, []>("op_63_ascending_0"), val = tensor<bool, []>(false)];
+         tensor<bool, []> var_63_sort_0 = const()[name = tensor<string, []>("op_63_sort_0"), val = tensor<bool, []>(true)];
+         tensor<bool, []> var_63_return_indices_0 = const()[name = tensor<string, []>("op_63_return_indices_0"), val = tensor<bool, []>(true)];
+         tensor<string, []> var_63_cast_fp16_cast_int16_output_indices_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_cast_int16_output_indices_dtype_0"), val = tensor<string, []>("uint16")];
+         tensor<fp16, [1, 1, 1, 64]> var_63_cast_fp16_cast_int16_0, tensor<uint16, [1, 1, 1, 64]> var_63_cast_fp16_cast_int16_1 = topk(ascending = var_63_ascending_0, axis = var_63_axis_0, k = var_59, output_indices_dtype = var_63_cast_fp16_cast_int16_output_indices_dtype_0, return_indices = var_63_return_indices_0, sort = var_63_sort_0, x = linear_2_cast_fp16)[name = tensor<string, []>("op_63_cast_fp16_cast_int16")];
+         tensor<string, []> var_63_cast_fp16_cast_int16_1_to_int32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_cast_int16_1_to_int32_dtype_0"), val = tensor<string, []>("int32")];
+         tensor<string, []> var_63_cast_fp16_0_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_0_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
+         tensor<fp32, [1, 1, 1, 64]> top_k_logits = cast(dtype = var_63_cast_fp16_0_to_fp32_dtype_0, x = var_63_cast_fp16_cast_int16_0)[name = tensor<string, []>("cast_3")];
+         tensor<int32, [1, 1, 1, 64]> top_k_ids = cast(dtype = var_63_cast_fp16_cast_int16_1_to_int32_dtype_0, x = var_63_cast_fp16_cast_int16_1)[name = tensor<string, []>("cast_4")];
+         tensor<fp32, [1, 1, 1]> token_prob = cast(dtype = var_58_cast_fp16_to_fp32_dtype_0, x = var_58_cast_fp16)[name = tensor<string, []>("cast_5")];
+     } -> (token_id, token_prob, top_k_ids, top_k_logits);
+ }
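
Read end-to-end, the program is exactly the traced decision head: two fp16 linear projections (512→640 and 640→640), a broadcast add and ReLU, an output linear to 1027 logits (1026 tokens + blank), then argmax, softmax + gather, and top-64. A NumPy restatement for readability only; the weight arguments here are placeholders, not the blob layout:

import numpy as np

def joint_decision_step(enc_step, dec_step, W_enc, b_enc, W_pred, b_pred, W_out, b_out, k=64):
    """Mirror of the MIL graph above: project, add, ReLU, project, decide."""
    e = np.transpose(enc_step, (0, 2, 1)) @ W_enc.T + b_enc    # [1, 1, 640]
    d = np.transpose(dec_step, (0, 2, 1)) @ W_pred.T + b_pred  # [1, 1, 640]
    x = np.maximum(e[:, :, None, :] + d[:, None, :, :], 0.0)   # [1, 1, 1, 640] after ReLU
    logits = x @ W_out.T + b_out                               # [1, 1, 1, 1027]
    token_id = logits.argmax(-1)                               # [1, 1, 1]
    p = np.exp(logits - logits.max(-1, keepdims=True))
    p /= p.sum(-1, keepdims=True)                              # softmax over vocab + blank
    token_prob = np.take_along_axis(p, token_id[..., None], -1).squeeze(-1)
    top_ids = np.argsort(-logits, -1)[..., :k]                 # descending top-64
    top_logits = np.take_along_axis(logits, top_ids, -1)
    return token_id.astype(np.int32), token_prob, top_ids.astype(np.int32), top_logits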
160ms/joint_decision.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7039b2010a269153f5a96edf28637f921a86ef8822f248f2d6712f7a6bce84b4
+ size 2794182
160ms/joint_decision.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:25d4d7be6eeb60c7de1d3a1278a5a4700cbe34017e1a8c1cab33204ddb2e4d5e
+ size 8701
160ms/joint_decision.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7039b2010a269153f5a96edf28637f921a86ef8822f248f2d6712f7a6bce84b4
+ size 2794182
160ms/joint_decision.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "fileFormatVersion": "1.0.0",
+   "itemInfoEntries": {
+     "634E266B-4447-41D3-879E-F3611888F54B": {
+       "author": "com.apple.CoreML",
+       "description": "CoreML Model Weights",
+       "name": "weights",
+       "path": "com.apple.CoreML/weights"
+     },
+     "C7F40527-180B-45CD-BC12-4F054F2E5D9A": {
+       "author": "com.apple.CoreML",
+       "description": "CoreML Model Specification",
+       "name": "model.mlmodel",
+       "path": "com.apple.CoreML/model.mlmodel"
+     }
+   },
+   "rootModelIdentifier": "C7F40527-180B-45CD-BC12-4F054F2E5D9A"
+ }
160ms/parakeet_eou_preprocessor.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4ada8b0b99ac1d2ba7acbffacfbbf1a06cb69d30e9410d237ee0aa4c2b0ad63
+ size 243