Commit 1b8ea0e
0 parent(s)
Duplicate from FluidInference/parakeet-realtime-eou-120m-coreml
Co-authored-by: Alex Weng <alexwengg@users.noreply.huggingface.co>
Note: this view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete file list.
- .gitattributes +35 -0
- 1280ms/.DS_Store +0 -0
- 1280ms/convert_parakeet_eou.py +740 -0
- 1280ms/convert_streaming_encoder.py +193 -0
- 1280ms/decoder.mlmodelc/analytics/coremldata.bin +3 -0
- 1280ms/decoder.mlmodelc/coremldata.bin +3 -0
- 1280ms/decoder.mlmodelc/metadata.json +118 -0
- 1280ms/decoder.mlmodelc/model.mil +45 -0
- 1280ms/decoder.mlmodelc/weights/weight.bin +3 -0
- 1280ms/individual_components.py +250 -0
- 1280ms/joint_decision.mlmodelc/analytics/coremldata.bin +3 -0
- 1280ms/joint_decision.mlmodelc/coremldata.bin +3 -0
- 1280ms/joint_decision.mlmodelc/metadata.json +112 -0
- 1280ms/joint_decision.mlmodelc/model.mil +57 -0
- 1280ms/joint_decision.mlmodelc/weights/weight.bin +3 -0
- 1280ms/parakeet_eou_preprocessor.mlmodelc/analytics/coremldata.bin +3 -0
- 1280ms/parakeet_eou_preprocessor.mlmodelc/coremldata.bin +3 -0
- 1280ms/parakeet_eou_preprocessor.mlmodelc/metadata.json +105 -0
- 1280ms/parakeet_eou_preprocessor.mlmodelc/model.mil +96 -0
- 1280ms/parakeet_eou_preprocessor.mlmodelc/weights/weight.bin +3 -0
- 1280ms/streaming_encoder.mlmodelc/analytics/coremldata.bin +3 -0
- 1280ms/streaming_encoder.mlmodelc/coremldata.bin +3 -0
- 1280ms/streaming_encoder.mlmodelc/metadata.json +187 -0
- 1280ms/streaming_encoder.mlmodelc/model.mil +0 -0
- 1280ms/streaming_encoder.mlmodelc/weights/weight.bin +3 -0
- 1280ms/streaming_encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- 1280ms/streaming_encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- 1280ms/streaming_encoder.mlpackage/Manifest.json +18 -0
- 1280ms/vocab.json +1028 -0
- 160ms/.DS_Store +0 -0
- 160ms/convert_parakeet_eou.py +740 -0
- 160ms/convert_streaming_encoder.py +193 -0
- 160ms/decoder.mlmodelc/analytics/coremldata.bin +3 -0
- 160ms/decoder.mlmodelc/coremldata.bin +3 -0
- 160ms/decoder.mlmodelc/metadata.json +118 -0
- 160ms/decoder.mlmodelc/model.mil +45 -0
- 160ms/decoder.mlmodelc/weights/weight.bin +3 -0
- 160ms/decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- 160ms/decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- 160ms/decoder.mlpackage/Manifest.json +18 -0
- 160ms/individual_components.py +250 -0
- 160ms/joint_decision.mlmodelc/analytics/coremldata.bin +3 -0
- 160ms/joint_decision.mlmodelc/coremldata.bin +3 -0
- 160ms/joint_decision.mlmodelc/metadata.json +112 -0
- 160ms/joint_decision.mlmodelc/model.mil +57 -0
- 160ms/joint_decision.mlmodelc/weights/weight.bin +3 -0
- 160ms/joint_decision.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- 160ms/joint_decision.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- 160ms/joint_decision.mlpackage/Manifest.json +18 -0
- 160ms/parakeet_eou_preprocessor.mlmodelc/analytics/coremldata.bin +3 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
1280ms/.DS_Store
ADDED
Binary file (6.15 kB).
1280ms/convert_parakeet_eou.py
ADDED
@@ -0,0 +1,740 @@
#!/usr/bin/env python3
"""CLI for exporting Parakeet Realtime EOU 120M components to CoreML.

This model is a cache-aware streaming FastConformer-RNNT model optimized for
low-latency speech recognition with end-of-utterance detection.

Key differences from Parakeet TDT v3:
- Smaller model (120M vs 600M params)
- No duration outputs (standard RNNT, not TDT)
- Cache-aware streaming encoder (17 layers, attention context [70,1])
- Special <EOU> token for end-of-utterance detection
- Optimized for 80-160ms latency

Reference: https://huggingface.co/nvidia/parakeet_realtime_eou_120m-v1
"""
from __future__ import annotations

import json
from dataclasses import asdict
from pathlib import Path
from typing import Dict, Optional, Tuple

import coremltools as ct
import numpy as np
import soundfile as sf
import torch
import typer

import nemo.collections.asr as nemo_asr

from individual_components import (
    DecoderWrapper,
    EncoderWrapper,
    ExportSettings,
    JointWrapper,
    JointDecisionWrapper,
    JointDecisionSingleStep,
    PreprocessorWrapper,
    MelEncoderWrapper,
    _coreml_convert,
)


def apply_stft_patch():
    # Monkey patch coremltools.stft to handle extra arguments from newer torch versions
    try:
        import coremltools.converters.mil.frontend.torch.ops as torch_ops

        _original_stft = torch_ops.stft

        def patched_stft(context, node):
            if len(node.inputs) > 8:
                node.inputs = node.inputs[:8]
            return _original_stft(context, node)

        torch_ops.stft = patched_stft
        if "stft" in torch_ops._TORCH_OPS_REGISTRY:
            torch_ops._TORCH_OPS_REGISTRY["stft"] = patched_stft
        print("Monkey patched coremltools.stft for compatibility.")
    except Exception as e:
        print(f"Warning: Could not monkey patch stft: {e}")


DEFAULT_MODEL_ID = "nvidia/parakeet_realtime_eou_120m-v1"
AUTHOR = "Fluid Inference"


def _compute_length(seconds: float, sample_rate: int) -> int:
    return int(round(seconds * sample_rate))


def _prepare_audio(
    validation_audio: Optional[Path],
    sample_rate: int,
    max_samples: int,
    seed: Optional[int],
) -> torch.Tensor:
    if validation_audio is None:
        if seed is not None:
            torch.manual_seed(seed)
        audio = torch.randn(1, max_samples, dtype=torch.float32)
        return audio

    data, sr = sf.read(str(validation_audio), dtype="float32")
    if sr != sample_rate:
        raise typer.BadParameter(
            f"Validation audio sample rate {sr} does not match model rate {sample_rate}"
        )

    if data.ndim > 1:
        data = data[:, 0]

    if data.size == 0:
        raise typer.BadParameter("Validation audio is empty")

    if data.size < max_samples:
        pad_width = max_samples - data.size
        data = np.pad(data, (0, pad_width))
    elif data.size > max_samples:
        data = data[:max_samples]

    audio = torch.from_numpy(data).unsqueeze(0).to(dtype=torch.float32)
    return audio


def _save_mlpackage(model: ct.models.MLModel, path: Path, description: str) -> None:
    try:
        model.minimum_deployment_target = ct.target.iOS17
    except Exception:
        pass
    model.short_description = description
    model.author = AUTHOR
    path.parent.mkdir(parents=True, exist_ok=True)
    model.save(str(path))


def _tensor_shape(tensor: torch.Tensor) -> Tuple[int, ...]:
    return tuple(int(dim) for dim in tensor.shape)


def _parse_compute_units(name: str) -> ct.ComputeUnit:
    """Parse a human-friendly compute units string into ct.ComputeUnit."""
    normalized = str(name).strip().upper()
    mapping = {
        "ALL": ct.ComputeUnit.ALL,
        "CPU_ONLY": ct.ComputeUnit.CPU_ONLY,
        "CPU_AND_GPU": ct.ComputeUnit.CPU_AND_GPU,
        "CPU_AND_NE": ct.ComputeUnit.CPU_AND_NE,
        "CPU_AND_NEURALENGINE": ct.ComputeUnit.CPU_AND_NE,
    }
    if normalized not in mapping:
        raise typer.BadParameter(
            f"Unknown compute units '{name}'. Choose from: " + ", ".join(mapping.keys())
        )
    return mapping[normalized]


def _parse_compute_precision(name: Optional[str]) -> Optional[ct.precision]:
    """Parse compute precision string into ct.precision or None."""
    if name is None:
        return None
    normalized = str(name).strip().upper()
    if normalized == "":
        return None
    mapping = {
        "FLOAT32": ct.precision.FLOAT32,
        "FLOAT16": ct.precision.FLOAT16,
    }
    if normalized not in mapping:
        raise typer.BadParameter(
            f"Unknown compute precision '{name}'. Choose from: "
            + ", ".join(mapping.keys())
        )
    return mapping[normalized]


app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)


@app.command()
def convert(
    nemo_path: Optional[Path] = typer.Option(
        None,
        "--nemo-path",
        exists=True,
        resolve_path=True,
        help="Path to parakeet_realtime_eou_120m-v1.nemo checkpoint (skip to auto-download)",
    ),
    model_id: str = typer.Option(
        DEFAULT_MODEL_ID,
        "--model-id",
        help="Model identifier to download when --nemo-path is omitted",
    ),
    output_dir: Path = typer.Option(
        Path("parakeet_eou_coreml"),
        help="Directory where mlpackages and metadata will be written",
    ),
    preprocessor_cu: str = typer.Option(
        "CPU_ONLY",
        "--preprocessor-cu",
        help="Compute units for preprocessor (default CPU_ONLY)",
    ),
    mel_encoder_cu: str = typer.Option(
        "CPU_ONLY",
        "--mel-encoder-cu",
        help="Compute units for fused mel+encoder (default CPU_ONLY)",
    ),
    compute_precision: Optional[str] = typer.Option(
        None,
        "--compute-precision",
        help="Export precision: FLOAT32 (default) or FLOAT16 to shrink non-quantized weights.",
    ),
    max_audio_seconds: float = typer.Option(
        15.0,
        "--max-audio-seconds",
        help="Maximum audio duration in seconds for the fixed window export",
    ),
    validation_audio: Optional[Path] = typer.Option(
        None,
        "--validation-audio",
        exists=True,
        resolve_path=True,
        help="Path to a 16kHz WAV file for tracing (uses random if not provided)",
    ),
) -> None:
    """Export all Parakeet Realtime EOU sub-modules to CoreML.

    This exports the cache-aware streaming FastConformer-RNNT model for
    low-latency speech recognition with end-of-utterance detection.
    """
    export_settings = ExportSettings(
        output_dir=output_dir,
        compute_units=ct.ComputeUnit.CPU_ONLY,
        deployment_target=ct.target.iOS17,
        compute_precision=_parse_compute_precision(compute_precision),
        max_audio_seconds=max_audio_seconds,
        max_symbol_steps=1,
    )

    typer.echo("Export configuration:")
    typer.echo(asdict(export_settings))

    output_dir.mkdir(parents=True, exist_ok=True)
    pre_cu = _parse_compute_units(preprocessor_cu)
    melenc_cu = _parse_compute_units(mel_encoder_cu)

    if nemo_path is not None:
        typer.echo(f"Loading NeMo model from {nemo_path}…")
        # Try loading as generic ASRModel first, then specific class
        try:
            asr_model = nemo_asr.models.ASRModel.restore_from(
                str(nemo_path), map_location="cpu"
            )
        except Exception:
            # Fallback to EncDecRNNTBPEModel
            asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
                str(nemo_path), map_location="cpu"
            )
        checkpoint_meta = {
            "type": "file",
            "path": str(nemo_path),
        }
    else:
        typer.echo(f"Downloading NeMo model via {model_id}…")
        # Use ASRModel.from_pretrained as recommended for this model
        try:
            asr_model = nemo_asr.models.ASRModel.from_pretrained(
                model_id, map_location="cpu"
            )
        except Exception:
            # Fallback to EncDecRNNTBPEModel
            asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
                model_id, map_location="cpu"
            )
        checkpoint_meta = {
            "type": "pretrained",
            "model_id": model_id,
        }
    asr_model.eval()

    # Print model info
    typer.echo(f"Model class: {type(asr_model).__name__}")
    typer.echo(f"Encoder class: {type(asr_model.encoder).__name__}")

    sample_rate = int(asr_model.cfg.preprocessor.sample_rate)
    max_samples = _compute_length(export_settings.max_audio_seconds, sample_rate)

    # Prepare audio for tracing
    if validation_audio is not None:
        typer.echo(f"Using validation audio: {validation_audio}")
        audio_tensor = _prepare_audio(validation_audio, sample_rate, max_samples, seed=None)
    else:
        typer.echo("Using random audio for tracing (seed=42)")
        audio_tensor = _prepare_audio(None, sample_rate, max_samples, seed=42)

    audio_length = torch.tensor([max_samples], dtype=torch.int32)

    preprocessor = PreprocessorWrapper(asr_model.preprocessor.eval())
    encoder = EncoderWrapper(asr_model.encoder.eval())
    decoder = DecoderWrapper(asr_model.decoder.eval())
    joint = JointWrapper(asr_model.joint.eval())

    decoder_export_flag = getattr(asr_model.decoder, "_rnnt_export", False)
    asr_model.decoder._rnnt_export = True

    try:
        with torch.no_grad():
            mel_ref, mel_length_ref = preprocessor(audio_tensor, audio_length)
            mel_length_ref = mel_length_ref.to(dtype=torch.int32)
            encoder_ref, encoder_length_ref, frame_times_ref = encoder(
                mel_ref, mel_length_ref
            )
            encoder_length_ref = encoder_length_ref.to(dtype=torch.int32)

        # Clone tensors to drop inference flags
        mel_ref = mel_ref.clone().detach()
        mel_length_ref = mel_length_ref.clone().detach()
        encoder_ref = encoder_ref.clone().detach()
        encoder_length_ref = encoder_length_ref.clone().detach()
        frame_times_ref = frame_times_ref.clone().detach()

        vocab_size = int(asr_model.tokenizer.vocab_size)
        decoder_hidden = int(asr_model.decoder.pred_hidden)
        decoder_layers = int(asr_model.decoder.pred_rnn_layers)

        # Check if model has extra outputs (TDT-style duration)
        num_extra = getattr(asr_model.joint, "num_extra_outputs", 0)
        typer.echo(f"Vocab size: {vocab_size}, num_extra_outputs: {num_extra}")

        targets = torch.full(
            (1, export_settings.max_symbol_steps),
            fill_value=asr_model.decoder.blank_idx,
            dtype=torch.int32,
        )
        target_lengths = torch.tensor(
            [export_settings.max_symbol_steps], dtype=torch.int32
        )
        zero_state = torch.zeros(
            decoder_layers,
            1,
            decoder_hidden,
            dtype=torch.float32,
        )

        with torch.no_grad():
            decoder_ref, h_ref, c_ref = decoder(
                targets, target_lengths, zero_state, zero_state
            )
            joint_ref = joint(encoder_ref, decoder_ref)

        decoder_ref = decoder_ref.clone()
        h_ref = h_ref.clone()
        c_ref = c_ref.clone()
        joint_ref = joint_ref.clone()

        typer.echo(f"Encoder output shape: {encoder_ref.shape}")
        typer.echo(f"Decoder output shape: {decoder_ref.shape}")
        typer.echo(f"Joint output shape: {joint_ref.shape}")

        # === Export Preprocessor ===
        typer.echo("Tracing and converting preprocessor…")
        preprocessor = preprocessor.cpu()
        audio_tensor = audio_tensor.cpu()
        audio_length = audio_length.cpu()
        traced_preprocessor = torch.jit.trace(
            preprocessor, (audio_tensor, audio_length), strict=False
        )
        traced_preprocessor.eval()
        preprocessor_inputs = [
            ct.TensorType(
                name="audio_signal",
                shape=(1, ct.RangeDim(1, max_samples)),
                dtype=np.float32,
            ),
            ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
        ]
        preprocessor_outputs = [
            ct.TensorType(name="mel", dtype=np.float32),
            ct.TensorType(name="mel_length", dtype=np.int32),
        ]
        preprocessor_model = _coreml_convert(
            traced_preprocessor,
            preprocessor_inputs,
            preprocessor_outputs,
            export_settings,
            compute_units_override=pre_cu,
        )
        preprocessor_path = output_dir / "parakeet_eou_preprocessor.mlpackage"
        _save_mlpackage(
            preprocessor_model,
            preprocessor_path,
            f"Parakeet EOU preprocessor ({max_audio_seconds}s window)",
        )

        # === Export Encoder ===
        typer.echo("Tracing and converting encoder…")
        traced_encoder = torch.jit.trace(
            encoder, (mel_ref, mel_length_ref), strict=False
        )
        traced_encoder.eval()
        encoder_inputs = [
            ct.TensorType(
                name="mel", shape=_tensor_shape(mel_ref), dtype=np.float32
            ),
            ct.TensorType(name="mel_length", shape=(1,), dtype=np.int32),
        ]
        encoder_outputs = [
            ct.TensorType(name="encoder", dtype=np.float32),
            ct.TensorType(name="encoder_length", dtype=np.int32),
            ct.TensorType(name="frame_times", dtype=np.float32),
        ]
        encoder_model = _coreml_convert(
            traced_encoder,
            encoder_inputs,
            encoder_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        encoder_path = output_dir / "parakeet_eou_encoder.mlpackage"
        _save_mlpackage(
            encoder_model,
            encoder_path,
            f"Parakeet EOU encoder ({max_audio_seconds}s window)",
        )

        # === Export Fused Mel+Encoder ===
        typer.echo("Tracing and converting fused mel+encoder…")
        mel_encoder = MelEncoderWrapper(preprocessor, encoder)
        traced_mel_encoder = torch.jit.trace(
            mel_encoder, (audio_tensor, audio_length), strict=False
        )
        traced_mel_encoder.eval()
        mel_encoder_inputs = [
            ct.TensorType(
                name="audio_signal", shape=(1, max_samples), dtype=np.float32
            ),
            ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
        ]
        mel_encoder_outputs = [
            ct.TensorType(name="encoder", dtype=np.float32),
            ct.TensorType(name="encoder_length", dtype=np.int32),
            ct.TensorType(name="frame_times", dtype=np.float32),
        ]
        mel_encoder_model = _coreml_convert(
            traced_mel_encoder,
            mel_encoder_inputs,
            mel_encoder_outputs,
            export_settings,
            compute_units_override=melenc_cu,
        )
        mel_encoder_path = output_dir / "parakeet_eou_mel_encoder.mlpackage"
        _save_mlpackage(
            mel_encoder_model,
            mel_encoder_path,
            f"Parakeet EOU fused Mel+Encoder ({max_audio_seconds}s window)",
        )

        # === Export Decoder ===
        typer.echo("Tracing and converting decoder…")
        traced_decoder = torch.jit.trace(
            decoder,
            (targets, target_lengths, zero_state, zero_state),
            strict=False,
        )
        traced_decoder.eval()
        decoder_inputs = [
            ct.TensorType(
                name="targets", shape=_tensor_shape(targets), dtype=np.int32
            ),
            ct.TensorType(name="target_length", shape=(1,), dtype=np.int32),
            ct.TensorType(
                name="h_in", shape=_tensor_shape(zero_state), dtype=np.float32
            ),
            ct.TensorType(
                name="c_in", shape=_tensor_shape(zero_state), dtype=np.float32
            ),
        ]
        decoder_outputs = [
            ct.TensorType(name="decoder", dtype=np.float32),
            ct.TensorType(name="h_out", dtype=np.float32),
            ct.TensorType(name="c_out", dtype=np.float32),
        ]
        decoder_model = _coreml_convert(
            traced_decoder,
            decoder_inputs,
            decoder_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        decoder_path = output_dir / "parakeet_eou_decoder.mlpackage"
        _save_mlpackage(
            decoder_model,
            decoder_path,
            "Parakeet EOU decoder (RNNT prediction network)",
        )

        # === Export Joint ===
        typer.echo("Tracing and converting joint…")
        traced_joint = torch.jit.trace(
            joint,
            (encoder_ref, decoder_ref),
            strict=False,
        )
        traced_joint.eval()
        joint_inputs = [
            ct.TensorType(
                name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32
            ),
            ct.TensorType(
                name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32
            ),
        ]
        joint_outputs = [
            ct.TensorType(name="logits", dtype=np.float32),
        ]
        joint_model = _coreml_convert(
            traced_joint,
            joint_inputs,
            joint_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        joint_path = output_dir / "parakeet_eou_joint.mlpackage"
        _save_mlpackage(
            joint_model,
            joint_path,
            "Parakeet EOU joint network (RNNT)",
        )

        # === Export Joint Decision Head ===
        typer.echo("Tracing and converting joint decision head…")
        joint_decision = JointDecisionWrapper(joint, vocab_size=vocab_size)
        traced_joint_decision = torch.jit.trace(
            joint_decision,
            (encoder_ref, decoder_ref),
            strict=False,
        )
        traced_joint_decision.eval()
        joint_decision_inputs = [
            ct.TensorType(
                name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32
            ),
            ct.TensorType(
                name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32
            ),
        ]
        joint_decision_outputs = [
            ct.TensorType(name="token_id", dtype=np.int32),
            ct.TensorType(name="token_prob", dtype=np.float32),
        ]
        joint_decision_model = _coreml_convert(
            traced_joint_decision,
            joint_decision_inputs,
            joint_decision_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        joint_decision_path = output_dir / "parakeet_eou_joint_decision.mlpackage"
        _save_mlpackage(
            joint_decision_model,
            joint_decision_path,
            "Parakeet EOU joint + decision head (softmax, argmax)",
        )

        # === Export Single-Step Joint Decision ===
        typer.echo("Tracing and converting single-step joint decision…")
        jd_single = JointDecisionSingleStep(joint, vocab_size=vocab_size)
        # Create single-step slices from refs
        enc_step = encoder_ref[:, :, :1].contiguous()
        dec_step = decoder_ref[:, :, :1].contiguous()
        traced_jd_single = torch.jit.trace(
            jd_single,
            (enc_step, dec_step),
            strict=False,
        )
        traced_jd_single.eval()
        jd_single_inputs = [
            ct.TensorType(
                name="encoder_step",
                shape=(1, enc_step.shape[1], 1),
                dtype=np.float32,
            ),
            ct.TensorType(
                name="decoder_step",
                shape=(1, dec_step.shape[1], 1),
                dtype=np.float32,
            ),
        ]
        jd_single_outputs = [
            ct.TensorType(name="token_id", dtype=np.int32),
            ct.TensorType(name="token_prob", dtype=np.float32),
            ct.TensorType(name="top_k_ids", dtype=np.int32),
            ct.TensorType(name="top_k_logits", dtype=np.float32),
        ]
        jd_single_model = _coreml_convert(
            traced_jd_single,
            jd_single_inputs,
            jd_single_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        jd_single_path = output_dir / "parakeet_eou_joint_decision_single_step.mlpackage"
        _save_mlpackage(
            jd_single_model,
            jd_single_path,
            "Parakeet EOU single-step joint decision (current frame)",
        )

        # === Save Metadata ===
        metadata: Dict[str, object] = {
            "model_id": model_id,
            "model_name": "parakeet_realtime_eou_120m-v1",
            "model_class": type(asr_model).__name__,
            "encoder_class": type(asr_model.encoder).__name__,
            "sample_rate": sample_rate,
            "max_audio_seconds": export_settings.max_audio_seconds,
            "max_audio_samples": max_samples,
            "max_symbol_steps": export_settings.max_symbol_steps,
            "vocab_size": vocab_size,
            "vocab_with_blank": vocab_size + 1,
            "decoder_hidden": decoder_hidden,
            "decoder_layers": decoder_layers,
            "num_extra_outputs": num_extra,
            "has_eou_token": True,
            "checkpoint": checkpoint_meta,
            "coreml": {
                "compute_units": export_settings.compute_units.name,
                "compute_precision": (
                    export_settings.compute_precision.name
                    if export_settings.compute_precision is not None
                    else "FLOAT32"
                ),
            },
            "components": {
                "preprocessor": {
                    "inputs": {
                        "audio_signal": [1, max_samples],
                        "audio_length": [1],
                    },
                    "outputs": {
                        "mel": list(_tensor_shape(mel_ref)),
                        "mel_length": [1],
                    },
                    "path": preprocessor_path.name,
                },
                "encoder": {
                    "inputs": {
                        "mel": list(_tensor_shape(mel_ref)),
                        "mel_length": [1],
                    },
                    "outputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "encoder_length": [1],
                        "frame_times": [1, _tensor_shape(encoder_ref)[2]],
                    },
                    "path": encoder_path.name,
                },
                "mel_encoder": {
                    "inputs": {
                        "audio_signal": [1, max_samples],
                        "audio_length": [1],
                    },
                    "outputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "encoder_length": [1],
                        "frame_times": [1, _tensor_shape(encoder_ref)[2]],
                    },
                    "path": mel_encoder_path.name,
                },
                "decoder": {
                    "inputs": {
                        "targets": list(_tensor_shape(targets)),
                        "target_length": [1],
                        "h_in": list(_tensor_shape(zero_state)),
                        "c_in": list(_tensor_shape(zero_state)),
                    },
                    "outputs": {
                        "decoder": list(_tensor_shape(decoder_ref)),
                        "h_out": list(_tensor_shape(h_ref)),
                        "c_out": list(_tensor_shape(c_ref)),
                    },
                    "path": decoder_path.name,
                },
                "joint": {
                    "inputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "decoder": list(_tensor_shape(decoder_ref)),
                    },
                    "outputs": {
                        "logits": list(_tensor_shape(joint_ref)),
                    },
                    "path": joint_path.name,
                },
                "joint_decision": {
                    "inputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "decoder": list(_tensor_shape(decoder_ref)),
                    },
                    "outputs": {
                        "token_id": [
                            _tensor_shape(encoder_ref)[0],
                            _tensor_shape(encoder_ref)[2],
                            _tensor_shape(decoder_ref)[2],
                        ],
                        "token_prob": [
                            _tensor_shape(encoder_ref)[0],
                            _tensor_shape(encoder_ref)[2],
                            _tensor_shape(decoder_ref)[2],
                        ],
                    },
                    "path": joint_decision_path.name,
                },
                "joint_decision_single_step": {
                    "inputs": {
                        "encoder_step": [1, _tensor_shape(encoder_ref)[1], 1],
                        "decoder_step": [1, _tensor_shape(decoder_ref)[1], 1],
                    },
                    "outputs": {
                        "token_id": [1, 1, 1],
                        "token_prob": [1, 1, 1],
                        "top_k_ids": [1, 1, 1, 64],
                        "top_k_logits": [1, 1, 1, 64],
                    },
                    "path": jd_single_path.name,
                },
            },
        }

        # Export tokenizer vocab if available
        try:
            tokenizer = asr_model.tokenizer
            vocab = {
                "blank_id": int(asr_model.decoder.blank_idx),
                "vocab_size": vocab_size,
            }
            # Try to get special tokens
            if hasattr(tokenizer, "tokenizer"):
                inner_tokenizer = tokenizer.tokenizer
                if hasattr(inner_tokenizer, "get_vocab"):
                    full_vocab = inner_tokenizer.get_vocab()
                    # Find EOU token
                    eou_token = None
                    for token, idx in full_vocab.items():
                        if "<EOU>" in token.upper() or "eou" in token.lower():
                            eou_token = {"token": token, "id": idx}
                            break
                    if eou_token:
                        vocab["eou_token"] = eou_token
            metadata["tokenizer"] = vocab
        except Exception as e:
            typer.echo(f"Warning: Could not export tokenizer info: {e}")

        metadata_path = output_dir / "metadata.json"
        metadata_path.write_text(json.dumps(metadata, indent=2))
        typer.echo(f"\nExport complete. Metadata written to {metadata_path}")
        typer.echo(f"Output directory: {output_dir}")

    finally:
        asr_model.decoder._rnnt_export = decoder_export_flag


if __name__ == "__main__":
    app()
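Editor's note (not part of the commit): the script above exports the sub-modules separately; the following is a minimal sketch of how the exported packages could compose into a greedy RNNT step loop with at most one symbol per frame (matching max_symbol_steps=1). It assumes the default --output-dir, that metadata.json contains the optional "tokenizer" block, and that predictions run on macOS; the dummy zero audio is only a placeholder.

import json
import coremltools as ct
import numpy as np

root = "parakeet_eou_coreml"  # assumption: default --output-dir
meta = json.loads(open(f"{root}/metadata.json").read())
blank_id = meta["tokenizer"]["blank_id"]  # assumption: tokenizer block was written

mel_encoder = ct.models.MLModel(f"{root}/parakeet_eou_mel_encoder.mlpackage")
decoder = ct.models.MLModel(f"{root}/parakeet_eou_decoder.mlpackage")
jd = ct.models.MLModel(f"{root}/parakeet_eou_joint_decision_single_step.mlpackage")

# Dummy fixed-window input; real audio would come from the preprocessing pipeline.
audio = np.zeros((1, meta["max_audio_samples"]), dtype=np.float32)
enc_out = mel_encoder.predict({
    "audio_signal": audio,
    "audio_length": np.array([audio.shape[1]], dtype=np.int32),
})
enc = enc_out["encoder"]                      # [1, D, T]
T = int(enc_out["encoder_length"][0])

h = np.zeros((meta["decoder_layers"], 1, meta["decoder_hidden"]), dtype=np.float32)
c = np.zeros_like(h)
last = blank_id
tokens = []
for t in range(T):
    # Prediction network for (last token, state); unchanged while blanks are emitted.
    dec_out = decoder.predict({
        "targets": np.array([[last]], dtype=np.int32),
        "target_length": np.array([1], dtype=np.int32),
        "h_in": h,
        "c_in": c,
    })
    step = jd.predict({
        "encoder_step": enc[:, :, t:t + 1],
        "decoder_step": dec_out["decoder"],
    })
    tok = int(step["token_id"].reshape(-1)[0])
    if tok != blank_id:  # advance decoder state only on a non-blank emission
        tokens.append(tok)
        last = tok
        h, c = dec_out["h_out"], dec_out["c_out"]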
1280ms/convert_streaming_encoder.py
ADDED
@@ -0,0 +1,193 @@
import torch
import torch.nn as nn
import coremltools as ct
import numpy as np
import typer
import argparse
import json
import shutil
from pathlib import Path
from typing import Tuple, List, Optional

from nemo.collections.asr.models import EncDecRNNTBPEModel

app = typer.Typer()

class LoopbackEncoderWrapper(nn.Module):
    """
    Wraps the entire Parakeet Encoder (PreEncode + Conformer) for CoreML Loopback Streaming.

    Inputs:
    - audio_signal: [B, D, T] (Mel spectrogram chunk)
    - audio_length: [B]
    - pre_cache: [B, D, pre_cache_size] (Previous audio context)
    - cache_last_channel: [layers, B, cache_size, hidden]
    - cache_last_time: [layers, B, hidden, time_cache]
    - cache_last_channel_len: [B]

    Outputs:
    - encoded_output: [B, D_out, T_out]
    - encoded_length: [B]
    - new_pre_cache: [B, D, pre_cache_size]
    - new_cache_last_channel
    - new_cache_last_time
    - new_cache_last_channel_len
    """

    def __init__(self, encoder, pre_cache_size=16):
        super().__init__()
        self.encoder = encoder
        self.pre_cache_size = pre_cache_size

    def forward(
        self,
        audio_signal: torch.Tensor,
        audio_length: torch.Tensor,
        pre_cache: torch.Tensor,
        cache_last_channel: torch.Tensor,
        cache_last_time: torch.Tensor,
        cache_last_channel_len: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        # 1. Prepend pre_cache to audio_signal
        # audio_signal: [B, D, T], pre_cache: [B, D, T_cache]
        full_input = torch.cat([pre_cache, audio_signal], dim=2)
        full_length = audio_length + self.pre_cache_size

        # 2. Extract the NEW pre_cache (last N frames of full_input).
        # Done BEFORE processing because we want the raw audio context.
        new_pre_cache = full_input[:, :, -self.pre_cache_size:]

        # 3. Run the cache-aware streaming step with the previous caches.
        encoded, encoded_len, new_cache_channel, new_cache_time, new_cache_len = self.encoder.cache_aware_stream_step(
            processed_signal=full_input,
            processed_signal_length=full_length,
            cache_last_channel=cache_last_channel,
            cache_last_time=cache_last_time,
            cache_last_channel_len=cache_last_channel_len,
        )

        # 4. NeMo's cache_aware_stream_step returns the valid output frames for
        # the chunk. Because this wrapper is stateless, the past context must be
        # supplied explicitly, so passing (pre_cache + chunk) is correct: the
        # internal convolutions see the past through the prepended frames.

        # Cast lengths to Int32 for CoreML.
        encoded_len_32 = encoded_len.to(dtype=torch.int32)
        new_channel_len_32 = new_cache_len.to(dtype=torch.int32)

        return encoded, encoded_len_32, new_pre_cache, new_cache_channel, new_cache_time, new_channel_len_32

def _coreml_convert(
    traced_model,
    inputs,
    outputs,
    compute_units=ct.ComputeUnit.CPU_ONLY,
):
    return ct.convert(
        traced_model,
        inputs=inputs,
        outputs=outputs,
        compute_units=compute_units,
        minimum_deployment_target=ct.target.macOS14,
    )

def main():
    # Parse CLI arguments before the (expensive) model download.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--chunk-frames",
        type=int,
        default=17,
        help="Number of frames in the input chunk (e.g. 17 for 160ms, 129 for 1.28s)",
    )
    args = parser.parse_args()

    model_id: str = "nvidia/parakeet_realtime_eou_120m-v1"
    output_dir: str = "temp_swift_models/StreamingLoopback"
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"Loading model: {model_id}...")
    asr_model = EncDecRNNTBPEModel.from_pretrained(model_name=model_id)
    asr_model.eval()

    encoder = asr_model.encoder

    # --- Configuration ---
    # 160ms chunk = 16 frames (but the preprocessor produces 17 with padding/centering)
    # 1.28s chunk = 128 frames (the preprocessor produces 129)
    chunk_size_in = args.chunk_frames
    mel_dim = 128
    hidden_dim = encoder.d_model  # 512
    num_layers = len(encoder.layers)  # 17

    # Cache sizes
    cache_channel_size = 70
    cache_time_size = 8
    pre_cache_size = 16

    print(f"Config: Chunk={chunk_size_in}, Mel={mel_dim}, Hidden={hidden_dim}, Layers={num_layers}")
    print(f"Cache: Channel={cache_channel_size}, Time={cache_time_size}, Pre={pre_cache_size}")

    # --- Wrapper ---
    wrapper = LoopbackEncoderWrapper(encoder, pre_cache_size=pre_cache_size)
    wrapper.eval()

    # --- Test Inputs (for Tracing) ---
    batch_size = 1
    test_mel = torch.randn(batch_size, mel_dim, chunk_size_in)
    test_mel_len = torch.tensor([chunk_size_in], dtype=torch.int32)
    test_pre_cache = torch.zeros(batch_size, mel_dim, pre_cache_size)

    # Initial Cache (Zeros)
    test_cache_channel = torch.zeros(num_layers, batch_size, cache_channel_size, hidden_dim)
    test_cache_time = torch.zeros(num_layers, batch_size, hidden_dim, cache_time_size)
    test_cache_len = torch.zeros(batch_size, dtype=torch.int32)

    print("Tracing model...")
    traced_model = torch.jit.trace(
        wrapper,
        (test_mel, test_mel_len, test_pre_cache, test_cache_channel, test_cache_time, test_cache_len),
        strict=False,
    )

    # --- CoreML Conversion ---
    print("Converting to CoreML...")

    inputs = [
        ct.TensorType(name="audio_signal", shape=(1, 128, chunk_size_in), dtype=np.float32),
        ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
        ct.TensorType(name="pre_cache", shape=(1, 128, pre_cache_size), dtype=np.float32),
        ct.TensorType(name="cache_last_channel", shape=(num_layers, 1, cache_channel_size, hidden_dim), dtype=np.float32),
        ct.TensorType(name="cache_last_time", shape=(num_layers, 1, hidden_dim, cache_time_size), dtype=np.float32),
        ct.TensorType(name="cache_last_channel_len", shape=(1,), dtype=np.int32),
    ]

    outputs = [
        ct.TensorType(name="encoded_output", dtype=np.float32),
        ct.TensorType(name="encoded_length", dtype=np.int32),
        ct.TensorType(name="new_pre_cache", dtype=np.float32),
        ct.TensorType(name="new_cache_last_channel", dtype=np.float32),
        ct.TensorType(name="new_cache_last_time", dtype=np.float32),
        ct.TensorType(name="new_cache_last_channel_len", dtype=np.int32),
    ]

    mlmodel = _coreml_convert(traced_model, inputs, outputs)

    save_path = output_path / "streaming_encoder.mlpackage"
    mlmodel.save(str(save_path))
    print(f"Saved: {save_path}")

    # The preprocessor, decoder, and joint are exported separately
    # (see convert_parakeet_eou.py); this script only covers the encoder loopback.

if __name__ == "__main__":
    main()
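Editor's note (not part of the commit): a minimal usage sketch of the loopback pattern the wrapper targets, assuming the mlpackage above has been exported to the hardcoded output path and that predictions run on macOS. Random mel chunks stand in for real preprocessor output; the key point is that the returned caches are threaded back in on every call.

import coremltools as ct
import numpy as np

enc = ct.models.MLModel("temp_swift_models/StreamingLoopback/streaming_encoder.mlpackage")

num_layers, hidden, chunk_frames = 17, 512, 17  # values printed by the export script
state = {
    "pre_cache": np.zeros((1, 128, 16), dtype=np.float32),
    "cache_last_channel": np.zeros((num_layers, 1, 70, hidden), dtype=np.float32),
    "cache_last_time": np.zeros((num_layers, 1, hidden, 8), dtype=np.float32),
    "cache_last_channel_len": np.zeros((1,), dtype=np.int32),
}

for _ in range(4):  # four dummy 160 ms chunks; real chunks come from the preprocessor
    mel_chunk = np.random.randn(1, 128, chunk_frames).astype(np.float32)
    out = enc.predict({
        "audio_signal": mel_chunk,
        "audio_length": np.array([chunk_frames], dtype=np.int32),
        **state,
    })
    encoded = out["encoded_output"]  # [1, D_out, T_out] frames for this chunk
    state = {  # loop the updated caches back in for the next chunk
        "pre_cache": out["new_pre_cache"],
        "cache_last_channel": out["new_cache_last_channel"],
        "cache_last_time": out["new_cache_last_time"],
        "cache_last_channel_len": out["new_cache_last_channel_len"],
    }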
1280ms/decoder.mlmodelc/analytics/coremldata.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3996975a8cbc1949159c55605b3132b39b2484f51acbd55d796d93c70de02b49
size 243
1280ms/decoder.mlmodelc/coremldata.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c3ccbff963d8cf07e2be2bd56ea3384a89ea49628922c6bd95ff62e2ae57dc34
size 497
1280ms/decoder.mlmodelc/metadata.json
ADDED
@@ -0,0 +1,118 @@
[
  {
    "metadataOutputVersion" : "3.0",
    "shortDescription" : "Parakeet EOU decoder (RNNT prediction network)",
    "outputSchema" : [
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
        "shortDescription" : "",
        "shape" : "[1, 640, 1]",
        "name" : "decoder",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
        "shortDescription" : "",
        "shape" : "[1, 1, 640]",
        "name" : "h_out",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
        "shortDescription" : "",
        "shape" : "[1, 1, 640]",
        "name" : "c_out",
        "type" : "MultiArray"
      }
    ],
    "storagePrecision" : "Float16",
    "modelParameters" : [

    ],
    "author" : "Fluid Inference",
    "specificationVersion" : 8,
    "mlProgramOperationTypeHistogram" : {
      "Ios17.squeeze" : 2,
      "Ios17.gather" : 1,
      "Ios17.cast" : 6,
      "Ios17.lstm" : 1,
      "Ios17.transpose" : 2,
      "Identity" : 1,
      "Ios17.expandDims" : 2
    },
    "computePrecision" : "Mixed (Float16, Float32, Int16, Int32)",
    "isUpdatable" : "0",
    "stateSchema" : [

    ],
    "availability" : {
      "macOS" : "14.0",
      "tvOS" : "17.0",
      "visionOS" : "1.0",
      "watchOS" : "10.0",
      "iOS" : "17.0",
      "macCatalyst" : "17.0"
    },
    "modelType" : {
      "name" : "MLModelType_mlProgram"
    },
    "inputSchema" : [
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Int32",
        "formattedType" : "MultiArray (Int32 1 × 1)",
        "shortDescription" : "",
        "shape" : "[1, 1]",
        "name" : "targets",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Int32",
        "formattedType" : "MultiArray (Int32 1)",
        "shortDescription" : "",
        "shape" : "[1]",
        "name" : "target_length",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
        "shortDescription" : "",
        "shape" : "[1, 1, 640]",
        "name" : "h_in",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
        "shortDescription" : "",
        "shape" : "[1, 1, 640]",
        "name" : "c_in",
        "type" : "MultiArray"
      }
    ],
    "userDefinedMetadata" : {
      "com.github.apple.coremltools.version" : "8.3.0",
      "com.github.apple.coremltools.source" : "torch==2.4.0",
      "com.github.apple.coremltools.source_dialect" : "TorchScript"
    },
    "generatedClassName" : "parakeet_eou_decoder",
    "method" : "predict"
  }
]
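Editor's note (not part of the commit): a minimal sketch that loads this compiled decoder and checks its I/O against the schema above. It assumes the relative path as committed and a recent coremltools (CompiledMLModel is available from version 7 onward) running on macOS.

import coremltools as ct
import numpy as np

decoder = ct.models.CompiledMLModel("1280ms/decoder.mlmodelc")
out = decoder.predict({
    "targets": np.zeros((1, 1), dtype=np.int32),      # last emitted token id
    "target_length": np.ones((1,), dtype=np.int32),
    "h_in": np.zeros((1, 1, 640), dtype=np.float32),  # LSTM hidden state
    "c_in": np.zeros((1, 1, 640), dtype=np.float32),  # LSTM cell state
})
print({name: arr.shape for name, arr in out.items()})
# expected per the schema: decoder (1, 640, 1), h_out (1, 1, 640), c_out (1, 1, 640)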
1280ms/decoder.mlmodelc/model.mil
ADDED
@@ -0,0 +1,45 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
+{
+    func main<ios17>(tensor<fp32, [1, 1, 640]> c_in, tensor<fp32, [1, 1, 640]> h_in, tensor<int32, [1]> target_length, tensor<int32, [1, 1]> targets) {
+        tensor<int32, []> y_axis_0 = const()[name = tensor<string, []>("y_axis_0"), val = tensor<int32, []>(0)];
+        tensor<int32, []> y_batch_dims_0 = const()[name = tensor<string, []>("y_batch_dims_0"), val = tensor<int32, []>(0)];
+        tensor<bool, []> y_validate_indices_0 = const()[name = tensor<string, []>("y_validate_indices_0"), val = tensor<bool, []>(false)];
+        tensor<fp16, [1027, 640]> module_prediction_embed_weight_to_fp16 = const()[name = tensor<string, []>("module_prediction_embed_weight_to_fp16"), val = tensor<fp16, [1027, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+        tensor<string, []> targets_to_int16_dtype_0 = const()[name = tensor<string, []>("targets_to_int16_dtype_0"), val = tensor<string, []>("int16")];
+        tensor<int16, [1, 1]> targets_to_int16 = cast(dtype = targets_to_int16_dtype_0, x = targets)[name = tensor<string, []>("cast_8")];
+        tensor<fp16, [1, 1, 640]> y_cast_fp16_cast_uint16 = gather(axis = y_axis_0, batch_dims = y_batch_dims_0, indices = targets_to_int16, validate_indices = y_validate_indices_0, x = module_prediction_embed_weight_to_fp16)[name = tensor<string, []>("y_cast_fp16_cast_uint16")];
+        tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
+        tensor<int32, [1]> input_lstm_h0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_h0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
+        tensor<string, []> h_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("h_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
+        tensor<fp16, [1, 1, 640]> h_in_to_fp16 = cast(dtype = h_in_to_fp16_dtype_0, x = h_in)[name = tensor<string, []>("cast_7")];
+        tensor<fp16, [1, 640]> input_lstm_h0_squeeze_cast_fp16 = squeeze(axes = input_lstm_h0_squeeze_axes_0, x = h_in_to_fp16)[name = tensor<string, []>("input_lstm_h0_squeeze_cast_fp16")];
+        tensor<int32, [1]> input_lstm_c0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_c0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
+        tensor<string, []> c_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("c_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
+        tensor<fp16, [1, 1, 640]> c_in_to_fp16 = cast(dtype = c_in_to_fp16_dtype_0, x = c_in)[name = tensor<string, []>("cast_6")];
+        tensor<fp16, [1, 640]> input_lstm_c0_squeeze_cast_fp16 = squeeze(axes = input_lstm_c0_squeeze_axes_0, x = c_in_to_fp16)[name = tensor<string, []>("input_lstm_c0_squeeze_cast_fp16")];
+        tensor<string, []> input_direction_0 = const()[name = tensor<string, []>("input_direction_0"), val = tensor<string, []>("forward")];
+        tensor<bool, []> input_output_sequence_0 = const()[name = tensor<string, []>("input_output_sequence_0"), val = tensor<bool, []>(true)];
+        tensor<string, []> input_recurrent_activation_0 = const()[name = tensor<string, []>("input_recurrent_activation_0"), val = tensor<string, []>("sigmoid")];
+        tensor<string, []> input_cell_activation_0 = const()[name = tensor<string, []>("input_cell_activation_0"), val = tensor<string, []>("tanh")];
+        tensor<string, []> input_activation_0 = const()[name = tensor<string, []>("input_activation_0"), val = tensor<string, []>("tanh")];
+        tensor<fp16, [2560, 640]> concat_1_to_fp16 = const()[name = tensor<string, []>("concat_1_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1314688)))];
+        tensor<fp16, [2560, 640]> concat_2_to_fp16 = const()[name = tensor<string, []>("concat_2_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4591552)))];
+        tensor<fp16, [2560]> concat_0_to_fp16 = const()[name = tensor<string, []>("concat_0_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7868416)))];
+        tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = y_cast_fp16_cast_uint16)[name = tensor<string, []>("transpose_2")];
+        tensor<fp16, [1, 1, 640]> input_cast_fp16_0, tensor<fp16, [1, 640]> input_cast_fp16_1, tensor<fp16, [1, 640]> input_cast_fp16_2 = lstm(activation = input_activation_0, bias = concat_0_to_fp16, cell_activation = input_cell_activation_0, direction = input_direction_0, initial_c = input_lstm_c0_squeeze_cast_fp16, initial_h = input_lstm_h0_squeeze_cast_fp16, output_sequence = input_output_sequence_0, recurrent_activation = input_recurrent_activation_0, weight_hh = concat_2_to_fp16, weight_ih = concat_1_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
+        tensor<int32, [1]> obj_3_axes_0 = const()[name = tensor<string, []>("obj_3_axes_0"), val = tensor<int32, [1]>([0])];
+        tensor<fp16, [1, 1, 640]> obj_3_cast_fp16 = expand_dims(axes = obj_3_axes_0, x = input_cast_fp16_1)[name = tensor<string, []>("obj_3_cast_fp16")];
+        tensor<string, []> obj_3_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_3_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
+        tensor<int32, [1]> obj_axes_0 = const()[name = tensor<string, []>("obj_axes_0"), val = tensor<int32, [1]>([0])];
+        tensor<fp16, [1, 1, 640]> obj_cast_fp16 = expand_dims(axes = obj_axes_0, x = input_cast_fp16_2)[name = tensor<string, []>("obj_cast_fp16")];
+        tensor<string, []> obj_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
+        tensor<int32, [3]> transpose_0_perm_0 = const()[name = tensor<string, []>("transpose_0_perm_0"), val = tensor<int32, [3]>([1, 2, 0])];
+        tensor<string, []> transpose_0_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("transpose_0_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
+        tensor<fp16, [1, 640, 1]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = input_cast_fp16_0)[name = tensor<string, []>("transpose_1")];
+        tensor<fp32, [1, 640, 1]> decoder = cast(dtype = transpose_0_cast_fp16_to_fp32_dtype_0, x = transpose_0_cast_fp16)[name = tensor<string, []>("cast_3")];
+        tensor<fp32, [1, 1, 640]> c_out = cast(dtype = obj_cast_fp16_to_fp32_dtype_0, x = obj_cast_fp16)[name = tensor<string, []>("cast_4")];
+        tensor<fp32, [1, 1, 640]> h_out = cast(dtype = obj_3_cast_fp16_to_fp32_dtype_0, x = obj_3_cast_fp16)[name = tensor<string, []>("cast_5")];
+        tensor<int32, [1]> target_length_tmp = identity(x = target_length)[name = tensor<string, []>("target_length_tmp")];
+    } -> (decoder, h_out, c_out);
+}
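The MIL program above is the whole RNNT prediction network: an embedding lookup followed by one 640-unit LSTM step. A minimal sketch of driving it from Python on macOS follows (not part of this commit); the input/output names and shapes come from the func main signature above, while the model path and the token id fed in are purely illustrative assumptions.

import numpy as np
import coremltools as ct

decoder = ct.models.CompiledMLModel("1280ms/decoder.mlmodelc")

# Zero LSTM state for the first symbol step; shapes match h_in/c_in above.
h = np.zeros((1, 1, 640), dtype=np.float32)
c = np.zeros((1, 1, 640), dtype=np.float32)
out = decoder.predict({
    "targets": np.array([[0]], dtype=np.int32),    # previously emitted token id (0 is illustrative)
    "target_length": np.array([1], dtype=np.int32),
    "h_in": h,
    "c_in": c,
})
dec_step = out["decoder"]                          # [1, 640, 1] prediction-network output
h, c = out["h_out"], out["c_out"]                  # carry LSTM state into the next symbol step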
1280ms/decoder.mlmodelc/weights/weight.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b4cacecdcd9df79ab1e56de67230baf5a8664d2afe0bb8f3408eefa972cb2f4
+size 7873600
1280ms/individual_components.py
ADDED
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+"""Export Parakeet Realtime EOU 120M RNNT components into CoreML."""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional, Tuple
+
+import coremltools as ct
+import torch
+
+
+@dataclass
+class ExportSettings:
+    output_dir: Path
+    compute_units: ct.ComputeUnit
+    deployment_target: Optional[ct.target]
+    compute_precision: Optional[ct.precision]
+    max_audio_seconds: float
+    max_symbol_steps: int
+
+
+class PreprocessorWrapper(torch.nn.Module):
+    """Wrapper for the audio preprocessor (mel spectrogram extraction)."""
+
+    def __init__(self, module: torch.nn.Module) -> None:
+        super().__init__()
+        self.module = module
+
+    def forward(
+        self, audio_signal: torch.Tensor, length: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        mel, mel_length = self.module(
+            input_signal=audio_signal, length=length.to(dtype=torch.long)
+        )
+        return mel, mel_length
+
+
+class EncoderWrapper(torch.nn.Module):
+    """Wrapper for the cache-aware FastConformer encoder.
+
+    Note: For the realtime EOU model, the encoder is cache-aware, which means
+    it can operate in a streaming fashion. For CoreML export, we export
+    without cache state for simplicity (full-context mode).
+    """
+
+    def __init__(self, module: torch.nn.Module) -> None:
+        super().__init__()
+        self.module = module
+
+    def forward(
+        self, features: torch.Tensor, length: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        encoded, encoded_lengths = self.module(
+            audio_signal=features, length=length.to(dtype=torch.long)
+        )
+        # Synthesize per-frame timestamps (seconds) using the 80 ms encoder stride.
+        # Shape: [B, T_enc]
+        frame_times = (
+            torch.arange(encoded.shape[-1], device=encoded.device, dtype=torch.float32)
+            * 0.08
+        )
+        return encoded, encoded_lengths, frame_times
+
+
+class DecoderWrapper(torch.nn.Module):
+    """Wrapper for the RNNT prediction network (decoder)."""
+
+    def __init__(self, module: torch.nn.Module) -> None:
+        super().__init__()
+        self.module = module
+
+    def forward(
+        self,
+        targets: torch.Tensor,
+        target_lengths: torch.Tensor,
+        h_in: torch.Tensor,
+        c_in: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        state = [h_in, c_in]
+        decoder_output, _, new_state = self.module(
+            targets=targets.to(dtype=torch.long),
+            target_length=target_lengths.to(dtype=torch.long),
+            states=state,
+        )
+        return decoder_output, new_state[0], new_state[1]
+
+
+class JointWrapper(torch.nn.Module):
+    """Wrapper for the RNNT joint network.
+
+    Note: Unlike Parakeet TDT v3, the realtime EOU model does NOT have
+    duration outputs (num_extra_outputs). The joint network outputs only
+    token logits over the vocabulary + blank.
+    """
+
+    def __init__(self, module: torch.nn.Module) -> None:
+        super().__init__()
+        self.module = module
+
+    def forward(
+        self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor
+    ) -> torch.Tensor:
+        # Input: encoder_outputs [B, D, T], decoder_outputs [B, D, U]
+        # Transpose to match what projection layers expect
+        encoder_outputs = encoder_outputs.transpose(1, 2)  # [B, T, D]
+        decoder_outputs = decoder_outputs.transpose(1, 2)  # [B, U, D]
+
+        # Apply projections
+        enc_proj = self.module.enc(encoder_outputs)  # [B, T, joint_hidden]
+        dec_proj = self.module.pred(decoder_outputs)  # [B, U, joint_hidden]
+
+        # Explicit broadcasting along T and U to avoid converter ambiguity
+        x = enc_proj.unsqueeze(2) + dec_proj.unsqueeze(1)  # [B, T, U, joint_hidden]
+        x = self.module.joint_net[0](x)  # ReLU
+        x = self.module.joint_net[1](x)  # Dropout (no-op in eval)
+        out = self.module.joint_net[2](x)  # Linear -> logits [B, T, U, vocab+blank]
+        return out
+
+
+class MelEncoderWrapper(torch.nn.Module):
+    """Fused wrapper: waveform -> mel -> encoder.
+
+    Inputs:
+      - audio_signal: [B, S]
+      - audio_length: [B]
+
+    Outputs:
+      - encoder: [B, D, T_enc]
+      - encoder_length: [B]
+      - frame_times: [T_enc]
+    """
+
+    def __init__(
+        self, preprocessor: PreprocessorWrapper, encoder: EncoderWrapper
+    ) -> None:
+        super().__init__()
+        self.preprocessor = preprocessor
+        self.encoder = encoder
+
+    def forward(
+        self, audio_signal: torch.Tensor, audio_length: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        mel, mel_length = self.preprocessor(audio_signal, audio_length)
+        encoded, enc_len, frame_times = self.encoder(mel, mel_length.to(dtype=torch.int32))
+        return encoded, enc_len, frame_times
+
+
+class JointDecisionWrapper(torch.nn.Module):
+    """Joint + decision head: outputs label id and label prob.
+
+    Unlike Parakeet TDT v3, this model does NOT have duration outputs.
+
+    Inputs:
+      - encoder_outputs: [B, D, T]
+      - decoder_outputs: [B, D, U]
+
+    Returns:
+      - token_id: [B, T, U] int32
+      - token_prob: [B, T, U] float32
+    """
+
+    def __init__(self, joint: JointWrapper, vocab_size: int) -> None:
+        super().__init__()
+        self.joint = joint
+        self.vocab_with_blank = int(vocab_size) + 1
+
+    def forward(self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor):
+        logits = self.joint(encoder_outputs, decoder_outputs)
+
+        # Token selection
+        token_ids = torch.argmax(logits, dim=-1).to(dtype=torch.int32)
+        token_probs_all = torch.softmax(logits, dim=-1)
+        # gather expects int64 (long) indices; cast only for gather
+        token_prob = torch.gather(
+            token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
+        ).squeeze(-1)
+
+        return token_ids, token_prob
+
+
+class JointDecisionSingleStep(torch.nn.Module):
+    """Single-step variant for streaming: encoder_step -> token decision.
+
+    Inputs:
+      - encoder_step: [B=1, D, T=1]
+      - decoder_step: [B=1, D, U=1]
+
+    Returns:
+      - token_id: [1, 1, 1] int32
+      - token_prob: [1, 1, 1] float32
+      - top_k_ids: [1, 1, 1, K] int32
+      - top_k_logits: [1, 1, 1, K] float32
+    """
+
+    def __init__(self, joint: JointWrapper, vocab_size: int, top_k: int = 64) -> None:
+        super().__init__()
+        self.joint = joint
+        self.vocab_with_blank = int(vocab_size) + 1
+        self.top_k = int(top_k)
+
+    def forward(self, encoder_step: torch.Tensor, decoder_step: torch.Tensor):
+        # Reuse JointWrapper which expects [B, D, T] and [B, D, U]
+        logits = self.joint(encoder_step, decoder_step)  # [1, 1, 1, V+blank]
+
+        token_ids = torch.argmax(logits, dim=-1, keepdim=False).to(dtype=torch.int32)
+        token_probs_all = torch.softmax(logits, dim=-1)
+        token_prob = torch.gather(
+            token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
+        ).squeeze(-1)
+
+        # Also expose top-K candidates for host-side processing
+        topk_logits, topk_ids_long = torch.topk(
+            logits, k=min(self.top_k, logits.shape[-1]), dim=-1
+        )
+        topk_ids = topk_ids_long.to(dtype=torch.int32)
+        return token_ids, token_prob, topk_ids, topk_logits
+
+
+def _coreml_convert(
+    traced: torch.jit.ScriptModule,
+    inputs,
+    outputs,
+    settings: ExportSettings,
+    compute_units_override: Optional[ct.ComputeUnit] = None,
+    compute_precision: Optional[ct.precision] = None,
+) -> ct.models.MLModel:
+    cu = (
+        compute_units_override
+        if compute_units_override is not None
+        else settings.compute_units
+    )
+    kwargs = {
+        "convert_to": "mlprogram",
+        "inputs": inputs,
+        "outputs": outputs,
+        "compute_units": cu,
+    }
+    print("Converting:", traced.__class__.__name__)
+    print("Conversion kwargs:", kwargs)
+    if settings.deployment_target is not None:
+        kwargs["minimum_deployment_target"] = settings.deployment_target
+
+    # Priority: explicit argument > settings
+    if compute_precision is not None:
+        kwargs["compute_precision"] = compute_precision
+    elif settings.compute_precision is not None:
+        kwargs["compute_precision"] = settings.compute_precision
+
+    return ct.convert(traced, **kwargs)
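A hypothetical driver for these helpers (not part of the file) would trace a wrapper and hand it to _coreml_convert. In the sketch below, `asr_model` stands in for a loaded NeMo RNNT model and the ExportSettings values are illustrative; the tensor names and shapes, however, match the exported decoder artifacts in this commit.

from pathlib import Path

import coremltools as ct
import numpy as np
import torch

settings = ExportSettings(
    output_dir=Path("export"),
    compute_units=ct.ComputeUnit.CPU_AND_NE,
    deployment_target=ct.target.iOS17,
    compute_precision=ct.precision.FLOAT16,
    max_audio_seconds=2.0,
    max_symbol_steps=10,
)

wrapper = DecoderWrapper(asr_model.decoder).eval()   # asr_model: loaded NeMo model (assumed)
example = (
    torch.zeros(1, 1, dtype=torch.int32),            # targets
    torch.ones(1, dtype=torch.int32),                # target_lengths
    torch.zeros(1, 1, 640),                          # h_in
    torch.zeros(1, 1, 640),                          # c_in
)
traced = torch.jit.trace(wrapper, example)
mlmodel = _coreml_convert(
    traced,
    inputs=[
        ct.TensorType(name="targets", shape=(1, 1), dtype=np.int32),
        ct.TensorType(name="target_length", shape=(1,), dtype=np.int32),
        ct.TensorType(name="h_in", shape=(1, 1, 640)),
        ct.TensorType(name="c_in", shape=(1, 1, 640)),
    ],
    outputs=[
        ct.TensorType(name="decoder"),
        ct.TensorType(name="h_out"),
        ct.TensorType(name="c_out"),
    ],
    settings=settings,
)
mlmodel.save(str(settings.output_dir / "decoder.mlpackage"))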
1280ms/joint_decision.mlmodelc/analytics/coremldata.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5bca32ad130dcad6605cc00044c752aa5b45ef57d14c17f2d1a2fa49d6cf55b5
+size 243
1280ms/joint_decision.mlmodelc/coremldata.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22d4abc4625b935ee035b5f8ce7cb28d1041b9b01c12173e287bf4b5f5d99625
+size 493
1280ms/joint_decision.mlmodelc/metadata.json
ADDED
@@ -0,0 +1,112 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "shortDescription" : "Parakeet EOU single-step joint decision",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1]",
+        "name" : "token_id",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 1 × 1 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1]",
+        "name" : "token_prob",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1 × 1 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1, 64]",
+        "name" : "top_k_ids",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 1 × 1 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1, 64]",
+        "name" : "top_k_logits",
+        "type" : "MultiArray"
+      }
+    ],
+    "storagePrecision" : "Float16",
+    "modelParameters" : [
+
+    ],
+    "author" : "Fluid Inference",
+    "specificationVersion" : 8,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios17.reduceArgmax" : 1,
+      "Ios17.squeeze" : 1,
+      "Ios17.cast" : 6,
+      "Ios17.linear" : 3,
+      "Ios17.transpose" : 2,
+      "Ios17.add" : 1,
+      "Ios16.relu" : 1,
+      "Ios16.softmax" : 1,
+      "Ios17.gatherAlongAxis" : 1,
+      "Ios17.topk" : 1,
+      "Ios17.expandDims" : 3
+    },
+    "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "14.0",
+      "tvOS" : "17.0",
+      "visionOS" : "1.0",
+      "watchOS" : "10.0",
+      "iOS" : "17.0",
+      "macCatalyst" : "17.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 1 × 512 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 512, 1]",
+        "name" : "encoder_step",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 640, 1]",
+        "name" : "decoder_step",
+        "type" : "MultiArray"
+      }
+    ],
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.version" : "8.3.0",
+      "com.github.apple.coremltools.source" : "torch==2.4.0"
+    },
+    "generatedClassName" : "parakeet_eou_joint_decision_single_step",
+    "method" : "predict"
+  }
+]
1280ms/joint_decision.mlmodelc/model.mil
ADDED
@@ -0,0 +1,57 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
+{
+    func main<ios17>(tensor<fp32, [1, 640, 1]> decoder_step, tensor<fp32, [1, 512, 1]> encoder_step) {
+        tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+        tensor<string, []> encoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
+        tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+        tensor<string, []> decoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("decoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
+        tensor<fp16, [640, 512]> joint_module_enc_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_weight_to_fp16"), val = tensor<fp16, [640, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+        tensor<fp16, [640]> joint_module_enc_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(655488)))];
+        tensor<fp16, [1, 512, 1]> encoder_step_to_fp16 = cast(dtype = encoder_step_to_fp16_dtype_0, x = encoder_step)[name = tensor<string, []>("cast_8")];
+        tensor<fp16, [1, 1, 512]> input_1_cast_fp16 = transpose(perm = input_1_perm_0, x = encoder_step_to_fp16)[name = tensor<string, []>("transpose_1")];
+        tensor<fp16, [1, 1, 640]> linear_0_cast_fp16 = linear(bias = joint_module_enc_bias_to_fp16, weight = joint_module_enc_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("linear_0_cast_fp16")];
+        tensor<fp16, [640, 640]> joint_module_pred_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_weight_to_fp16"), val = tensor<fp16, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(656832)))];
+        tensor<fp16, [640]> joint_module_pred_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1476096)))];
+        tensor<fp16, [1, 640, 1]> decoder_step_to_fp16 = cast(dtype = decoder_step_to_fp16_dtype_0, x = decoder_step)[name = tensor<string, []>("cast_7")];
+        tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = decoder_step_to_fp16)[name = tensor<string, []>("transpose_0")];
+        tensor<fp16, [1, 1, 640]> linear_1_cast_fp16 = linear(bias = joint_module_pred_bias_to_fp16, weight = joint_module_pred_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("linear_1_cast_fp16")];
+        tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
+        tensor<fp16, [1, 1, 1, 640]> var_23_cast_fp16 = expand_dims(axes = var_23_axes_0, x = linear_0_cast_fp16)[name = tensor<string, []>("op_23_cast_fp16")];
+        tensor<int32, [1]> var_24_axes_0 = const()[name = tensor<string, []>("op_24_axes_0"), val = tensor<int32, [1]>([1])];
+        tensor<fp16, [1, 1, 1, 640]> var_24_cast_fp16 = expand_dims(axes = var_24_axes_0, x = linear_1_cast_fp16)[name = tensor<string, []>("op_24_cast_fp16")];
+        tensor<fp16, [1, 1, 1, 640]> input_5_cast_fp16 = add(x = var_23_cast_fp16, y = var_24_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+        tensor<fp16, [1, 1, 1, 640]> input_7_cast_fp16 = relu(x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+        tensor<fp16, [1027, 640]> joint_module_joint_net_2_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_weight_to_fp16"), val = tensor<fp16, [1027, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1477440)))];
+        tensor<fp16, [1027]> joint_module_joint_net_2_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_bias_to_fp16"), val = tensor<fp16, [1027]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2792064)))];
+        tensor<fp16, [1, 1, 1, 1027]> linear_2_cast_fp16 = linear(bias = joint_module_joint_net_2_bias_to_fp16, weight = joint_module_joint_net_2_weight_to_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("linear_2_cast_fp16")];
+        tensor<int32, []> var_38_axis_0 = const()[name = tensor<string, []>("op_38_axis_0"), val = tensor<int32, []>(-1)];
+        tensor<bool, []> var_38_keep_dims_0 = const()[name = tensor<string, []>("op_38_keep_dims_0"), val = tensor<bool, []>(false)];
+        tensor<string, []> var_38_output_dtype_0 = const()[name = tensor<string, []>("op_38_output_dtype_0"), val = tensor<string, []>("int32")];
+        tensor<int32, [1, 1, 1]> token_id = reduce_argmax(axis = var_38_axis_0, keep_dims = var_38_keep_dims_0, output_dtype = var_38_output_dtype_0, x = linear_2_cast_fp16)[name = tensor<string, []>("op_38_cast_fp16")];
+        tensor<int32, []> var_44 = const()[name = tensor<string, []>("op_44"), val = tensor<int32, []>(-1)];
+        tensor<fp16, [1, 1, 1, 1027]> token_probs_all_cast_fp16 = softmax(axis = var_44, x = linear_2_cast_fp16)[name = tensor<string, []>("token_probs_all_cast_fp16")];
+        tensor<int32, [1]> var_53_axes_0 = const()[name = tensor<string, []>("op_53_axes_0"), val = tensor<int32, [1]>([-1])];
+        tensor<int32, [1, 1, 1, 1]> var_53 = expand_dims(axes = var_53_axes_0, x = token_id)[name = tensor<string, []>("op_53")];
+        tensor<int32, []> var_54 = const()[name = tensor<string, []>("op_54"), val = tensor<int32, []>(-1)];
+        tensor<bool, []> var_56_validate_indices_0 = const()[name = tensor<string, []>("op_56_validate_indices_0"), val = tensor<bool, []>(false)];
+        tensor<string, []> var_53_to_int16_dtype_0 = const()[name = tensor<string, []>("op_53_to_int16_dtype_0"), val = tensor<string, []>("int16")];
+        tensor<int16, [1, 1, 1, 1]> var_53_to_int16 = cast(dtype = var_53_to_int16_dtype_0, x = var_53)[name = tensor<string, []>("cast_6")];
+        tensor<fp16, [1, 1, 1, 1]> var_56_cast_fp16_cast_int16 = gather_along_axis(axis = var_54, indices = var_53_to_int16, validate_indices = var_56_validate_indices_0, x = token_probs_all_cast_fp16)[name = tensor<string, []>("op_56_cast_fp16_cast_int16")];
+        tensor<int32, [1]> var_58_axes_0 = const()[name = tensor<string, []>("op_58_axes_0"), val = tensor<int32, [1]>([-1])];
+        tensor<fp16, [1, 1, 1]> var_58_cast_fp16 = squeeze(axes = var_58_axes_0, x = var_56_cast_fp16_cast_int16)[name = tensor<string, []>("op_58_cast_fp16")];
+        tensor<string, []> var_58_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_58_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
+        tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(64)];
+        tensor<int32, []> var_63_axis_0 = const()[name = tensor<string, []>("op_63_axis_0"), val = tensor<int32, []>(-1)];
+        tensor<bool, []> var_63_ascending_0 = const()[name = tensor<string, []>("op_63_ascending_0"), val = tensor<bool, []>(false)];
+        tensor<bool, []> var_63_sort_0 = const()[name = tensor<string, []>("op_63_sort_0"), val = tensor<bool, []>(true)];
+        tensor<bool, []> var_63_return_indices_0 = const()[name = tensor<string, []>("op_63_return_indices_0"), val = tensor<bool, []>(true)];
+        tensor<string, []> var_63_cast_fp16_cast_int16_output_indices_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_cast_int16_output_indices_dtype_0"), val = tensor<string, []>("uint16")];
+        tensor<fp16, [1, 1, 1, 64]> var_63_cast_fp16_cast_int16_0, tensor<uint16, [1, 1, 1, 64]> var_63_cast_fp16_cast_int16_1 = topk(ascending = var_63_ascending_0, axis = var_63_axis_0, k = var_59, output_indices_dtype = var_63_cast_fp16_cast_int16_output_indices_dtype_0, return_indices = var_63_return_indices_0, sort = var_63_sort_0, x = linear_2_cast_fp16)[name = tensor<string, []>("op_63_cast_fp16_cast_int16")];
+        tensor<string, []> var_63_cast_fp16_cast_int16_1_to_int32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_cast_int16_1_to_int32_dtype_0"), val = tensor<string, []>("int32")];
+        tensor<string, []> var_63_cast_fp16_0_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_0_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
+        tensor<fp32, [1, 1, 1, 64]> top_k_logits = cast(dtype = var_63_cast_fp16_0_to_fp32_dtype_0, x = var_63_cast_fp16_cast_int16_0)[name = tensor<string, []>("cast_3")];
+        tensor<int32, [1, 1, 1, 64]> top_k_ids = cast(dtype = var_63_cast_fp16_cast_int16_1_to_int32_dtype_0, x = var_63_cast_fp16_cast_int16_1)[name = tensor<string, []>("cast_4")];
+        tensor<fp32, [1, 1, 1]> token_prob = cast(dtype = var_58_cast_fp16_to_fp32_dtype_0, x = var_58_cast_fp16)[name = tensor<string, []>("cast_5")];
+    } -> (token_id, token_prob, top_k_ids, top_k_logits);
+}
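Together with the decoder, this single-step joint supports a greedy RNNT inner loop on the host. The sketch below is an assumed driver, not code from this commit: the blank token id and the per-frame symbol cap are illustrative values, while the tensor names and shapes follow the two models' schemas above.

import numpy as np
import coremltools as ct

decoder = ct.models.CompiledMLModel("1280ms/decoder.mlmodelc")
joint = ct.models.CompiledMLModel("1280ms/joint_decision.mlmodelc")

BLANK_ID = 1026        # assumption: blank is one of the 1027 joint outputs
MAX_SYMBOLS = 10       # assumption: cap on symbols emitted per encoder frame

def greedy_step(enc_frame, last_token, h, c):
    """enc_frame: [1, 512, 1] float32. Returns emitted ids plus updated state."""
    emitted = []
    for _ in range(MAX_SYMBOLS):
        dec = decoder.predict({
            "targets": np.array([[last_token]], dtype=np.int32),
            "target_length": np.array([1], dtype=np.int32),
            "h_in": h,
            "c_in": c,
        })
        res = joint.predict({
            "encoder_step": enc_frame.astype(np.float32),
            "decoder_step": dec["decoder"].astype(np.float32),
        })
        tok = int(res["token_id"].reshape(-1)[0])
        if tok == BLANK_ID:
            break                                  # blank: move on to the next encoder frame
        emitted.append(tok)                        # non-blank: accept token, advance decoder state
        last_token, h, c = tok, dec["h_out"], dec["c_out"]
    return emitted, last_token, h, c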
1280ms/joint_decision.mlmodelc/weights/weight.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7039b2010a269153f5a96edf28637f921a86ef8822f248f2d6712f7a6bce84b4
+size 2794182
1280ms/parakeet_eou_preprocessor.mlmodelc/analytics/coremldata.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4ada8b0b99ac1d2ba7acbffacfbbf1a06cb69d30e9410d237ee0aa4c2b0ad63
+size 243
1280ms/parakeet_eou_preprocessor.mlmodelc/coremldata.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc7252fa47622fe39577361233627062019a3bb740fdbb5366a7bae09df0ec5e
+size 422
1280ms/parakeet_eou_preprocessor.mlmodelc/metadata.json
ADDED
@@ -0,0 +1,105 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "shortDescription" : "Parakeet EOU preprocessor",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32)",
+        "shortDescription" : "",
+        "shape" : "[]",
+        "name" : "mel",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "mel_length",
+        "type" : "MultiArray"
+      }
+    ],
+    "storagePrecision" : "Float32",
+    "modelParameters" : [
+
+    ],
+    "author" : "Fluid Inference",
+    "specificationVersion" : 8,
+    "mlProgramOperationTypeHistogram" : {
+      "Range1d" : 1,
+      "Ios17.reshape" : 2,
+      "Identity" : 1,
+      "Ios17.matmul" : 1,
+      "Ios17.expandDims" : 5,
+      "Select" : 1,
+      "Ios17.add" : 3,
+      "Ios17.sliceByIndex" : 3,
+      "Ios16.reduceSum" : 1,
+      "Shape" : 1,
+      "Ios17.gather" : 1,
+      "Pad" : 1,
+      "Ios17.log" : 1,
+      "Ios17.conv" : 2,
+      "Ios17.sub" : 2,
+      "Ios17.pow" : 1,
+      "Ios17.cast" : 2,
+      "Stack" : 1,
+      "Ios17.concat" : 1,
+      "Ios17.floorDiv" : 1,
+      "Ios17.greaterEqual" : 1,
+      "Ios17.mul" : 1
+    },
+    "computePrecision" : "Mixed (Float32, Int32)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "14.0",
+      "tvOS" : "17.0",
+      "visionOS" : "1.0",
+      "watchOS" : "10.0",
+      "iOS" : "17.0",
+      "macCatalyst" : "17.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "inputSchema" : [
+      {
+        "dataType" : "Float32",
+        "hasShapeFlexibility" : "1",
+        "isOptional" : "0",
+        "shapeFlexibility" : "1 × 1...32000",
+        "shapeRange" : "[[1, 1], [1, 32000]]",
+        "formattedType" : "MultiArray (Float32 1 × 1)",
+        "type" : "MultiArray",
+        "shape" : "[1, 1]",
+        "name" : "audio_signal",
+        "shortDescription" : ""
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "audio_length",
+        "type" : "MultiArray"
+      }
+    ],
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.4.0",
+      "com.github.apple.coremltools.version" : "8.3.0"
+    },
+    "generatedClassName" : "parakeet_eou_preprocessor",
+    "method" : "predict"
+  }
+]
1280ms/parakeet_eou_preprocessor.mlmodelc/model.mil
ADDED
@@ -0,0 +1,96 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
+{
+    func main<ios17>(tensor<int32, [1]> audio_length, tensor<fp32, [1, ?]> audio_signal) [FlexibleShapeInformation = tuple<tuple<tensor<string, []>, dict<tensor<string, []>, tensor<int32, [?]>>>, tuple<tensor<string, []>, dict<tensor<string, []>, list<tensor<int32, [2]>, ?>>>>((("DefaultShapes", {{"audio_signal", [1, 1]}}), ("RangeDims", {{"audio_signal", [[1, 1], [1, 32000]]}})))] {
+        tensor<int32, []> var_9 = const()[name = tensor<string, []>("op_9"), val = tensor<int32, []>(1)];
+        tensor<int32, []> var_10 = const()[name = tensor<string, []>("op_10"), val = tensor<int32, []>(160)];
+        tensor<int32, []> var_32 = const()[name = tensor<string, []>("op_32"), val = tensor<int32, []>(512)];
+        tensor<int32, [1]> var_33 = add(x = audio_length, y = var_32)[name = tensor<string, []>("op_33")];
+        tensor<int32, []> var_34 = const()[name = tensor<string, []>("op_34"), val = tensor<int32, []>(512)];
+        tensor<int32, [1]> var_35 = sub(x = var_33, y = var_34)[name = tensor<string, []>("op_35")];
+        tensor<int32, [1]> floor_div_0 = floor_div(x = var_35, y = var_10)[name = tensor<string, []>("floor_div_0")];
+        tensor<string, []> var_36_dtype_0 = const()[name = tensor<string, []>("op_36_dtype_0"), val = tensor<string, []>("fp32")];
+        tensor<fp32, []> var_37_promoted = const()[name = tensor<string, []>("op_37_promoted"), val = tensor<fp32, []>(0x1p+0)];
+        tensor<fp32, [1]> var_36 = cast(dtype = var_36_dtype_0, x = floor_div_0)[name = tensor<string, []>("cast_11")];
+        tensor<fp32, [1]> seq_len_1 = add(x = var_36, y = var_37_promoted)[name = tensor<string, []>("seq_len_1")];
+        tensor<string, []> cast_2_dtype_0 = const()[name = tensor<string, []>("cast_2_dtype_0"), val = tensor<string, []>("int32")];
+        tensor<int32, [2]> var_41_begin_0 = const()[name = tensor<string, []>("op_41_begin_0"), val = tensor<int32, [2]>([0, 0])];
+        tensor<int32, [2]> var_41_end_0 = const()[name = tensor<string, []>("op_41_end_0"), val = tensor<int32, [2]>([1, 1])];
+        tensor<bool, [2]> var_41_end_mask_0 = const()[name = tensor<string, []>("op_41_end_mask_0"), val = tensor<bool, [2]>([true, false])];
+        tensor<bool, [2]> var_41_squeeze_mask_0 = const()[name = tensor<string, []>("op_41_squeeze_mask_0"), val = tensor<bool, [2]>([false, true])];
+        tensor<fp32, [1]> var_41 = slice_by_index(begin = var_41_begin_0, end = var_41_end_0, end_mask = var_41_end_mask_0, squeeze_mask = var_41_squeeze_mask_0, x = audio_signal)[name = tensor<string, []>("op_41")];
+        tensor<int32, [1]> var_42_axes_0 = const()[name = tensor<string, []>("op_42_axes_0"), val = tensor<int32, [1]>([1])];
+        tensor<fp32, [1, 1]> var_42 = expand_dims(axes = var_42_axes_0, x = var_41)[name = tensor<string, []>("op_42")];
+        tensor<int32, [2]> var_44_begin_0 = const()[name = tensor<string, []>("op_44_begin_0"), val = tensor<int32, [2]>([0, 1])];
+        tensor<int32, [2]> var_44_end_0 = const()[name = tensor<string, []>("op_44_end_0"), val = tensor<int32, [2]>([1, 0])];
+        tensor<bool, [2]> var_44_end_mask_0 = const()[name = tensor<string, []>("op_44_end_mask_0"), val = tensor<bool, [2]>([true, true])];
+        tensor<fp32, [1, ?]> var_44 = slice_by_index(begin = var_44_begin_0, end = var_44_end_0, end_mask = var_44_end_mask_0, x = audio_signal)[name = tensor<string, []>("op_44")];
+        tensor<int32, [2]> var_46_begin_0 = const()[name = tensor<string, []>("op_46_begin_0"), val = tensor<int32, [2]>([0, 0])];
+        tensor<int32, [2]> var_46_end_0 = const()[name = tensor<string, []>("op_46_end_0"), val = tensor<int32, [2]>([1, -1])];
+        tensor<bool, [2]> var_46_end_mask_0 = const()[name = tensor<string, []>("op_46_end_mask_0"), val = tensor<bool, [2]>([true, false])];
+        tensor<fp32, [1, ?]> var_46 = slice_by_index(begin = var_46_begin_0, end = var_46_end_0, end_mask = var_46_end_mask_0, x = audio_signal)[name = tensor<string, []>("op_46")];
+        tensor<fp32, []> var_47 = const()[name = tensor<string, []>("op_47"), val = tensor<fp32, []>(0x1.f0a3d8p-1)];
+        tensor<fp32, [1, ?]> var_48 = mul(x = var_46, y = var_47)[name = tensor<string, []>("op_48")];
+        tensor<fp32, [1, ?]> var_49 = sub(x = var_44, y = var_48)[name = tensor<string, []>("op_49")];
+        tensor<bool, []> input_1_interleave_0 = const()[name = tensor<string, []>("input_1_interleave_0"), val = tensor<bool, []>(false)];
+        tensor<fp32, [1, ?]> input_1 = concat(axis = var_9, interleave = input_1_interleave_0, values = (var_42, var_49))[name = tensor<string, []>("input_1")];
+        tensor<int32, [3]> concat_0x = const()[name = tensor<string, []>("concat_0x"), val = tensor<int32, [3]>([1, 1, -1])];
+        tensor<fp32, [1, 1, ?]> input_3 = reshape(shape = concat_0x, x = input_1)[name = tensor<string, []>("input_3")];
+        tensor<fp32, []> const_1 = const()[name = tensor<string, []>("const_1"), val = tensor<fp32, []>(0x0p+0)];
+        tensor<int32, [6]> input_5_pad_0 = const()[name = tensor<string, []>("input_5_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 256, 256])];
+        tensor<string, []> input_5_mode_0 = const()[name = tensor<string, []>("input_5_mode_0"), val = tensor<string, []>("reflect")];
+        tensor<fp32, [1, 1, ?]> input_5 = pad(constant_val = const_1, mode = input_5_mode_0, pad = input_5_pad_0, x = input_3)[name = tensor<string, []>("input_5")];
+        tensor<int32, [2]> concat_1x = const()[name = tensor<string, []>("concat_1x"), val = tensor<int32, [2]>([1, -1])];
+        tensor<fp32, [1, ?]> input = reshape(shape = concat_1x, x = input_5)[name = tensor<string, []>("input")];
+        tensor<fp32, [257, 1, 512]> expand_dims_1 = const()[name = tensor<string, []>("expand_dims_1"), val = tensor<fp32, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+        tensor<fp32, [257, 1, 512]> expand_dims_2 = const()[name = tensor<string, []>("expand_dims_2"), val = tensor<fp32, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(526464)))];
+        tensor<int32, [1]> expand_dims_3 = const()[name = tensor<string, []>("expand_dims_3"), val = tensor<int32, [1]>([160])];
+        tensor<int32, [1]> expand_dims_4_axes_0 = const()[name = tensor<string, []>("expand_dims_4_axes_0"), val = tensor<int32, [1]>([1])];
+        tensor<fp32, [1, 1, ?]> expand_dims_4 = expand_dims(axes = expand_dims_4_axes_0, x = input)[name = tensor<string, []>("expand_dims_4")];
+        tensor<string, []> conv_0_pad_type_0 = const()[name = tensor<string, []>("conv_0_pad_type_0"), val = tensor<string, []>("valid")];
+        tensor<int32, [2]> conv_0_pad_0 = const()[name = tensor<string, []>("conv_0_pad_0"), val = tensor<int32, [2]>([0, 0])];
+        tensor<int32, [1]> conv_0_dilations_0 = const()[name = tensor<string, []>("conv_0_dilations_0"), val = tensor<int32, [1]>([1])];
+        tensor<int32, []> conv_0_groups_0 = const()[name = tensor<string, []>("conv_0_groups_0"), val = tensor<int32, []>(1)];
+        tensor<fp32, [1, 257, ?]> conv_0 = conv(dilations = conv_0_dilations_0, groups = conv_0_groups_0, pad = conv_0_pad_0, pad_type = conv_0_pad_type_0, strides = expand_dims_3, weight = expand_dims_1, x = expand_dims_4)[name = tensor<string, []>("conv_0")];
+        tensor<string, []> conv_1_pad_type_0 = const()[name = tensor<string, []>("conv_1_pad_type_0"), val = tensor<string, []>("valid")];
+        tensor<int32, [2]> conv_1_pad_0 = const()[name = tensor<string, []>("conv_1_pad_0"), val = tensor<int32, [2]>([0, 0])];
+        tensor<int32, [1]> conv_1_dilations_0 = const()[name = tensor<string, []>("conv_1_dilations_0"), val = tensor<int32, [1]>([1])];
+        tensor<int32, []> conv_1_groups_0 = const()[name = tensor<string, []>("conv_1_groups_0"), val = tensor<int32, []>(1)];
+        tensor<fp32, [1, 257, ?]> conv_1 = conv(dilations = conv_1_dilations_0, groups = conv_1_groups_0, pad = conv_1_pad_0, pad_type = conv_1_pad_type_0, strides = expand_dims_3, weight = expand_dims_2, x = expand_dims_4)[name = tensor<string, []>("conv_1")];
+        tensor<int32, []> stack_0_axis_0 = const()[name = tensor<string, []>("stack_0_axis_0"), val = tensor<int32, []>(-1)];
+        tensor<fp32, [1, 257, ?, 2]> stack_0 = stack(axis = stack_0_axis_0, values = (conv_0, conv_1))[name = tensor<string, []>("stack_0")];
+        tensor<fp32, []> var_17_promoted = const()[name = tensor<string, []>("op_17_promoted"), val = tensor<fp32, []>(0x1p+1)];
+        tensor<fp32, [1, 257, ?, 2]> var_65 = pow(x = stack_0, y = var_17_promoted)[name = tensor<string, []>("op_65")];
+        tensor<int32, [1]> var_67_axes_0 = const()[name = tensor<string, []>("op_67_axes_0"), val = tensor<int32, [1]>([-1])];
+        tensor<bool, []> var_67_keep_dims_0 = const()[name = tensor<string, []>("op_67_keep_dims_0"), val = tensor<bool, []>(false)];
+        tensor<fp32, [1, 257, ?]> var_67 = reduce_sum(axes = var_67_axes_0, keep_dims = var_67_keep_dims_0, x = var_65)[name = tensor<string, []>("op_67")];
+        tensor<fp32, [1, 257, ?]> x_9 = identity(x = var_67)[name = tensor<string, []>("x_9")];
+        tensor<fp32, [1, 128, 257]> const_2 = const()[name = tensor<string, []>("const_2"), val = tensor<fp32, [1, 128, 257]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1052864)))];
+        tensor<bool, []> x_11_transpose_x_0 = const()[name = tensor<string, []>("x_11_transpose_x_0"), val = tensor<bool, []>(false)];
+        tensor<bool, []> x_11_transpose_y_0 = const()[name = tensor<string, []>("x_11_transpose_y_0"), val = tensor<bool, []>(false)];
+        tensor<fp32, [1, 128, ?]> x_11 = matmul(transpose_x = x_11_transpose_x_0, transpose_y = x_11_transpose_y_0, x = const_2, y = x_9)[name = tensor<string, []>("x_11")];
+        tensor<fp32, []> var_74 = const()[name = tensor<string, []>("op_74"), val = tensor<fp32, []>(0x1p-24)];
+        tensor<fp32, [1, 128, ?]> var_75 = add(x = x_11, y = var_74)[name = tensor<string, []>("op_75")];
+        tensor<fp32, []> x_epsilon_0 = const()[name = tensor<string, []>("x_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
+        tensor<fp32, [1, 128, ?]> x = log(epsilon = x_epsilon_0, x = var_75)[name = tensor<string, []>("x")];
+        tensor<int32, [3]> var_77_shape = shape(x = x)[name = tensor<string, []>("op_77_shape")];
+        tensor<int32, []> select_4 = const()[name = tensor<string, []>("select_4"), val = tensor<int32, []>(2)];
+        tensor<int32, []> gather_4_axis_0 = const()[name = tensor<string, []>("gather_4_axis_0"), val = tensor<int32, []>(0)];
+        tensor<int32, []> gather_4_batch_dims_0 = const()[name = tensor<string, []>("gather_4_batch_dims_0"), val = tensor<int32, []>(0)];
+        tensor<bool, []> gather_4_validate_indices_0 = const()[name = tensor<string, []>("gather_4_validate_indices_0"), val = tensor<bool, []>(false)];
+        tensor<int32, []> gather_4 = gather(axis = gather_4_axis_0, batch_dims = gather_4_batch_dims_0, indices = select_4, validate_indices = gather_4_validate_indices_0, x = var_77_shape)[name = tensor<string, []>("gather_4")];
+        tensor<int32, []> const_3 = const()[name = tensor<string, []>("const_3"), val = tensor<int32, []>(0)];
+        tensor<int32, []> const_4 = const()[name = tensor<string, []>("const_4"), val = tensor<int32, []>(1)];
+        tensor<int32, [?]> mask_1 = range_1d(end = gather_4, start = const_3, step = const_4)[name = tensor<string, []>("mask_1")];
+        tensor<int32, [1]> expand_dims_0_axes_0 = const()[name = tensor<string, []>("expand_dims_0_axes_0"), val = tensor<int32, [1]>([0])];
+        tensor<int32, [1, ?]> expand_dims_0 = expand_dims(axes = expand_dims_0_axes_0, x = mask_1)[name = tensor<string, []>("expand_dims_0")];
+        tensor<int32, [1]> var_82_axes_0 = const()[name = tensor<string, []>("op_82_axes_0"), val = tensor<int32, [1]>([1])];
+        tensor<int32, [1]> mel_length = cast(dtype = cast_2_dtype_0, x = seq_len_1)[name = tensor<string, []>("cast_10")];
+        tensor<int32, [1, 1]> var_82 = expand_dims(axes = var_82_axes_0, x = mel_length)[name = tensor<string, []>("op_82")];
+        tensor<bool, [1, ?]> mask = greater_equal(x = expand_dims_0, y = var_82)[name = tensor<string, []>("mask")];
+        tensor<int32, [1]> var_84_axes_0 = const()[name = tensor<string, []>("op_84_axes_0"), val = tensor<int32, [1]>([1])];
+        tensor<bool, [1, 1, ?]> var_84 = expand_dims(axes = var_84_axes_0, x = mask)[name = tensor<string, []>("op_84")];
+        tensor<fp32, []> cast_7 = const()[name = tensor<string, []>("cast_7"), val = tensor<fp32, []>(0x0p+0)];
+        tensor<fp32, [1, 128, ?]> mel = select(a = cast_7, b = x, cond = var_84)[name = tensor<string, []>("processed_signal")];
+    } -> (mel, mel_length);
+}
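The length arithmetic in this program (add 512, subtract 512, floor-divide by 160, add 1) reduces to mel_length = audio_length // 160 + 1: a 160-sample hop (10 ms at 16 kHz) with one extra frame contributed by the 256-sample reflect padding on each side of the 512-point STFT window. A quick sanity check of that formula against the compiled model (driver code and path are assumptions, not from this commit):

import numpy as np
import coremltools as ct

pre = ct.models.CompiledMLModel("1280ms/parakeet_eou_preprocessor.mlmodelc")

n = 20480                                          # 1280 ms of 16 kHz audio
out = pre.predict({
    "audio_signal": np.zeros((1, n), dtype=np.float32),
    "audio_length": np.array([n], dtype=np.int32),
})
assert int(out["mel_length"][0]) == n // 160 + 1   # 129 mel frames
print(out["mel"].shape)                            # (1, 128, 129): 128 mel bins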
1280ms/parakeet_eou_preprocessor.mlmodelc/weights/weight.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:009bba4fde82dc55db9b55d77cf3ba5f791ce366c49f079285fe25a3b6e2291d
+size 1184512
1280ms/streaming_encoder.mlmodelc/analytics/coremldata.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0a3c84022a9d2dc769d38cf8f45e93423e20734d092e3c16db11fbf6dca4004
+size 243
1280ms/streaming_encoder.mlmodelc/coremldata.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41ce3f96c3d6b3333796fc4ed82cb0c9b4ea99396b88f8eec3ba24394ba2bb78
+size 671
1280ms/streaming_encoder.mlmodelc/metadata.json
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"metadataOutputVersion" : "3.0",
|
| 4 |
+
"storagePrecision" : "Float16",
|
| 5 |
+
"outputSchema" : [
|
| 6 |
+
{
|
| 7 |
+
"hasShapeFlexibility" : "0",
|
| 8 |
+
"isOptional" : "0",
|
| 9 |
+
"dataType" : "Float32",
|
| 10 |
+
"formattedType" : "MultiArray (Float32 1 × 512 × 17)",
|
| 11 |
+
"shortDescription" : "",
|
| 12 |
+
"shape" : "[1, 512, 17]",
|
| 13 |
+
"name" : "encoded_output",
|
| 14 |
+
"type" : "MultiArray"
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"hasShapeFlexibility" : "0",
|
| 18 |
+
"isOptional" : "0",
|
| 19 |
+
"dataType" : "Int32",
|
| 20 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 21 |
+
"shortDescription" : "",
|
| 22 |
+
"shape" : "[1]",
|
| 23 |
+
"name" : "encoded_length",
|
| 24 |
+
"type" : "MultiArray"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"hasShapeFlexibility" : "0",
|
| 28 |
+
"isOptional" : "0",
|
| 29 |
+
"dataType" : "Float32",
|
| 30 |
+
"formattedType" : "MultiArray (Float32 1 × 128 × 16)",
|
| 31 |
+
"shortDescription" : "",
|
| 32 |
+
"shape" : "[1, 128, 16]",
|
| 33 |
+
"name" : "new_pre_cache",
|
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 17 × 1 × 70 × 512)",
+        "shortDescription" : "",
+        "shape" : "[17, 1, 70, 512]",
+        "name" : "new_cache_last_channel",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 17 × 1 × 512 × 8)",
+        "shortDescription" : "",
+        "shape" : "[17, 1, 512, 8]",
+        "name" : "new_cache_last_time",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "new_cache_last_channel_len",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 8,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios17.floor" : 3,
+      "Ios17.logicalAnd" : 3,
+      "Ios17.reshape" : 103,
+      "Ios16.softmax" : 17,
+      "Ios17.matmul" : 51,
+      "Ios17.transpose" : 157,
+      "Split" : 17,
+      "Ios17.expandDims" : 6,
+      "Select" : 51,
+      "Ios17.add" : 126,
+      "Tile" : 1,
+      "Ios17.sliceByIndex" : 106,
+      "Ios16.sigmoid" : 17,
+      "Pad" : 20,
+      "Ios17.logicalNot" : 2,
+      "Ios17.layerNorm" : 102,
+      "Ios17.less" : 1,
+      "Ios17.sub" : 1,
+      "Ios17.conv" : 56,
+      "Ios17.clip" : 2,
+      "Ios16.relu" : 3,
+      "Ios17.linear" : 137,
+      "Ios17.concat" : 52,
+      "Ios17.greaterEqual" : 1,
+      "Ios17.cast" : 14,
+      "Ios16.silu" : 51,
+      "Stack" : 2,
+      "Ios17.mul" : 72
+    },
+    "computePrecision" : "Mixed (Float16, Float32, Int32)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "14.0",
+      "tvOS" : "17.0",
+      "visionOS" : "1.0",
+      "watchOS" : "10.0",
+      "iOS" : "17.0",
+      "macCatalyst" : "17.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.4.0",
+      "com.github.apple.coremltools.version" : "8.3.0"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 1 × 128 × 129)",
+        "shortDescription" : "",
+        "shape" : "[1, 128, 129]",
+        "name" : "audio_signal",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "audio_length",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 1 × 128 × 16)",
+        "shortDescription" : "",
+        "shape" : "[1, 128, 16]",
+        "name" : "pre_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 17 × 1 × 70 × 512)",
+        "shortDescription" : "",
+        "shape" : "[17, 1, 70, 512]",
+        "name" : "cache_last_channel",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 17 × 1 × 512 × 8)",
+        "shortDescription" : "",
+        "shape" : "[17, 1, 512, 8]",
+        "name" : "cache_last_time",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "cache_last_channel_len",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "streaming_encoder",
+    "method" : "predict"
+  }
+]
|
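Note: the schema above defines the streaming contract — each call consumes one [1, 128, 129] mel chunk plus the three cache tensors, and returns updated caches to feed into the next call. Below is a minimal sketch of that loop using coremltools against the sibling streaming_encoder.mlpackage (predict() requires macOS); the `encode_chunk` helper and the zero-initialised caches are assumptions for illustration, not shipped code.

```python
import numpy as np
import coremltools as ct

# Load the mlpackage variant of the same streaming encoder (macOS only).
model = ct.models.MLModel("1280ms/streaming_encoder.mlpackage")

# Caches start at zero; shapes come from the inputSchema above.
state = {
    "cache_last_channel": np.zeros((17, 1, 70, 512), dtype=np.float32),
    "cache_last_time": np.zeros((17, 1, 512, 8), dtype=np.float32),
    "cache_last_channel_len": np.zeros((1,), dtype=np.int32),
}

def encode_chunk(mel_chunk: np.ndarray, pre_cache: np.ndarray) -> dict:
    """Run one [1, 128, 129] mel chunk through the encoder and roll the caches."""
    out = model.predict({
        "audio_signal": mel_chunk.astype(np.float32),
        "audio_length": np.array([mel_chunk.shape[-1]], dtype=np.int32),
        "pre_cache": pre_cache.astype(np.float32),
        **state,
    })
    # Carry the updated caches into the next call.
    state["cache_last_channel"] = out["new_cache_last_channel"]
    state["cache_last_time"] = out["new_cache_last_time"]
    state["cache_last_channel_len"] = out["new_cache_last_channel_len"].astype(np.int32)
    return out
```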
1280ms/streaming_encoder.mlmodelc/model.mil
ADDED
|
The diff for this file is too large to render.
|
1280ms/streaming_encoder.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c71acb590ceb2af449de5c7e3516e76057eaf4589d1f16edba774831db74b17
+size 213179200
|
1280ms/streaming_encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9181d223091aa592cb656d49346e640a38ec2426de5ec2d06edbc14e92b8968b
+size 508252
|
1280ms/streaming_encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c71acb590ceb2af449de5c7e3516e76057eaf4589d1f16edba774831db74b17
+size 213179200
|
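Note: the three entries above are Git LFS pointers — the ~213 MB weight payload and the 508 KB model spec live in LFS storage, not in the git history itself. A hedged sketch of materialising them with huggingface_hub (the repo id here is an assumption based on this repository's name):

```python
from huggingface_hub import snapshot_download

# Downloads the real files behind the LFS pointers; the pattern filter is optional.
local_dir = snapshot_download(
    repo_id="FluidInference/parakeet-realtime-eou-120m-coreml",  # assumed repo id
    allow_patterns=["1280ms/*"],  # just the 1280ms variant
)
print(local_dir)
```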
1280ms/streaming_encoder.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
+{
+  "fileFormatVersion": "1.0.0",
+  "itemInfoEntries": {
+    "468B5E19-6BA9-478C-8D2A-23953ACBD5E3": {
+      "author": "com.apple.CoreML",
+      "description": "CoreML Model Specification",
+      "name": "model.mlmodel",
+      "path": "com.apple.CoreML/model.mlmodel"
+    },
+    "F22DE286-FE1A-4BBD-A7ED-B0130595DAF3": {
+      "author": "com.apple.CoreML",
+      "description": "CoreML Model Weights",
+      "name": "weights",
+      "path": "com.apple.CoreML/weights"
+    }
+  },
+  "rootModelIdentifier": "468B5E19-6BA9-478C-8D2A-23953ACBD5E3"
+}
|
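Note: this manifest is how CoreML locates the pieces inside an .mlpackage — rootModelIdentifier names the spec entry, whose path is relative to the package root. A small sketch (the reader helper is assumed, not part of the repo) that resolves it:

```python
import json
from pathlib import Path

pkg = Path("1280ms/streaming_encoder.mlpackage")
manifest = json.loads((pkg / "Manifest.json").read_text())

# Follow rootModelIdentifier to the model spec entry, then to its on-disk path.
root_entry = manifest["itemInfoEntries"][manifest["rootModelIdentifier"]]
print(pkg / root_entry["path"])  # -> .../com.apple.CoreML/model.mlmodel
```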
1280ms/vocab.json
ADDED
|
@@ -0,0 +1,1028 @@
| 1 |
+
{
|
| 2 |
+
"0": "<unk>",
|
| 3 |
+
"1": "▁t",
|
| 4 |
+
"2": "▁th",
|
| 5 |
+
"3": "▁a",
|
| 6 |
+
"4": "▁i",
|
| 7 |
+
"5": "▁the",
|
| 8 |
+
"6": "▁s",
|
| 9 |
+
"7": "re",
|
| 10 |
+
"8": "▁w",
|
| 11 |
+
"9": "▁o",
|
| 12 |
+
"10": "in",
|
| 13 |
+
"11": "at",
|
| 14 |
+
"12": "er",
|
| 15 |
+
"13": "nd",
|
| 16 |
+
"14": "ou",
|
| 17 |
+
"15": "▁c",
|
| 18 |
+
"16": "▁b",
|
| 19 |
+
"17": "▁h",
|
| 20 |
+
"18": "en",
|
| 21 |
+
"19": "on",
|
| 22 |
+
"20": "▁m",
|
| 23 |
+
"21": "▁f",
|
| 24 |
+
"22": "ing",
|
| 25 |
+
"23": "▁p",
|
| 26 |
+
"24": "▁to",
|
| 27 |
+
"25": "▁and",
|
| 28 |
+
"26": "▁d",
|
| 29 |
+
"27": "an",
|
| 30 |
+
"28": "or",
|
| 31 |
+
"29": "es",
|
| 32 |
+
"30": "▁y",
|
| 33 |
+
"31": "▁l",
|
| 34 |
+
"32": "▁of",
|
| 35 |
+
"33": "ll",
|
| 36 |
+
"34": "▁in",
|
| 37 |
+
"35": "ed",
|
| 38 |
+
"36": "it",
|
| 39 |
+
"37": "▁g",
|
| 40 |
+
"38": "is",
|
| 41 |
+
"39": "▁you",
|
| 42 |
+
"40": "▁n",
|
| 43 |
+
"41": "ar",
|
| 44 |
+
"42": "om",
|
| 45 |
+
"43": "as",
|
| 46 |
+
"44": "ve",
|
| 47 |
+
"45": "▁e",
|
| 48 |
+
"46": "ic",
|
| 49 |
+
"47": "▁it",
|
| 50 |
+
"48": "al",
|
| 51 |
+
"49": "us",
|
| 52 |
+
"50": "▁wh",
|
| 53 |
+
"51": "▁we",
|
| 54 |
+
"52": "▁be",
|
| 55 |
+
"53": "ion",
|
| 56 |
+
"54": "ow",
|
| 57 |
+
"55": "le",
|
| 58 |
+
"56": "▁is",
|
| 59 |
+
"57": "et",
|
| 60 |
+
"58": "ent",
|
| 61 |
+
"59": "ot",
|
| 62 |
+
"60": "ut",
|
| 63 |
+
"61": "▁re",
|
| 64 |
+
"62": "▁on",
|
| 65 |
+
"63": "ay",
|
| 66 |
+
"64": "▁ha",
|
| 67 |
+
"65": "ig",
|
| 68 |
+
"66": "▁so",
|
| 69 |
+
"67": "ct",
|
| 70 |
+
"68": "▁he",
|
| 71 |
+
"69": "▁for",
|
| 72 |
+
"70": "ver",
|
| 73 |
+
"71": "ke",
|
| 74 |
+
"72": "ro",
|
| 75 |
+
"73": "▁st",
|
| 76 |
+
"74": "id",
|
| 77 |
+
"75": "▁go",
|
| 78 |
+
"76": "all",
|
| 79 |
+
"77": "se",
|
| 80 |
+
"78": "ly",
|
| 81 |
+
"79": "▁u",
|
| 82 |
+
"80": "ch",
|
| 83 |
+
"81": "st",
|
| 84 |
+
"82": "ld",
|
| 85 |
+
"83": "▁k",
|
| 86 |
+
"84": "ce",
|
| 87 |
+
"85": "ur",
|
| 88 |
+
"86": "▁li",
|
| 89 |
+
"87": "am",
|
| 90 |
+
"88": "▁r",
|
| 91 |
+
"89": "ht",
|
| 92 |
+
"90": "▁j",
|
| 93 |
+
"91": "ith",
|
| 94 |
+
"92": "▁se",
|
| 95 |
+
"93": "ir",
|
| 96 |
+
"94": "▁as",
|
| 97 |
+
"95": "▁an",
|
| 98 |
+
"96": "im",
|
| 99 |
+
"97": "▁do",
|
| 100 |
+
"98": "ad",
|
| 101 |
+
"99": "▁was",
|
| 102 |
+
"100": "ight",
|
| 103 |
+
"101": "th",
|
| 104 |
+
"102": "▁are",
|
| 105 |
+
"103": "▁but",
|
| 106 |
+
"104": "▁sh",
|
| 107 |
+
"105": "ust",
|
| 108 |
+
"106": "ally",
|
| 109 |
+
"107": "▁not",
|
| 110 |
+
"108": "▁or",
|
| 111 |
+
"109": "▁com",
|
| 112 |
+
"110": "▁can",
|
| 113 |
+
"111": "▁me",
|
| 114 |
+
"112": "op",
|
| 115 |
+
"113": "▁mo",
|
| 116 |
+
"114": "▁at",
|
| 117 |
+
"115": "ill",
|
| 118 |
+
"116": "▁ch",
|
| 119 |
+
"117": "▁ne",
|
| 120 |
+
"118": "ant",
|
| 121 |
+
"119": "▁de",
|
| 122 |
+
"120": "▁kn",
|
| 123 |
+
"121": "▁one",
|
| 124 |
+
"122": "il",
|
| 125 |
+
"123": "ol",
|
| 126 |
+
"124": "▁con",
|
| 127 |
+
"125": "ter",
|
| 128 |
+
"126": "▁ab",
|
| 129 |
+
"127": "▁fr",
|
| 130 |
+
"128": "ere",
|
| 131 |
+
"129": "ck",
|
| 132 |
+
"130": "▁al",
|
| 133 |
+
"131": "▁all",
|
| 134 |
+
"132": "qu",
|
| 135 |
+
"133": "▁pro",
|
| 136 |
+
"134": "▁som",
|
| 137 |
+
"135": "ould",
|
| 138 |
+
"136": "▁tw",
|
| 139 |
+
"137": "ul",
|
| 140 |
+
"138": "ra",
|
| 141 |
+
"139": "od",
|
| 142 |
+
"140": "ers",
|
| 143 |
+
"141": "▁su",
|
| 144 |
+
"142": "ive",
|
| 145 |
+
"143": "▁v",
|
| 146 |
+
"144": "use",
|
| 147 |
+
"145": "ate",
|
| 148 |
+
"146": "ge",
|
| 149 |
+
"147": "if",
|
| 150 |
+
"148": "▁ex",
|
| 151 |
+
"149": "ess",
|
| 152 |
+
"150": "pp",
|
| 153 |
+
"151": "▁lo",
|
| 154 |
+
"152": "out",
|
| 155 |
+
"153": "▁if",
|
| 156 |
+
"154": "est",
|
| 157 |
+
"155": "ain",
|
| 158 |
+
"156": "ist",
|
| 159 |
+
"157": "and",
|
| 160 |
+
"158": "ea",
|
| 161 |
+
"159": "very",
|
| 162 |
+
"160": "art",
|
| 163 |
+
"161": "▁wor",
|
| 164 |
+
"162": "▁my",
|
| 165 |
+
"163": "ab",
|
| 166 |
+
"164": "ment",
|
| 167 |
+
"165": "▁bec",
|
| 168 |
+
"166": "un",
|
| 169 |
+
"167": "ity",
|
| 170 |
+
"168": "ri",
|
| 171 |
+
"169": "pe",
|
| 172 |
+
"170": "ions",
|
| 173 |
+
"171": "▁by",
|
| 174 |
+
"172": "ok",
|
| 175 |
+
"173": "our",
|
| 176 |
+
"174": "ort",
|
| 177 |
+
"175": "ind",
|
| 178 |
+
"176": "ink",
|
| 179 |
+
"177": "nt",
|
| 180 |
+
"178": "▁up",
|
| 181 |
+
"179": "um",
|
| 182 |
+
"180": "▁don",
|
| 183 |
+
"181": "▁get",
|
| 184 |
+
"182": "red",
|
| 185 |
+
"183": "▁out",
|
| 186 |
+
"184": "el",
|
| 187 |
+
"185": "ause",
|
| 188 |
+
"186": "res",
|
| 189 |
+
"187": "▁ma",
|
| 190 |
+
"188": "ich",
|
| 191 |
+
"189": "▁us",
|
| 192 |
+
"190": "rou",
|
| 193 |
+
"191": "▁int",
|
| 194 |
+
"192": "em",
|
| 195 |
+
"193": "os",
|
| 196 |
+
"194": "ies",
|
| 197 |
+
"195": "ie",
|
| 198 |
+
"196": "▁pl",
|
| 199 |
+
"197": "▁tr",
|
| 200 |
+
"198": "ven",
|
| 201 |
+
"199": "ous",
|
| 202 |
+
"200": "▁le",
|
| 203 |
+
"201": "▁two",
|
| 204 |
+
"202": "ard",
|
| 205 |
+
"203": "ine",
|
| 206 |
+
"204": "▁co",
|
| 207 |
+
"205": "een",
|
| 208 |
+
"206": "▁now",
|
| 209 |
+
"207": "ty",
|
| 210 |
+
"208": "her",
|
| 211 |
+
"209": "ack",
|
| 212 |
+
"210": "▁pe",
|
| 213 |
+
"211": "ame",
|
| 214 |
+
"212": "▁how",
|
| 215 |
+
"213": "▁who",
|
| 216 |
+
"214": "▁see",
|
| 217 |
+
"215": "▁tim",
|
| 218 |
+
"216": "ect",
|
| 219 |
+
"217": "ast",
|
| 220 |
+
"218": "▁our",
|
| 221 |
+
"219": "ci",
|
| 222 |
+
"220": "ree",
|
| 223 |
+
"221": "ople",
|
| 224 |
+
"222": "gh",
|
| 225 |
+
"223": "▁no",
|
| 226 |
+
"224": "▁had",
|
| 227 |
+
"225": "▁man",
|
| 228 |
+
"226": "▁qu",
|
| 229 |
+
"227": "▁en",
|
| 230 |
+
"228": "ide",
|
| 231 |
+
"229": "ure",
|
| 232 |
+
"230": "ud",
|
| 233 |
+
"231": "so",
|
| 234 |
+
"232": "▁his",
|
| 235 |
+
"233": "▁sa",
|
| 236 |
+
"234": "▁sp",
|
| 237 |
+
"235": "▁say",
|
| 238 |
+
"236": "ose",
|
| 239 |
+
"237": "ther",
|
| 240 |
+
"238": "▁act",
|
| 241 |
+
"239": "▁ta",
|
| 242 |
+
"240": "▁cl",
|
| 243 |
+
"241": "ings",
|
| 244 |
+
"242": "pt",
|
| 245 |
+
"243": "king",
|
| 246 |
+
"244": "▁any",
|
| 247 |
+
"245": "▁has",
|
| 248 |
+
"246": "▁un",
|
| 249 |
+
"247": "iv",
|
| 250 |
+
"248": "▁im",
|
| 251 |
+
"249": "▁ag",
|
| 252 |
+
"250": "▁te",
|
| 253 |
+
"251": "▁fe",
|
| 254 |
+
"252": "one",
|
| 255 |
+
"253": "per",
|
| 256 |
+
"254": "ong",
|
| 257 |
+
"255": "▁po",
|
| 258 |
+
"256": "▁ad",
|
| 259 |
+
"257": "ff",
|
| 260 |
+
"258": "ore",
|
| 261 |
+
"259": "itt",
|
| 262 |
+
"260": "ans",
|
| 263 |
+
"261": "iz",
|
| 264 |
+
"262": "eah",
|
| 265 |
+
"263": "reat",
|
| 266 |
+
"264": "act",
|
| 267 |
+
"265": "own",
|
| 268 |
+
"266": "hing",
|
| 269 |
+
"267": "enty",
|
| 270 |
+
"268": "age",
|
| 271 |
+
"269": "ber",
|
| 272 |
+
"270": "ice",
|
| 273 |
+
"271": "▁am",
|
| 274 |
+
"272": "ple",
|
| 275 |
+
"273": "are",
|
| 276 |
+
"274": "▁per",
|
| 277 |
+
"275": "und",
|
| 278 |
+
"276": "ite",
|
| 279 |
+
"277": "ix",
|
| 280 |
+
"278": "pl",
|
| 281 |
+
"279": "▁way",
|
| 282 |
+
"280": "▁did",
|
| 283 |
+
"281": "▁pr",
|
| 284 |
+
"282": "▁got",
|
| 285 |
+
"283": "ars",
|
| 286 |
+
"284": "▁she",
|
| 287 |
+
"285": "▁let",
|
| 288 |
+
"286": "ag",
|
| 289 |
+
"287": "▁ac",
|
| 290 |
+
"288": "int",
|
| 291 |
+
"289": "▁ar",
|
| 292 |
+
"290": "ry",
|
| 293 |
+
"291": "ign",
|
| 294 |
+
"292": "ish",
|
| 295 |
+
"293": "▁fir",
|
| 296 |
+
"294": "ace",
|
| 297 |
+
"295": "ble",
|
| 298 |
+
"296": "og",
|
| 299 |
+
"297": "ue",
|
| 300 |
+
"298": "▁ye",
|
| 301 |
+
"299": "ap",
|
| 302 |
+
"300": "iff",
|
| 303 |
+
"301": "▁ro",
|
| 304 |
+
"302": "▁her",
|
| 305 |
+
"303": "nder",
|
| 306 |
+
"304": "▁ok",
|
| 307 |
+
"305": "▁res",
|
| 308 |
+
"306": "▁gu",
|
| 309 |
+
"307": "ence",
|
| 310 |
+
"308": "▁may",
|
| 311 |
+
"309": "ated",
|
| 312 |
+
"310": "ip",
|
| 313 |
+
"311": "▁bo",
|
| 314 |
+
"312": "▁him",
|
| 315 |
+
"313": "way",
|
| 316 |
+
"314": "ac",
|
| 317 |
+
"315": "ical",
|
| 318 |
+
"316": "ass",
|
| 319 |
+
"317": "ase",
|
| 320 |
+
"318": "▁dis",
|
| 321 |
+
"319": "able",
|
| 322 |
+
"320": "ick",
|
| 323 |
+
"321": "▁app",
|
| 324 |
+
"322": "ance",
|
| 325 |
+
"323": "▁pre",
|
| 326 |
+
"324": "▁six",
|
| 327 |
+
"325": "▁off",
|
| 328 |
+
"326": "▁new",
|
| 329 |
+
"327": "ia",
|
| 330 |
+
"328": "orm",
|
| 331 |
+
"329": "ank",
|
| 332 |
+
"330": "▁lot",
|
| 333 |
+
"331": "ach",
|
| 334 |
+
"332": "▁fo",
|
| 335 |
+
"333": "inet",
|
| 336 |
+
"334": "ire",
|
| 337 |
+
"335": "ary",
|
| 338 |
+
"336": "ult",
|
| 339 |
+
"337": "▁tal",
|
| 340 |
+
"338": "▁mu",
|
| 341 |
+
"339": "▁bl",
|
| 342 |
+
"340": "ount",
|
| 343 |
+
"341": "sel",
|
| 344 |
+
"342": "vel",
|
| 345 |
+
"343": "▁br",
|
| 346 |
+
"344": "▁imp",
|
| 347 |
+
"345": "ep",
|
| 348 |
+
"346": "cess",
|
| 349 |
+
"347": "ord",
|
| 350 |
+
"348": "▁sc",
|
| 351 |
+
"349": "▁inc",
|
| 352 |
+
"350": "ound",
|
| 353 |
+
"351": "ang",
|
| 354 |
+
"352": "be",
|
| 355 |
+
"353": "ress",
|
| 356 |
+
"354": "uct",
|
| 357 |
+
"355": "▁ind",
|
| 358 |
+
"356": "▁af",
|
| 359 |
+
"357": "ving",
|
| 360 |
+
"358": "▁oh",
|
| 361 |
+
"359": "▁bet",
|
| 362 |
+
"360": "▁use",
|
| 363 |
+
"361": "ome",
|
| 364 |
+
"362": "ens",
|
| 365 |
+
"363": "ys",
|
| 366 |
+
"364": "▁bu",
|
| 367 |
+
"365": "co",
|
| 368 |
+
"366": "ory",
|
| 369 |
+
"367": "ater",
|
| 370 |
+
"368": "ild",
|
| 371 |
+
"369": "ght",
|
| 372 |
+
"370": "ial",
|
| 373 |
+
"371": "▁day",
|
| 374 |
+
"372": "ning",
|
| 375 |
+
"373": "na",
|
| 376 |
+
"374": "ile",
|
| 377 |
+
"375": "▁spe",
|
| 378 |
+
"376": "▁mar",
|
| 379 |
+
"377": "ody",
|
| 380 |
+
"378": "ough",
|
| 381 |
+
"379": "ade",
|
| 382 |
+
"380": "vers",
|
| 383 |
+
"381": "xt",
|
| 384 |
+
"382": "▁fl",
|
| 385 |
+
"383": "▁ke",
|
| 386 |
+
"384": "ian",
|
| 387 |
+
"385": "▁sy",
|
| 388 |
+
"386": "▁put",
|
| 389 |
+
"387": "fore",
|
| 390 |
+
"388": "ub",
|
| 391 |
+
"389": "▁ph",
|
| 392 |
+
"390": "fe",
|
| 393 |
+
"391": "▁em",
|
| 394 |
+
"392": "▁ser",
|
| 395 |
+
"393": "form",
|
| 396 |
+
"394": "ting",
|
| 397 |
+
"395": "te",
|
| 398 |
+
"396": "av",
|
| 399 |
+
"397": "ious",
|
| 400 |
+
"398": "▁rec",
|
| 401 |
+
"399": "ks",
|
| 402 |
+
"400": "▁gr",
|
| 403 |
+
"401": "ces",
|
| 404 |
+
"402": "wn",
|
| 405 |
+
"403": "ors",
|
| 406 |
+
"404": "▁jo",
|
| 407 |
+
"405": "ents",
|
| 408 |
+
"406": "▁des",
|
| 409 |
+
"407": "▁try",
|
| 410 |
+
"408": "▁equ",
|
| 411 |
+
"409": "▁z",
|
| 412 |
+
"410": "▁rem",
|
| 413 |
+
"411": "▁str",
|
| 414 |
+
"412": "self",
|
| 415 |
+
"413": "▁bit",
|
| 416 |
+
"414": "ph",
|
| 417 |
+
"415": "ved",
|
| 418 |
+
"416": "▁why",
|
| 419 |
+
"417": "▁bas",
|
| 420 |
+
"418": "▁hel",
|
| 421 |
+
"419": "▁rel",
|
| 422 |
+
"420": "ath",
|
| 423 |
+
"421": "ject",
|
| 424 |
+
"422": "ail",
|
| 425 |
+
"423": "▁la",
|
| 426 |
+
"424": "ual",
|
| 427 |
+
"425": "▁god",
|
| 428 |
+
"426": "▁nat",
|
| 429 |
+
"427": "erm",
|
| 430 |
+
"428": "day",
|
| 431 |
+
"429": "▁id",
|
| 432 |
+
"430": "ft",
|
| 433 |
+
"431": "▁wr",
|
| 434 |
+
"432": "▁min",
|
| 435 |
+
"433": "ates",
|
| 436 |
+
"434": "▁gen",
|
| 437 |
+
"435": "tain",
|
| 438 |
+
"436": "▁ob",
|
| 439 |
+
"437": "ull",
|
| 440 |
+
"438": "ict",
|
| 441 |
+
"439": "▁tra",
|
| 442 |
+
"440": "▁end",
|
| 443 |
+
"441": "▁hig",
|
| 444 |
+
"442": "▁fif",
|
| 445 |
+
"443": "oth",
|
| 446 |
+
"444": "tern",
|
| 447 |
+
"445": "▁its",
|
| 448 |
+
"446": "vent",
|
| 449 |
+
"447": "▁sm",
|
| 450 |
+
"448": "ons",
|
| 451 |
+
"449": "▁add",
|
| 452 |
+
"450": "iss",
|
| 453 |
+
"451": "▁bel",
|
| 454 |
+
"452": "ful",
|
| 455 |
+
"453": "get",
|
| 456 |
+
"454": "▁ele",
|
| 457 |
+
"455": "▁rep",
|
| 458 |
+
"456": "ak",
|
| 459 |
+
"457": "▁ho",
|
| 460 |
+
"458": "▁pos",
|
| 461 |
+
"459": "▁num",
|
| 462 |
+
"460": "ange",
|
| 463 |
+
"461": "ves",
|
| 464 |
+
"462": "ific",
|
| 465 |
+
"463": "urn",
|
| 466 |
+
"464": "ise",
|
| 467 |
+
"465": "▁cr",
|
| 468 |
+
"466": "▁um",
|
| 469 |
+
"467": "ward",
|
| 470 |
+
"468": "▁reg",
|
| 471 |
+
"469": "ady",
|
| 472 |
+
"470": "ower",
|
| 473 |
+
"471": "uc",
|
| 474 |
+
"472": "▁dec",
|
| 475 |
+
"473": "lic",
|
| 476 |
+
"474": "▁set",
|
| 477 |
+
"475": "▁gon",
|
| 478 |
+
"476": "▁op",
|
| 479 |
+
"477": "▁ear",
|
| 480 |
+
"478": "▁sub",
|
| 481 |
+
"479": "▁sl",
|
| 482 |
+
"480": "les",
|
| 483 |
+
"481": "stem",
|
| 484 |
+
"482": "cial",
|
| 485 |
+
"483": "olog",
|
| 486 |
+
"484": "atch",
|
| 487 |
+
"485": "ily",
|
| 488 |
+
"486": "body",
|
| 489 |
+
"487": "nds",
|
| 490 |
+
"488": "ular",
|
| 491 |
+
"489": "ren",
|
| 492 |
+
"490": "▁own",
|
| 493 |
+
"491": "▁too",
|
| 494 |
+
"492": "cent",
|
| 495 |
+
"493": "ible",
|
| 496 |
+
"494": "pect",
|
| 497 |
+
"495": "ered",
|
| 498 |
+
"496": "ways",
|
| 499 |
+
"497": "teen",
|
| 500 |
+
"498": "▁uh",
|
| 501 |
+
"499": "▁big",
|
| 502 |
+
"500": "▁mod",
|
| 503 |
+
"501": "▁att",
|
| 504 |
+
"502": "▁car",
|
| 505 |
+
"503": "gr",
|
| 506 |
+
"504": "▁acc",
|
| 507 |
+
"505": "ied",
|
| 508 |
+
"506": "mun",
|
| 509 |
+
"507": "ib",
|
| 510 |
+
"508": "▁mon",
|
| 511 |
+
"509": "▁sch",
|
| 512 |
+
"510": "▁pol",
|
| 513 |
+
"511": "▁dat",
|
| 514 |
+
"512": "▁fin",
|
| 515 |
+
"513": "▁sim",
|
| 516 |
+
"514": "▁inv",
|
| 517 |
+
"515": "▁def",
|
| 518 |
+
"516": "ked",
|
| 519 |
+
"517": "▁ent",
|
| 520 |
+
"518": "▁yes",
|
| 521 |
+
"519": "ows",
|
| 522 |
+
"520": "ics",
|
| 523 |
+
"521": "ited",
|
| 524 |
+
"522": "ute",
|
| 525 |
+
"523": "ism",
|
| 526 |
+
"524": "ps",
|
| 527 |
+
"525": "▁ed",
|
| 528 |
+
"526": "▁el",
|
| 529 |
+
"527": "ably",
|
| 530 |
+
"528": "ppen",
|
| 531 |
+
"529": "als",
|
| 532 |
+
"530": "▁ten",
|
| 533 |
+
"531": "ract",
|
| 534 |
+
"532": "ss",
|
| 535 |
+
"533": "▁ass",
|
| 536 |
+
"534": "▁met",
|
| 537 |
+
"535": "gan",
|
| 538 |
+
"536": "▁eng",
|
| 539 |
+
"537": "▁stu",
|
| 540 |
+
"538": "ween",
|
| 541 |
+
"539": "arch",
|
| 542 |
+
"540": "▁gl",
|
| 543 |
+
"541": "▁cor",
|
| 544 |
+
"542": "▁dr",
|
| 545 |
+
"543": "vern",
|
| 546 |
+
"544": "▁ty",
|
| 547 |
+
"545": "▁run",
|
| 548 |
+
"546": "hip",
|
| 549 |
+
"547": "cus",
|
| 550 |
+
"548": "cond",
|
| 551 |
+
"549": "▁ins",
|
| 552 |
+
"550": "irty",
|
| 553 |
+
"551": "▁pub",
|
| 554 |
+
"552": "lud",
|
| 555 |
+
"553": "llow",
|
| 556 |
+
"554": "▁cou",
|
| 557 |
+
"555": "ew",
|
| 558 |
+
"556": "iew",
|
| 559 |
+
"557": "▁sur",
|
| 560 |
+
"558": "ero",
|
| 561 |
+
"559": "ood",
|
| 562 |
+
"560": "ness",
|
| 563 |
+
"561": "▁fun",
|
| 564 |
+
"562": "▁eff",
|
| 565 |
+
"563": "cept",
|
| 566 |
+
"564": "▁ca",
|
| 567 |
+
"565": "▁exp",
|
| 568 |
+
"566": "duct",
|
| 569 |
+
"567": "▁sw",
|
| 570 |
+
"568": "ize",
|
| 571 |
+
"569": "ope",
|
| 572 |
+
"570": "▁par",
|
| 573 |
+
"571": "kes",
|
| 574 |
+
"572": "cy",
|
| 575 |
+
"573": "▁ev",
|
| 576 |
+
"574": "▁ref",
|
| 577 |
+
"575": "ell",
|
| 578 |
+
"576": "▁bus",
|
| 579 |
+
"577": "ug",
|
| 580 |
+
"578": "rib",
|
| 581 |
+
"579": "▁cur",
|
| 582 |
+
"580": "mo",
|
| 583 |
+
"581": "ock",
|
| 584 |
+
"582": "ures",
|
| 585 |
+
"583": "air",
|
| 586 |
+
"584": "▁war",
|
| 587 |
+
"585": "str",
|
| 588 |
+
"586": "▁med",
|
| 589 |
+
"587": "▁wa",
|
| 590 |
+
"588": "▁val",
|
| 591 |
+
"589": "▁sin",
|
| 592 |
+
"590": "blem",
|
| 593 |
+
"591": "▁fam",
|
| 594 |
+
"592": "li",
|
| 595 |
+
"593": "▁far",
|
| 596 |
+
"594": "▁cle",
|
| 597 |
+
"595": "▁col",
|
| 598 |
+
"596": "mon",
|
| 599 |
+
"597": "▁gra",
|
| 600 |
+
"598": "led",
|
| 601 |
+
"599": "ense",
|
| 602 |
+
"600": "tin",
|
| 603 |
+
"601": "ues",
|
| 604 |
+
"602": "its",
|
| 605 |
+
"603": "▁mem",
|
| 606 |
+
"604": "▁inf",
|
| 607 |
+
"605": "▁eas",
|
| 608 |
+
"606": "ideo",
|
| 609 |
+
"607": "▁top",
|
| 610 |
+
"608": "io",
|
| 611 |
+
"609": "pan",
|
| 612 |
+
"610": "▁hum",
|
| 613 |
+
"611": "▁old",
|
| 614 |
+
"612": "ead",
|
| 615 |
+
"613": "▁ord",
|
| 616 |
+
"614": "ric",
|
| 617 |
+
"615": "ants",
|
| 618 |
+
"616": "oy",
|
| 619 |
+
"617": "esn",
|
| 620 |
+
"618": "uck",
|
| 621 |
+
"619": "ason",
|
| 622 |
+
"620": "ced",
|
| 623 |
+
"621": "ool",
|
| 624 |
+
"622": "rat",
|
| 625 |
+
"623": "ouse",
|
| 626 |
+
"624": "▁lar",
|
| 627 |
+
"625": "▁art",
|
| 628 |
+
"626": "▁wee",
|
| 629 |
+
"627": "▁cer",
|
| 630 |
+
"628": "ized",
|
| 631 |
+
"629": "▁mat",
|
| 632 |
+
"630": "con",
|
| 633 |
+
"631": "erg",
|
| 634 |
+
"632": "land",
|
| 635 |
+
"633": "ines",
|
| 636 |
+
"634": "▁chr",
|
| 637 |
+
"635": "▁aut",
|
| 638 |
+
"636": "▁lea",
|
| 639 |
+
"637": "▁sou",
|
| 640 |
+
"638": "oney",
|
| 641 |
+
"639": "tty",
|
| 642 |
+
"640": "▁ple",
|
| 643 |
+
"641": "ulat",
|
| 644 |
+
"642": "oks",
|
| 645 |
+
"643": "▁few",
|
| 646 |
+
"644": "▁sol",
|
| 647 |
+
"645": "▁che",
|
| 648 |
+
"646": "chn",
|
| 649 |
+
"647": "ird",
|
| 650 |
+
"648": "▁bre",
|
| 651 |
+
"649": "▁dur",
|
| 652 |
+
"650": "▁wom",
|
| 653 |
+
"651": "me",
|
| 654 |
+
"652": "izat",
|
| 655 |
+
"653": "eric",
|
| 656 |
+
"654": "ote",
|
| 657 |
+
"655": "▁uni",
|
| 658 |
+
"656": "eren",
|
| 659 |
+
"657": "arn",
|
| 660 |
+
"658": "ross",
|
| 661 |
+
"659": "ices",
|
| 662 |
+
"660": "ten",
|
| 663 |
+
"661": "eral",
|
| 664 |
+
"662": "ever",
|
| 665 |
+
"663": "ieve",
|
| 666 |
+
"664": "lish",
|
| 667 |
+
"665": "ash",
|
| 668 |
+
"666": "▁opp",
|
| 669 |
+
"667": "alth",
|
| 670 |
+
"668": "ger",
|
| 671 |
+
"669": "▁sk",
|
| 672 |
+
"670": "▁red",
|
| 673 |
+
"671": "peri",
|
| 674 |
+
"672": "▁det",
|
| 675 |
+
"673": "▁ext",
|
| 676 |
+
"674": "ner",
|
| 677 |
+
"675": "ah",
|
| 678 |
+
"676": "▁var",
|
| 679 |
+
"677": "▁loc",
|
| 680 |
+
"678": "gram",
|
| 681 |
+
"679": "ists",
|
| 682 |
+
"680": "ives",
|
| 683 |
+
"681": "▁es",
|
| 684 |
+
"682": "▁nor",
|
| 685 |
+
"683": "tro",
|
| 686 |
+
"684": "ale",
|
| 687 |
+
"685": "▁iss",
|
| 688 |
+
"686": "▁pri",
|
| 689 |
+
"687": "gin",
|
| 690 |
+
"688": "az",
|
| 691 |
+
"689": "oc",
|
| 692 |
+
"690": "▁pop",
|
| 693 |
+
"691": "ern",
|
| 694 |
+
"692": "▁sit",
|
| 695 |
+
"693": "ket",
|
| 696 |
+
"694": "▁pa",
|
| 697 |
+
"695": "▁law",
|
| 698 |
+
"696": "ages",
|
| 699 |
+
"697": "br",
|
| 700 |
+
"698": "▁cam",
|
| 701 |
+
"699": "▁mom",
|
| 702 |
+
"700": "osed",
|
| 703 |
+
"701": "▁bro",
|
| 704 |
+
"702": "ne",
|
| 705 |
+
"703": "bs",
|
| 706 |
+
"704": "▁cre",
|
| 707 |
+
"705": "erat",
|
| 708 |
+
"706": "▁sec",
|
| 709 |
+
"707": "▁cap",
|
| 710 |
+
"708": "▁vis",
|
| 711 |
+
"709": "▁pat",
|
| 712 |
+
"710": "ield",
|
| 713 |
+
"711": "iet",
|
| 714 |
+
"712": "▁tri",
|
| 715 |
+
"713": "up",
|
| 716 |
+
"714": "▁bra",
|
| 717 |
+
"715": "ts",
|
| 718 |
+
"716": "▁mot",
|
| 719 |
+
"717": "▁unt",
|
| 720 |
+
"718": "put",
|
| 721 |
+
"719": "bo",
|
| 722 |
+
"720": "ork",
|
| 723 |
+
"721": "mer",
|
| 724 |
+
"722": "ital",
|
| 725 |
+
"723": "▁air",
|
| 726 |
+
"724": "ined",
|
| 727 |
+
"725": "▁beh",
|
| 728 |
+
"726": "▁adv",
|
| 729 |
+
"727": "▁ret",
|
| 730 |
+
"728": "imes",
|
| 731 |
+
"729": "▁tea",
|
| 732 |
+
"730": "ural",
|
| 733 |
+
"731": "sid",
|
| 734 |
+
"732": "ters",
|
| 735 |
+
"733": "▁pur",
|
| 736 |
+
"734": "▁sci",
|
| 737 |
+
"735": "bers",
|
| 738 |
+
"736": "ient",
|
| 739 |
+
"737": "ier",
|
| 740 |
+
"738": "cc",
|
| 741 |
+
"739": "sw",
|
| 742 |
+
"740": "▁av",
|
| 743 |
+
"741": "reen",
|
| 744 |
+
"742": "ode",
|
| 745 |
+
"743": "ont",
|
| 746 |
+
"744": "▁dra",
|
| 747 |
+
"745": "ann",
|
| 748 |
+
"746": "nect",
|
| 749 |
+
"747": "▁x",
|
| 750 |
+
"748": "▁eu",
|
| 751 |
+
"749": "ton",
|
| 752 |
+
"750": "inat",
|
| 753 |
+
"751": "ene",
|
| 754 |
+
"752": "ared",
|
| 755 |
+
"753": "els",
|
| 756 |
+
"754": "▁mor",
|
| 757 |
+
"755": "▁rat",
|
| 758 |
+
"756": "cri",
|
| 759 |
+
"757": "▁men",
|
| 760 |
+
"758": "▁ah",
|
| 761 |
+
"759": "ames",
|
| 762 |
+
"760": "▁arm",
|
| 763 |
+
"761": "eak",
|
| 764 |
+
"762": "▁pay",
|
| 765 |
+
"763": "▁hal",
|
| 766 |
+
"764": "ins",
|
| 767 |
+
"765": "ilit",
|
| 768 |
+
"766": "stit",
|
| 769 |
+
"767": "▁ra",
|
| 770 |
+
"768": "▁leg",
|
| 771 |
+
"769": "cl",
|
| 772 |
+
"770": "pr",
|
| 773 |
+
"771": "▁wal",
|
| 774 |
+
"772": "▁bad",
|
| 775 |
+
"773": "▁ge",
|
| 776 |
+
"774": "roup",
|
| 777 |
+
"775": "▁mus",
|
| 778 |
+
"776": "man",
|
| 779 |
+
"777": "▁gi",
|
| 780 |
+
"778": "eds",
|
| 781 |
+
"779": "▁aw",
|
| 782 |
+
"780": "po",
|
| 783 |
+
"781": "ark",
|
| 784 |
+
"782": "row",
|
| 785 |
+
"783": "▁dep",
|
| 786 |
+
"784": "ully",
|
| 787 |
+
"785": "ral",
|
| 788 |
+
"786": "lect",
|
| 789 |
+
"787": "pend",
|
| 790 |
+
"788": "▁sev",
|
| 791 |
+
"789": "ime",
|
| 792 |
+
"790": "gest",
|
| 793 |
+
"791": "here",
|
| 794 |
+
"792": "▁yet",
|
| 795 |
+
"793": "ted",
|
| 796 |
+
"794": "▁rev",
|
| 797 |
+
"795": "ds",
|
| 798 |
+
"796": "▁ask",
|
| 799 |
+
"797": "less",
|
| 800 |
+
"798": "▁di",
|
| 801 |
+
"799": "ets",
|
| 802 |
+
"800": "line",
|
| 803 |
+
"801": "▁aff",
|
| 804 |
+
"802": "ired",
|
| 805 |
+
"803": "▁est",
|
| 806 |
+
"804": "ken",
|
| 807 |
+
"805": "vid",
|
| 808 |
+
"806": "most",
|
| 809 |
+
"807": "ivid",
|
| 810 |
+
"808": "unch",
|
| 811 |
+
"809": "par",
|
| 812 |
+
"810": "med",
|
| 813 |
+
"811": "rop",
|
| 814 |
+
"812": "ased",
|
| 815 |
+
"813": "eone",
|
| 816 |
+
"814": "▁ve",
|
| 817 |
+
"815": "▁abs",
|
| 818 |
+
"816": "ergy",
|
| 819 |
+
"817": "ret",
|
| 820 |
+
"818": "▁saw",
|
| 821 |
+
"819": "▁ey",
|
| 822 |
+
"820": "▁cal",
|
| 823 |
+
"821": "uat",
|
| 824 |
+
"822": "▁mid",
|
| 825 |
+
"823": "vat",
|
| 826 |
+
"824": "ream",
|
| 827 |
+
"825": "vice",
|
| 828 |
+
"826": "ians",
|
| 829 |
+
"827": "rent",
|
| 830 |
+
"828": "ctor",
|
| 831 |
+
"829": "err",
|
| 832 |
+
"830": "ush",
|
| 833 |
+
"831": "ases",
|
| 834 |
+
"832": "▁suc",
|
| 835 |
+
"833": "erms",
|
| 836 |
+
"834": "ave",
|
| 837 |
+
"835": "angu",
|
| 838 |
+
"836": "ries",
|
| 839 |
+
"837": "▁wo",
|
| 840 |
+
"838": "arts",
|
| 841 |
+
"839": "▁fil",
|
| 842 |
+
"840": "▁fat",
|
| 843 |
+
"841": "▁cho",
|
| 844 |
+
"842": "orts",
|
| 845 |
+
"843": "▁fre",
|
| 846 |
+
"844": "ee",
|
| 847 |
+
"845": "ught",
|
| 848 |
+
"846": "eng",
|
| 849 |
+
"847": "ump",
|
| 850 |
+
"848": "▁bar",
|
| 851 |
+
"849": "ying",
|
| 852 |
+
"850": "ane",
|
| 853 |
+
"851": "▁tem",
|
| 854 |
+
"852": "anks",
|
| 855 |
+
"853": "ury",
|
| 856 |
+
"854": "iat",
|
| 857 |
+
"855": "mit",
|
| 858 |
+
"856": "trol",
|
| 859 |
+
"857": "▁net",
|
| 860 |
+
"858": "▁maj",
|
| 861 |
+
"859": "▁cra",
|
| 862 |
+
"860": "ling",
|
| 863 |
+
"861": "▁fig",
|
| 864 |
+
"862": "orn",
|
| 865 |
+
"863": "icat",
|
| 866 |
+
"864": "pany",
|
| 867 |
+
"865": "▁occ",
|
| 868 |
+
"866": "ott",
|
| 869 |
+
"867": "ands",
|
| 870 |
+
"868": "▁exc",
|
| 871 |
+
"869": "▁mr",
|
| 872 |
+
"870": "ency",
|
| 873 |
+
"871": "rope",
|
| 874 |
+
"872": "itch",
|
| 875 |
+
"873": "▁lit",
|
| 876 |
+
"874": "abil",
|
| 877 |
+
"875": "not",
|
| 878 |
+
"876": "ma",
|
| 879 |
+
"877": "▁typ",
|
| 880 |
+
"878": "▁opt",
|
| 881 |
+
"879": "ob",
|
| 882 |
+
"880": "ser",
|
| 883 |
+
"881": "ety",
|
| 884 |
+
"882": "ms",
|
| 885 |
+
"883": "peci",
|
| 886 |
+
"884": "aces",
|
| 887 |
+
"885": "aut",
|
| 888 |
+
"886": "▁hon",
|
| 889 |
+
"887": "cuss",
|
| 890 |
+
"888": "▁sal",
|
| 891 |
+
"889": "▁sor",
|
| 892 |
+
"890": "att",
|
| 893 |
+
"891": "▁lab",
|
| 894 |
+
"892": "▁har",
|
| 895 |
+
"893": "urch",
|
| 896 |
+
"894": "nded",
|
| 897 |
+
"895": "uce",
|
| 898 |
+
"896": "ids",
|
| 899 |
+
"897": "▁hy",
|
| 900 |
+
"898": "▁fut",
|
| 901 |
+
"899": "▁ste",
|
| 902 |
+
"900": "ours",
|
| 903 |
+
"901": "ems",
|
| 904 |
+
"902": "utes",
|
| 905 |
+
"903": "ng",
|
| 906 |
+
"904": "ta",
|
| 907 |
+
"905": "▁won",
|
| 908 |
+
"906": "▁fa",
|
| 909 |
+
"907": "▁env",
|
| 910 |
+
"908": "ards",
|
| 911 |
+
"909": "▁job",
|
| 912 |
+
"910": "ium",
|
| 913 |
+
"911": "▁dot",
|
| 914 |
+
"912": "▁obv",
|
| 915 |
+
"913": "ina",
|
| 916 |
+
"914": "side",
|
| 917 |
+
"915": "elve",
|
| 918 |
+
"916": "cu",
|
| 919 |
+
"917": "▁jes",
|
| 920 |
+
"918": "▁pot",
|
| 921 |
+
"919": "▁pie",
|
| 922 |
+
"920": "▁tre",
|
| 923 |
+
"921": "▁hey",
|
| 924 |
+
"922": "▁mag",
|
| 925 |
+
"923": "ron",
|
| 926 |
+
"924": "▁key",
|
| 927 |
+
"925": "swer",
|
| 928 |
+
"926": "▁win",
|
| 929 |
+
"927": "ucat",
|
| 930 |
+
"928": "work",
|
| 931 |
+
"929": "ides",
|
| 932 |
+
"930": "▁low",
|
| 933 |
+
"931": "▁vol",
|
| 934 |
+
"932": "▁oth",
|
| 935 |
+
"933": "atic",
|
| 936 |
+
"934": "lf",
|
| 937 |
+
"935": "ads",
|
| 938 |
+
"936": "inds",
|
| 939 |
+
"937": "com",
|
| 940 |
+
"938": "ths",
|
| 941 |
+
"939": "▁ver",
|
| 942 |
+
"940": "ised",
|
| 943 |
+
"941": "lo",
|
| 944 |
+
"942": "▁squ",
|
| 945 |
+
"943": "▁cut",
|
| 946 |
+
"944": "oked",
|
| 947 |
+
"945": "irit",
|
| 948 |
+
"946": "ateg",
|
| 949 |
+
"947": "ppy",
|
| 950 |
+
"948": "mitt",
|
| 951 |
+
"949": "come",
|
| 952 |
+
"950": "hn",
|
| 953 |
+
"951": "igin",
|
| 954 |
+
"952": "mand",
|
| 955 |
+
"953": "▁dam",
|
| 956 |
+
"954": "ho",
|
| 957 |
+
"955": "▁da",
|
| 958 |
+
"956": "▁fur",
|
| 959 |
+
"957": "iron",
|
| 960 |
+
"958": "ilar",
|
| 961 |
+
"959": "▁fac",
|
| 962 |
+
"960": "▁neg",
|
| 963 |
+
"961": "▁ago",
|
| 964 |
+
"962": "ged",
|
| 965 |
+
"963": "miss",
|
| 966 |
+
"964": "enth",
|
| 967 |
+
"965": "▁dou",
|
| 968 |
+
"966": "▁hit",
|
| 969 |
+
"967": "▁guy",
|
| 970 |
+
"968": "▁bi",
|
| 971 |
+
"969": "ove",
|
| 972 |
+
"970": "fess",
|
| 973 |
+
"971": "ples",
|
| 974 |
+
"972": "owed",
|
| 975 |
+
"973": "ured",
|
| 976 |
+
"974": "▁ris",
|
| 977 |
+
"975": "ints",
|
| 978 |
+
"976": "rew",
|
| 979 |
+
"977": "▁sum",
|
| 980 |
+
"978": "▁hu",
|
| 981 |
+
"979": "ploy",
|
| 982 |
+
"980": "ude",
|
| 983 |
+
"981": "ried",
|
| 984 |
+
"982": "▁cir",
|
| 985 |
+
"983": "▁dev",
|
| 986 |
+
"984": "ear",
|
| 987 |
+
"985": "▁tot",
|
| 988 |
+
"986": "▁ann",
|
| 989 |
+
"987": "duc",
|
| 990 |
+
"988": "ik",
|
| 991 |
+
"989": "pon",
|
| 992 |
+
"990": "sted",
|
| 993 |
+
"991": "▁ide",
|
| 994 |
+
"992": "▁'",
|
| 995 |
+
"993": "ipp",
|
| 996 |
+
"994": "▁eat",
|
| 997 |
+
"995": "▁dom",
|
| 998 |
+
"996": "▁",
|
| 999 |
+
"997": "e",
|
| 1000 |
+
"998": "t",
|
| 1001 |
+
"999": "o",
|
| 1002 |
+
"1000": "a",
|
| 1003 |
+
"1001": "i",
|
| 1004 |
+
"1002": "n",
|
| 1005 |
+
"1003": "s",
|
| 1006 |
+
"1004": "r",
|
| 1007 |
+
"1005": "h",
|
| 1008 |
+
"1006": "l",
|
| 1009 |
+
"1007": "d",
|
| 1010 |
+
"1008": "u",
|
| 1011 |
+
"1009": "c",
|
| 1012 |
+
"1010": "m",
|
| 1013 |
+
"1011": "y",
|
| 1014 |
+
"1012": "g",
|
| 1015 |
+
"1013": "w",
|
| 1016 |
+
"1014": "f",
|
| 1017 |
+
"1015": "p",
|
| 1018 |
+
"1016": "b",
|
| 1019 |
+
"1017": "v",
|
| 1020 |
+
"1018": "k",
|
| 1021 |
+
"1019": "'",
|
| 1022 |
+
"1020": "j",
|
| 1023 |
+
"1021": "x",
|
| 1024 |
+
"1022": "q",
|
| 1025 |
+
"1023": "z",
|
| 1026 |
+
"1024": "<EOU>",
|
| 1027 |
+
"1025": "<EOB>"
|
| 1028 |
+
}
|
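Note: the vocab maps RNNT output ids to SentencePiece-style pieces, where "▁" marks a word boundary and ids 1024/1025 are reserved for the <EOU>/<EOB> control tokens. A small assumed helper (not shipped in this repo) for turning emitted ids into text:

```python
import json

# Build id -> token from the vocab above.
with open("1280ms/vocab.json") as f:
    id_to_token = {int(k): v for k, v in json.load(f).items()}

EOU_ID, EOB_ID, UNK_ID = 1024, 1025, 0  # <EOU>, <EOB>, <unk> per the table

def detokenize(ids: list[int]) -> tuple[str, bool]:
    """Turn emitted token ids into text; report whether <EOU> was seen."""
    saw_eou = EOU_ID in ids
    pieces = [id_to_token[i] for i in ids if i not in (EOU_ID, EOB_ID, UNK_ID)]
    return "".join(pieces).replace("▁", " ").strip(), saw_eou

print(detokenize([5, 161, 82, 1024]))  # -> ("the world", True)
```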
160ms/.DS_Store
ADDED
|
Binary file (8.2 kB)
|
160ms/convert_parakeet_eou.py
ADDED
|
@@ -0,0 +1,740 @@
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""CLI for exporting Parakeet Realtime EOU 120M components to CoreML.
|
| 3 |
+
|
| 4 |
+
This model is a cache-aware streaming FastConformer-RNNT model optimized for
|
| 5 |
+
low-latency speech recognition with end-of-utterance detection.
|
| 6 |
+
|
| 7 |
+
Key differences from Parakeet TDT v3:
|
| 8 |
+
- Smaller model (120M vs 600M params)
|
| 9 |
+
- No duration outputs (standard RNNT, not TDT)
|
| 10 |
+
- Cache-aware streaming encoder (17 layers, attention context [70,1])
|
| 11 |
+
- Special <EOU> token for end-of-utterance detection
|
| 12 |
+
- Optimized for 80-160ms latency
|
| 13 |
+
|
| 14 |
+
Reference: https://huggingface.co/nvidia/parakeet_realtime_eou_120m-v1
|
| 15 |
+
"""
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
import json
|
| 19 |
+
from dataclasses import asdict
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
from typing import Dict, Optional, Tuple
|
| 22 |
+
|
| 23 |
+
import coremltools as ct
|
| 24 |
+
import numpy as np
|
| 25 |
+
import soundfile as sf
|
| 26 |
+
import torch
|
| 27 |
+
import typer
|
| 28 |
+
|
| 29 |
+
import nemo.collections.asr as nemo_asr
|
| 30 |
+
|
| 31 |
+
from individual_components import (
|
| 32 |
+
DecoderWrapper,
|
| 33 |
+
EncoderWrapper,
|
| 34 |
+
ExportSettings,
|
| 35 |
+
JointWrapper,
|
| 36 |
+
JointDecisionWrapper,
|
| 37 |
+
JointDecisionSingleStep,
|
| 38 |
+
PreprocessorWrapper,
|
| 39 |
+
MelEncoderWrapper,
|
| 40 |
+
_coreml_convert,
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
def apply_stft_patch():
|
| 44 |
+
# Monkey patch coremltools.stft to handle extra arguments from newer torch versions
|
| 45 |
+
try:
|
| 46 |
+
import coremltools.converters.mil.frontend.torch.ops as torch_ops
|
| 47 |
+
_original_stft = torch_ops.stft
|
| 48 |
+
|
| 49 |
+
def patched_stft(context, node):
|
| 50 |
+
if len(node.inputs) > 8:
|
| 51 |
+
node.inputs = node.inputs[:8]
|
| 52 |
+
return _original_stft(context, node)
|
| 53 |
+
|
| 54 |
+
torch_ops.stft = patched_stft
|
| 55 |
+
if "stft" in torch_ops._TORCH_OPS_REGISTRY:
|
| 56 |
+
torch_ops._TORCH_OPS_REGISTRY["stft"] = patched_stft
|
| 57 |
+
print("Monkey patched coremltools.stft for compatibility.")
|
| 58 |
+
except Exception as e:
|
| 59 |
+
print(f"Warning: Could not monkey patch stft: {e}")
|
| 60 |
+
|
| 61 |
+
DEFAULT_MODEL_ID = "nvidia/parakeet_realtime_eou_120m-v1"
|
| 62 |
+
AUTHOR = "Fluid Inference"
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _compute_length(seconds: float, sample_rate: int) -> int:
|
| 66 |
+
return int(round(seconds * sample_rate))
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _prepare_audio(
|
| 70 |
+
validation_audio: Optional[Path],
|
| 71 |
+
sample_rate: int,
|
| 72 |
+
max_samples: int,
|
| 73 |
+
seed: Optional[int],
|
| 74 |
+
) -> torch.Tensor:
|
| 75 |
+
if validation_audio is None:
|
| 76 |
+
if seed is not None:
|
| 77 |
+
torch.manual_seed(seed)
|
| 78 |
+
audio = torch.randn(1, max_samples, dtype=torch.float32)
|
| 79 |
+
return audio
|
| 80 |
+
|
| 81 |
+
data, sr = sf.read(str(validation_audio), dtype="float32")
|
| 82 |
+
if sr != sample_rate:
|
| 83 |
+
raise typer.BadParameter(
|
| 84 |
+
f"Validation audio sample rate {sr} does not match model rate {sample_rate}"
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
if data.ndim > 1:
|
| 88 |
+
data = data[:, 0]
|
| 89 |
+
|
| 90 |
+
if data.size == 0:
|
| 91 |
+
raise typer.BadParameter("Validation audio is empty")
|
| 92 |
+
|
| 93 |
+
if data.size < max_samples:
|
| 94 |
+
pad_width = max_samples - data.size
|
| 95 |
+
data = np.pad(data, (0, pad_width))
|
| 96 |
+
elif data.size > max_samples:
|
| 97 |
+
data = data[:max_samples]
|
| 98 |
+
|
| 99 |
+
audio = torch.from_numpy(data).unsqueeze(0).to(dtype=torch.float32)
|
| 100 |
+
return audio
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def _save_mlpackage(model: ct.models.MLModel, path: Path, description: str) -> None:
|
| 104 |
+
try:
|
| 105 |
+
model.minimum_deployment_target = ct.target.iOS17
|
| 106 |
+
except Exception:
|
| 107 |
+
pass
|
| 108 |
+
model.short_description = description
|
| 109 |
+
model.author = AUTHOR
|
| 110 |
+
path.parent.mkdir(parents=True, exist_ok=True)
|
| 111 |
+
model.save(str(path))
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def _tensor_shape(tensor: torch.Tensor) -> Tuple[int, ...]:
|
| 115 |
+
return tuple(int(dim) for dim in tensor.shape)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def _parse_compute_units(name: str) -> ct.ComputeUnit:
|
| 119 |
+
"""Parse a human-friendly compute units string into ct.ComputeUnit."""
|
| 120 |
+
normalized = str(name).strip().upper()
|
| 121 |
+
mapping = {
|
| 122 |
+
"ALL": ct.ComputeUnit.ALL,
|
| 123 |
+
"CPU_ONLY": ct.ComputeUnit.CPU_ONLY,
|
| 124 |
+
"CPU_AND_GPU": ct.ComputeUnit.CPU_AND_GPU,
|
| 125 |
+
"CPU_AND_NE": ct.ComputeUnit.CPU_AND_NE,
|
| 126 |
+
"CPU_AND_NEURALENGINE": ct.ComputeUnit.CPU_AND_NE,
|
| 127 |
+
}
|
| 128 |
+
if normalized not in mapping:
|
| 129 |
+
raise typer.BadParameter(
|
| 130 |
+
f"Unknown compute units '{name}'. Choose from: " + ", ".join(mapping.keys())
|
| 131 |
+
)
|
| 132 |
+
return mapping[normalized]
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def _parse_compute_precision(name: Optional[str]) -> Optional[ct.precision]:
|
| 136 |
+
"""Parse compute precision string into ct.precision or None."""
|
| 137 |
+
if name is None:
|
| 138 |
+
return None
|
| 139 |
+
normalized = str(name).strip().upper()
|
| 140 |
+
if normalized == "":
|
| 141 |
+
return None
|
| 142 |
+
mapping = {
|
| 143 |
+
"FLOAT32": ct.precision.FLOAT32,
|
| 144 |
+
"FLOAT16": ct.precision.FLOAT16,
|
| 145 |
+
}
|
| 146 |
+
if normalized not in mapping:
|
| 147 |
+
raise typer.BadParameter(
|
| 148 |
+
f"Unknown compute precision '{name}'. Choose from: "
|
| 149 |
+
+ ", ".join(mapping.keys())
|
| 150 |
+
)
|
| 151 |
+
return mapping[normalized]
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
@app.command()
|
| 158 |
+
def convert(
|
| 159 |
+
nemo_path: Optional[Path] = typer.Option(
|
| 160 |
+
None,
|
| 161 |
+
"--nemo-path",
|
| 162 |
+
exists=True,
|
| 163 |
+
resolve_path=True,
|
| 164 |
+
help="Path to parakeet_realtime_eou_120m-v1.nemo checkpoint (skip to auto-download)",
|
| 165 |
+
),
|
| 166 |
+
model_id: str = typer.Option(
|
| 167 |
+
DEFAULT_MODEL_ID,
|
| 168 |
+
"--model-id",
|
| 169 |
+
help="Model identifier to download when --nemo-path is omitted",
|
| 170 |
+
),
|
| 171 |
+
output_dir: Path = typer.Option(
|
| 172 |
+
Path("parakeet_eou_coreml"),
|
| 173 |
+
help="Directory where mlpackages and metadata will be written",
|
| 174 |
+
),
|
| 175 |
+
preprocessor_cu: str = typer.Option(
|
| 176 |
+
"CPU_ONLY",
|
| 177 |
+
"--preprocessor-cu",
|
| 178 |
+
help="Compute units for preprocessor (default CPU_ONLY)",
|
| 179 |
+
),
|
| 180 |
+
mel_encoder_cu: str = typer.Option(
|
| 181 |
+
"CPU_ONLY",
|
| 182 |
+
"--mel-encoder-cu",
|
| 183 |
+
help="Compute units for fused mel+encoder (default CPU_ONLY)",
|
| 184 |
+
),
|
| 185 |
+
compute_precision: Optional[str] = typer.Option(
|
| 186 |
+
None,
|
| 187 |
+
"--compute-precision",
|
| 188 |
+
help="Export precision: FLOAT32 (default) or FLOAT16 to shrink non-quantized weights.",
|
| 189 |
+
),
|
| 190 |
+
max_audio_seconds: float = typer.Option(
|
| 191 |
+
15.0,
|
| 192 |
+
"--max-audio-seconds",
|
| 193 |
+
help="Maximum audio duration in seconds for the fixed window export",
|
| 194 |
+
),
|
| 195 |
+
validation_audio: Optional[Path] = typer.Option(
|
| 196 |
+
None,
|
| 197 |
+
"--validation-audio",
|
| 198 |
+
exists=True,
|
| 199 |
+
resolve_path=True,
|
| 200 |
+
help="Path to a 16kHz WAV file for tracing (uses random if not provided)",
|
| 201 |
+
),
|
| 202 |
+
) -> None:
|
| 203 |
+
"""Export all Parakeet Realtime EOU sub-modules to CoreML.
|
| 204 |
+
|
| 205 |
+
This exports the cache-aware streaming FastConformer-RNNT model for
|
| 206 |
+
low-latency speech recognition with end-of-utterance detection.
|
| 207 |
+
"""
|
| 208 |
+
export_settings = ExportSettings(
|
| 209 |
+
output_dir=output_dir,
|
| 210 |
+
compute_units=ct.ComputeUnit.CPU_ONLY,
|
| 211 |
+
deployment_target=ct.target.iOS17,
|
| 212 |
+
compute_precision=_parse_compute_precision(compute_precision),
|
| 213 |
+
max_audio_seconds=max_audio_seconds,
|
| 214 |
+
max_symbol_steps=1,
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
typer.echo("Export configuration:")
|
| 218 |
+
typer.echo(asdict(export_settings))
|
| 219 |
+
|
| 220 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 221 |
+
pre_cu = _parse_compute_units(preprocessor_cu)
|
| 222 |
+
melenc_cu = _parse_compute_units(mel_encoder_cu)
|
| 223 |
+
|
| 224 |
+
if nemo_path is not None:
|
| 225 |
+
typer.echo(f"Loading NeMo model from {nemo_path}…")
|
| 226 |
+
# Try loading as generic ASRModel first, then specific class
|
| 227 |
+
try:
|
| 228 |
+
asr_model = nemo_asr.models.ASRModel.restore_from(
|
| 229 |
+
str(nemo_path), map_location="cpu"
|
| 230 |
+
)
|
| 231 |
+
except Exception:
|
| 232 |
+
# Fallback to EncDecRNNTBPEModel
|
| 233 |
+
asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
|
| 234 |
+
str(nemo_path), map_location="cpu"
|
| 235 |
+
)
|
| 236 |
+
checkpoint_meta = {
|
| 237 |
+
"type": "file",
|
| 238 |
+
"path": str(nemo_path),
|
| 239 |
+
}
|
| 240 |
+
else:
|
| 241 |
+
typer.echo(f"Downloading NeMo model via {model_id}…")
|
| 242 |
+
# Use ASRModel.from_pretrained as recommended for this model
|
| 243 |
+
try:
|
| 244 |
+
asr_model = nemo_asr.models.ASRModel.from_pretrained(
|
| 245 |
+
model_id, map_location="cpu"
|
| 246 |
+
)
|
| 247 |
+
except Exception:
|
| 248 |
+
# Fallback to EncDecRNNTBPEModel
|
| 249 |
+
asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
|
| 250 |
+
model_id, map_location="cpu"
|
| 251 |
+
)
|
| 252 |
+
checkpoint_meta = {
|
| 253 |
+
"type": "pretrained",
|
| 254 |
+
"model_id": model_id,
|
| 255 |
+
}
|
| 256 |
+
asr_model.eval()
|
| 257 |
+
|
| 258 |
+
# Print model info
|
| 259 |
+
typer.echo(f"Model class: {type(asr_model).__name__}")
|
| 260 |
+
typer.echo(f"Encoder class: {type(asr_model.encoder).__name__}")
|
| 261 |
+
|
| 262 |
+
sample_rate = int(asr_model.cfg.preprocessor.sample_rate)
|
| 263 |
+
max_samples = _compute_length(export_settings.max_audio_seconds, sample_rate)
|
| 264 |
+
|
| 265 |
+
# Prepare audio for tracing
|
| 266 |
+
if validation_audio is not None:
|
| 267 |
+
typer.echo(f"Using validation audio: {validation_audio}")
|
| 268 |
+
audio_tensor = _prepare_audio(validation_audio, sample_rate, max_samples, seed=None)
|
| 269 |
+
else:
|
| 270 |
+
typer.echo("Using random audio for tracing (seed=42)")
|
| 271 |
+
audio_tensor = _prepare_audio(None, sample_rate, max_samples, seed=42)
|
| 272 |
+
|
| 273 |
+
audio_length = torch.tensor([max_samples], dtype=torch.int32)
|
| 274 |
+
|
| 275 |
+
preprocessor = PreprocessorWrapper(asr_model.preprocessor.eval())
|
| 276 |
+
encoder = EncoderWrapper(asr_model.encoder.eval())
|
| 277 |
+
decoder = DecoderWrapper(asr_model.decoder.eval())
|
| 278 |
+
joint = JointWrapper(asr_model.joint.eval())
|
| 279 |
+
|
| 280 |
+
decoder_export_flag = getattr(asr_model.decoder, "_rnnt_export", False)
|
| 281 |
+
asr_model.decoder._rnnt_export = True
|
| 282 |
+
|
| 283 |
+
try:
|
| 284 |
+
with torch.no_grad():
|
| 285 |
+
mel_ref, mel_length_ref = preprocessor(audio_tensor, audio_length)
|
| 286 |
+
mel_length_ref = mel_length_ref.to(dtype=torch.int32)
|
| 287 |
+
encoder_ref, encoder_length_ref, frame_times_ref = encoder(
|
| 288 |
+
mel_ref, mel_length_ref
|
| 289 |
+
)
|
| 290 |
+
encoder_length_ref = encoder_length_ref.to(dtype=torch.int32)
|
| 291 |
+
|
| 292 |
+
# Clone tensors to drop inference flags
|
| 293 |
+
mel_ref = mel_ref.clone().detach()
|
| 294 |
+
mel_length_ref = mel_length_ref.clone().detach()
|
| 295 |
+
encoder_ref = encoder_ref.clone().detach()
|
| 296 |
+
encoder_length_ref = encoder_length_ref.clone().detach()
|
| 297 |
+
frame_times_ref = frame_times_ref.clone().detach()
|
| 298 |
+
|
| 299 |
+
vocab_size = int(asr_model.tokenizer.vocab_size)
|
| 300 |
+
decoder_hidden = int(asr_model.decoder.pred_hidden)
|
| 301 |
+
decoder_layers = int(asr_model.decoder.pred_rnn_layers)
|
| 302 |
+
|
| 303 |
+
# Check if model has extra outputs (TDT-style duration)
|
| 304 |
+
num_extra = getattr(asr_model.joint, "num_extra_outputs", 0)
|
| 305 |
+
typer.echo(f"Vocab size: {vocab_size}, num_extra_outputs: {num_extra}")
|
| 306 |
+
|
| 307 |
+
targets = torch.full(
|
| 308 |
+
(1, export_settings.max_symbol_steps),
|
| 309 |
+
fill_value=asr_model.decoder.blank_idx,
|
| 310 |
+
dtype=torch.int32,
|
| 311 |
+
)
|
| 312 |
+
target_lengths = torch.tensor(
|
| 313 |
+
[export_settings.max_symbol_steps], dtype=torch.int32
|
| 314 |
+
)
|
| 315 |
+
zero_state = torch.zeros(
|
| 316 |
+
decoder_layers,
|
| 317 |
+
1,
|
| 318 |
+
decoder_hidden,
|
| 319 |
+
dtype=torch.float32,
|
| 320 |
+
)
|
| 321 |
+
|
| 322 |
+
with torch.no_grad():
|
| 323 |
+
decoder_ref, h_ref, c_ref = decoder(
|
| 324 |
+
targets, target_lengths, zero_state, zero_state
|
| 325 |
+
)
|
| 326 |
+
joint_ref = joint(encoder_ref, decoder_ref)
|
| 327 |
+
|
| 328 |
+
decoder_ref = decoder_ref.clone()
|
| 329 |
+
h_ref = h_ref.clone()
|
| 330 |
+
c_ref = c_ref.clone()
|
| 331 |
+
joint_ref = joint_ref.clone()
|
| 332 |
+
|
| 333 |
+
typer.echo(f"Encoder output shape: {encoder_ref.shape}")
|
| 334 |
+
typer.echo(f"Decoder output shape: {decoder_ref.shape}")
|
| 335 |
+
typer.echo(f"Joint output shape: {joint_ref.shape}")
|
| 336 |
+
|
| 337 |
+
# === Export Preprocessor ===
|
| 338 |
+
typer.echo("Tracing and converting preprocessor…")
|
| 339 |
+
preprocessor = preprocessor.cpu()
|
| 340 |
+
audio_tensor = audio_tensor.cpu()
|
| 341 |
+
audio_length = audio_length.cpu()
|
| 342 |
+
traced_preprocessor = torch.jit.trace(
|
| 343 |
+
preprocessor, (audio_tensor, audio_length), strict=False
|
| 344 |
+
)
|
| 345 |
+
traced_preprocessor.eval()
|
| 346 |
+
preprocessor_inputs = [
|
| 347 |
+
ct.TensorType(
|
| 348 |
+
name="audio_signal",
|
| 349 |
+
shape=(1, ct.RangeDim(1, max_samples)),
|
| 350 |
+
dtype=np.float32,
|
| 351 |
+
),
|
| 352 |
+
ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
|
| 353 |
+
]
|
| 354 |
+
preprocessor_outputs = [
|
| 355 |
+
ct.TensorType(name="mel", dtype=np.float32),
|
| 356 |
+
ct.TensorType(name="mel_length", dtype=np.int32),
|
| 357 |
+
]
|
| 358 |
+
preprocessor_model = _coreml_convert(
|
| 359 |
+
traced_preprocessor,
|
| 360 |
+
preprocessor_inputs,
|
| 361 |
+
preprocessor_outputs,
|
| 362 |
+
export_settings,
|
| 363 |
+
compute_units_override=pre_cu,
|
| 364 |
+
)
|
| 365 |
+
preprocessor_path = output_dir / "parakeet_eou_preprocessor.mlpackage"
|
| 366 |
+
_save_mlpackage(
|
| 367 |
+
preprocessor_model,
|
| 368 |
+
preprocessor_path,
|
| 369 |
+
f"Parakeet EOU preprocessor ({max_audio_seconds}s window)",
|
| 370 |
+
)
|
| 371 |
+
|
| 372 |
+
# === Export Encoder ===
|
| 373 |
+
typer.echo("Tracing and converting encoder…")
|
| 374 |
+
traced_encoder = torch.jit.trace(
|
| 375 |
+
encoder, (mel_ref, mel_length_ref), strict=False
|
| 376 |
+
)
|
| 377 |
+
traced_encoder.eval()
|
| 378 |
+
encoder_inputs = [
|
| 379 |
+
ct.TensorType(
|
| 380 |
+
name="mel", shape=_tensor_shape(mel_ref), dtype=np.float32
|
| 381 |
+
),
|
| 382 |
+
ct.TensorType(name="mel_length", shape=(1,), dtype=np.int32),
|
| 383 |
+
]
|
| 384 |
+
encoder_outputs = [
|
| 385 |
+
ct.TensorType(name="encoder", dtype=np.float32),
|
| 386 |
+
ct.TensorType(name="encoder_length", dtype=np.int32),
|
| 387 |
+
ct.TensorType(name="frame_times", dtype=np.float32),
|
| 388 |
+
]
|
| 389 |
+
encoder_model = _coreml_convert(
|
| 390 |
+
traced_encoder,
|
| 391 |
+
encoder_inputs,
|
| 392 |
+
encoder_outputs,
|
| 393 |
+
export_settings,
|
| 394 |
+
compute_units_override=ct.ComputeUnit.CPU_ONLY,
|
| 395 |
+
)
|
| 396 |
+
encoder_path = output_dir / "parakeet_eou_encoder.mlpackage"
|
| 397 |
+
_save_mlpackage(
|
| 398 |
+
encoder_model,
|
| 399 |
+
encoder_path,
|
| 400 |
+
f"Parakeet EOU encoder ({max_audio_seconds}s window)",
|
| 401 |
+
)
|
| 402 |
+
|
| 403 |
+
# === Export Fused Mel+Encoder ===
|
| 404 |
+
typer.echo("Tracing and converting fused mel+encoder…")
|
| 405 |
+
mel_encoder = MelEncoderWrapper(preprocessor, encoder)
|
| 406 |
+
traced_mel_encoder = torch.jit.trace(
|
| 407 |
+
mel_encoder, (audio_tensor, audio_length), strict=False
|
| 408 |
+
)
|
| 409 |
+
traced_mel_encoder.eval()
|
| 410 |
+
mel_encoder_inputs = [
|
| 411 |
+
ct.TensorType(
|
| 412 |
+
name="audio_signal", shape=(1, max_samples), dtype=np.float32
|
| 413 |
+
),
|
| 414 |
+
ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
|
| 415 |
+
]
|
| 416 |
+
mel_encoder_outputs = [
|
| 417 |
+
ct.TensorType(name="encoder", dtype=np.float32),
|
| 418 |
+
ct.TensorType(name="encoder_length", dtype=np.int32),
|
| 419 |
+
ct.TensorType(name="frame_times", dtype=np.float32),
|
| 420 |
+
]
|
| 421 |
+
mel_encoder_model = _coreml_convert(
|
| 422 |
+
traced_mel_encoder,
|
| 423 |
+
mel_encoder_inputs,
|
| 424 |
+
mel_encoder_outputs,
|
| 425 |
+
export_settings,
|
| 426 |
+
compute_units_override=melenc_cu,
|
| 427 |
+
)
|
| 428 |
+
mel_encoder_path = output_dir / "parakeet_eou_mel_encoder.mlpackage"
|
| 429 |
+
_save_mlpackage(
|
| 430 |
+
mel_encoder_model,
|
| 431 |
+
mel_encoder_path,
|
| 432 |
+
f"Parakeet EOU fused Mel+Encoder ({max_audio_seconds}s window)",
|
| 433 |
+
)
|
| 434 |
+
|
| 435 |
+
# === Export Decoder ===
|
| 436 |
+
typer.echo("Tracing and converting decoder…")
|
| 437 |
+
traced_decoder = torch.jit.trace(
|
| 438 |
+
decoder,
|
| 439 |
+
(targets, target_lengths, zero_state, zero_state),
|
| 440 |
+
strict=False,
|
| 441 |
+
)
|
| 442 |
+
traced_decoder.eval()
|
| 443 |
+
decoder_inputs = [
|
| 444 |
+
ct.TensorType(
|
| 445 |
+
name="targets", shape=_tensor_shape(targets), dtype=np.int32
|
| 446 |
+
),
|
| 447 |
+
ct.TensorType(name="target_length", shape=(1,), dtype=np.int32),
|
| 448 |
+
ct.TensorType(
|
| 449 |
+
name="h_in", shape=_tensor_shape(zero_state), dtype=np.float32
|
| 450 |
+
),
|
| 451 |
+
ct.TensorType(
|
| 452 |
+
name="c_in", shape=_tensor_shape(zero_state), dtype=np.float32
|
| 453 |
+
),
|
| 454 |
+
]
|
| 455 |
+
decoder_outputs = [
|
| 456 |
+
ct.TensorType(name="decoder", dtype=np.float32),
|
| 457 |
+
ct.TensorType(name="h_out", dtype=np.float32),
|
| 458 |
+
ct.TensorType(name="c_out", dtype=np.float32),
|
| 459 |
+
]
|
| 460 |
+
decoder_model = _coreml_convert(
|
| 461 |
+
traced_decoder,
|
| 462 |
+
decoder_inputs,
|
| 463 |
+
decoder_outputs,
|
| 464 |
+
export_settings,
|
| 465 |
+
compute_units_override=ct.ComputeUnit.CPU_ONLY,
|
| 466 |
+
)
|
| 467 |
+
decoder_path = output_dir / "parakeet_eou_decoder.mlpackage"
|
| 468 |
+
_save_mlpackage(
|
| 469 |
+
decoder_model,
|
| 470 |
+
decoder_path,
|
| 471 |
+
"Parakeet EOU decoder (RNNT prediction network)",
|
| 472 |
+
)
|
| 473 |
+
|
| 474 |
+
# === Export Joint ===
|
| 475 |
+
typer.echo("Tracing and converting joint…")
|
| 476 |
+
traced_joint = torch.jit.trace(
|
| 477 |
+
joint,
|
| 478 |
+
(encoder_ref, decoder_ref),
|
| 479 |
+
strict=False,
|
| 480 |
+
)
|
| 481 |
+
traced_joint.eval()
|
| 482 |
+
joint_inputs = [
|
| 483 |
+
ct.TensorType(
|
| 484 |
+
name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32
|
| 485 |
+
),
|
| 486 |
+
ct.TensorType(
|
| 487 |
+
name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32
|
| 488 |
+
),
|
| 489 |
+
]
|
| 490 |
+
joint_outputs = [
|
| 491 |
+
ct.TensorType(name="logits", dtype=np.float32),
|
| 492 |
+
]
|
| 493 |
+
joint_model = _coreml_convert(
|
| 494 |
+
traced_joint,
|
| 495 |
+
joint_inputs,
|
| 496 |
+
joint_outputs,
|
| 497 |
+
export_settings,
|
| 498 |
+
compute_units_override=ct.ComputeUnit.CPU_ONLY,
|
| 499 |
+
)
|
| 500 |
+
joint_path = output_dir / "parakeet_eou_joint.mlpackage"
|
| 501 |
+
_save_mlpackage(
|
| 502 |
+
joint_model,
|
| 503 |
+
joint_path,
|
| 504 |
+
"Parakeet EOU joint network (RNNT)",
|
| 505 |
+
)
|
| 506 |
+
|
| 507 |
+
# === Export Joint Decision Head ===
|
| 508 |
+
typer.echo("Tracing and converting joint decision head…")
|
| 509 |
+
joint_decision = JointDecisionWrapper(joint, vocab_size=vocab_size)
|
| 510 |
+
traced_joint_decision = torch.jit.trace(
|
| 511 |
+
joint_decision,
|
| 512 |
+
(encoder_ref, decoder_ref),
|
| 513 |
+
strict=False,
|
| 514 |
+
)
|
| 515 |
+
traced_joint_decision.eval()
|
| 516 |
+
joint_decision_inputs = [
|
| 517 |
+
ct.TensorType(
|
| 518 |
+
name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32
|
| 519 |
+
),
|
| 520 |
+
ct.TensorType(
|
| 521 |
+
name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32
|
| 522 |
+
),
|
| 523 |
+
]
|
| 524 |
+
joint_decision_outputs = [
|
| 525 |
+
ct.TensorType(name="token_id", dtype=np.int32),
|
| 526 |
+
ct.TensorType(name="token_prob", dtype=np.float32),
|
| 527 |
+
]
|
| 528 |
+
joint_decision_model = _coreml_convert(
|
| 529 |
+
traced_joint_decision,
|
| 530 |
+
joint_decision_inputs,
|
| 531 |
+
joint_decision_outputs,
|
| 532 |
+
export_settings,
|
| 533 |
+
compute_units_override=ct.ComputeUnit.CPU_ONLY,
|
| 534 |
+
)
|
| 535 |
+
joint_decision_path = output_dir / "parakeet_eou_joint_decision.mlpackage"
|
| 536 |
+
_save_mlpackage(
|
| 537 |
+
joint_decision_model,
|
| 538 |
+
joint_decision_path,
|
| 539 |
+
"Parakeet EOU joint + decision head (softmax, argmax)",
|
| 540 |
+
)
|
| 541 |
+
|
| 542 |
+
# === Export Single-Step Joint Decision ===
|
| 543 |
+
typer.echo("Tracing and converting single-step joint decision…")
|
| 544 |
+
jd_single = JointDecisionSingleStep(joint, vocab_size=vocab_size)
|
| 545 |
+
# Create single-step slices from refs
|
| 546 |
+
enc_step = encoder_ref[:, :, :1].contiguous()
|
| 547 |
+
dec_step = decoder_ref[:, :, :1].contiguous()
|
| 548 |
+
traced_jd_single = torch.jit.trace(
|
| 549 |
+
jd_single,
|
| 550 |
+
(enc_step, dec_step),
|
| 551 |
+
strict=False,
|
| 552 |
+
)
|
| 553 |
+
traced_jd_single.eval()
|
| 554 |
+
jd_single_inputs = [
|
| 555 |
+
ct.TensorType(
|
| 556 |
+
name="encoder_step",
|
| 557 |
+
shape=(1, enc_step.shape[1], 1),
|
| 558 |
+
dtype=np.float32,
|
| 559 |
+
),
|
| 560 |
+
ct.TensorType(
|
| 561 |
+
name="decoder_step",
|
| 562 |
+
shape=(1, dec_step.shape[1], 1),
|
| 563 |
+
dtype=np.float32,
|
| 564 |
+
),
|
| 565 |
+
]
|
| 566 |
+
jd_single_outputs = [
|
| 567 |
+
ct.TensorType(name="token_id", dtype=np.int32),
|
| 568 |
+
ct.TensorType(name="token_prob", dtype=np.float32),
|
| 569 |
+
ct.TensorType(name="top_k_ids", dtype=np.int32),
|
| 570 |
+
ct.TensorType(name="top_k_logits", dtype=np.float32),
|
| 571 |
+
]
|
| 572 |
+
jd_single_model = _coreml_convert(
|
| 573 |
+
traced_jd_single,
|
| 574 |
+
jd_single_inputs,
|
| 575 |
+
jd_single_outputs,
|
| 576 |
+
export_settings,
|
| 577 |
+
compute_units_override=ct.ComputeUnit.CPU_ONLY,
|
| 578 |
+
)
|
| 579 |
+
jd_single_path = output_dir / "parakeet_eou_joint_decision_single_step.mlpackage"
|
| 580 |
+
_save_mlpackage(
|
| 581 |
+
jd_single_model,
|
| 582 |
+
jd_single_path,
|
| 583 |
+
"Parakeet EOU single-step joint decision (current frame)",
|
| 584 |
+
)
|
| 585 |
+
|
| 586 |
+
# === Save Metadata ===
|
| 587 |
+
metadata: Dict[str, object] = {
|
| 588 |
+
"model_id": model_id,
|
| 589 |
+
"model_name": "parakeet_realtime_eou_120m-v1",
|
| 590 |
+
"model_class": type(asr_model).__name__,
|
| 591 |
+
"encoder_class": type(asr_model.encoder).__name__,
|
| 592 |
+
"sample_rate": sample_rate,
|
| 593 |
+
"max_audio_seconds": export_settings.max_audio_seconds,
|
| 594 |
+
"max_audio_samples": max_samples,
|
| 595 |
+
"max_symbol_steps": export_settings.max_symbol_steps,
|
| 596 |
+
"vocab_size": vocab_size,
|
| 597 |
+
"vocab_with_blank": vocab_size + 1,
|
| 598 |
+
"decoder_hidden": decoder_hidden,
|
| 599 |
+
"decoder_layers": decoder_layers,
|
| 600 |
+
"num_extra_outputs": num_extra,
|
| 601 |
+
"has_eou_token": True,
|
| 602 |
+
"checkpoint": checkpoint_meta,
|
| 603 |
+
"coreml": {
|
| 604 |
+
"compute_units": export_settings.compute_units.name,
|
| 605 |
+
"compute_precision": (
|
| 606 |
+
export_settings.compute_precision.name
|
| 607 |
+
if export_settings.compute_precision is not None
|
| 608 |
+
else "FLOAT32"
|
| 609 |
+
),
|
| 610 |
+
},
|
| 611 |
+
"components": {
|
| 612 |
+
"preprocessor": {
|
| 613 |
+
"inputs": {
|
| 614 |
+
"audio_signal": [1, max_samples],
|
| 615 |
+
"audio_length": [1],
|
| 616 |
+
},
|
| 617 |
+
"outputs": {
|
| 618 |
+
"mel": list(_tensor_shape(mel_ref)),
|
| 619 |
+
"mel_length": [1],
|
| 620 |
+
},
|
| 621 |
+
"path": preprocessor_path.name,
|
| 622 |
+
},
|
| 623 |
+
"encoder": {
|
| 624 |
+
"inputs": {
|
| 625 |
+
"mel": list(_tensor_shape(mel_ref)),
|
| 626 |
+
"mel_length": [1],
|
| 627 |
+
},
|
| 628 |
+
"outputs": {
|
| 629 |
+
"encoder": list(_tensor_shape(encoder_ref)),
|
| 630 |
+
"encoder_length": [1],
|
| 631 |
+
"frame_times": [1, _tensor_shape(encoder_ref)[2]],
|
| 632 |
+
},
|
| 633 |
+
"path": encoder_path.name,
|
| 634 |
+
},
|
| 635 |
+
"mel_encoder": {
|
| 636 |
+
"inputs": {
|
| 637 |
+
"audio_signal": [1, max_samples],
|
| 638 |
+
"audio_length": [1],
|
| 639 |
+
},
|
| 640 |
+
"outputs": {
|
| 641 |
+
"encoder": list(_tensor_shape(encoder_ref)),
|
| 642 |
+
"encoder_length": [1],
|
| 643 |
+
"frame_times": [1, _tensor_shape(encoder_ref)[2]],
|
| 644 |
+
},
|
| 645 |
+
"path": mel_encoder_path.name,
|
| 646 |
+
},
|
| 647 |
+
"decoder": {
|
| 648 |
+
"inputs": {
|
| 649 |
+
"targets": list(_tensor_shape(targets)),
|
| 650 |
+
"target_length": [1],
|
| 651 |
+
"h_in": list(_tensor_shape(zero_state)),
|
| 652 |
+
"c_in": list(_tensor_shape(zero_state)),
|
| 653 |
+
},
|
| 654 |
+
"outputs": {
|
| 655 |
+
"decoder": list(_tensor_shape(decoder_ref)),
|
| 656 |
+
"h_out": list(_tensor_shape(h_ref)),
|
| 657 |
+
"c_out": list(_tensor_shape(c_ref)),
|
| 658 |
+
},
|
| 659 |
+
"path": decoder_path.name,
|
| 660 |
+
},
|
| 661 |
+
"joint": {
|
| 662 |
+
"inputs": {
|
| 663 |
+
"encoder": list(_tensor_shape(encoder_ref)),
|
| 664 |
+
"decoder": list(_tensor_shape(decoder_ref)),
|
| 665 |
+
},
|
| 666 |
+
"outputs": {
|
| 667 |
+
"logits": list(_tensor_shape(joint_ref)),
|
| 668 |
+
},
|
| 669 |
+
"path": joint_path.name,
|
| 670 |
+
},
|
| 671 |
+
"joint_decision": {
|
| 672 |
+
"inputs": {
|
| 673 |
+
"encoder": list(_tensor_shape(encoder_ref)),
|
| 674 |
+
"decoder": list(_tensor_shape(decoder_ref)),
|
| 675 |
+
},
|
| 676 |
+
"outputs": {
|
| 677 |
+
"token_id": [
|
| 678 |
+
_tensor_shape(encoder_ref)[0],
|
| 679 |
+
_tensor_shape(encoder_ref)[2],
|
| 680 |
+
_tensor_shape(decoder_ref)[2],
|
| 681 |
+
],
|
| 682 |
+
"token_prob": [
|
| 683 |
+
_tensor_shape(encoder_ref)[0],
|
| 684 |
+
_tensor_shape(encoder_ref)[2],
|
| 685 |
+
_tensor_shape(decoder_ref)[2],
|
| 686 |
+
],
|
| 687 |
+
},
|
| 688 |
+
"path": joint_decision_path.name,
|
| 689 |
+
},
|
| 690 |
+
"joint_decision_single_step": {
|
| 691 |
+
"inputs": {
|
| 692 |
+
"encoder_step": [1, _tensor_shape(encoder_ref)[1], 1],
|
| 693 |
+
"decoder_step": [1, _tensor_shape(decoder_ref)[1], 1],
|
| 694 |
+
},
|
| 695 |
+
"outputs": {
|
| 696 |
+
"token_id": [1, 1, 1],
|
| 697 |
+
"token_prob": [1, 1, 1],
|
| 698 |
+
"top_k_ids": [1, 1, 1, 64],
|
| 699 |
+
"top_k_logits": [1, 1, 1, 64],
|
| 700 |
+
},
|
| 701 |
+
"path": jd_single_path.name,
|
| 702 |
+
},
|
| 703 |
+
},
|
| 704 |
+
}
|
| 705 |
+
|
| 706 |
+
# Export tokenizer vocab if available
|
| 707 |
+
try:
|
| 708 |
+
tokenizer = asr_model.tokenizer
|
| 709 |
+
vocab = {
|
| 710 |
+
"blank_id": int(asr_model.decoder.blank_idx),
|
| 711 |
+
"vocab_size": vocab_size,
|
| 712 |
+
}
|
| 713 |
+
# Try to get special tokens
|
| 714 |
+
if hasattr(tokenizer, "tokenizer"):
|
| 715 |
+
inner_tokenizer = tokenizer.tokenizer
|
| 716 |
+
if hasattr(inner_tokenizer, "get_vocab"):
|
| 717 |
+
full_vocab = inner_tokenizer.get_vocab()
|
| 718 |
+
# Find EOU token
|
| 719 |
+
eou_token = None
|
| 720 |
+
for token, idx in full_vocab.items():
|
| 721 |
+
if "<EOU>" in token.upper() or "eou" in token.lower():
|
| 722 |
+
eou_token = {"token": token, "id": idx}
|
| 723 |
+
break
|
| 724 |
+
if eou_token:
|
| 725 |
+
vocab["eou_token"] = eou_token
|
| 726 |
+
metadata["tokenizer"] = vocab
|
| 727 |
+
except Exception as e:
|
| 728 |
+
typer.echo(f"Warning: Could not export tokenizer info: {e}")
|
| 729 |
+
|
| 730 |
+
metadata_path = output_dir / "metadata.json"
|
| 731 |
+
metadata_path.write_text(json.dumps(metadata, indent=2))
|
| 732 |
+
typer.echo(f"\nExport complete. Metadata written to {metadata_path}")
|
| 733 |
+
typer.echo(f"Output directory: {output_dir}")
|
| 734 |
+
|
| 735 |
+
finally:
|
| 736 |
+
asr_model.decoder._rnnt_export = decoder_export_flag
|
| 737 |
+
|
| 738 |
+
|
| 739 |
+
if __name__ == "__main__":
|
| 740 |
+
app()
|
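The single-step joint decision package produced above is the piece a host app polls once per frame. A minimal smoke test for the export (a sketch, not part of the script: it assumes the export command has already run and the working directory is its output_dir; the input shapes come from the metadata the script writes):

# Sketch: load the exported single-step package and run one zero-input frame.
import coremltools as ct
import numpy as np

jd = ct.models.MLModel("parakeet_eou_joint_decision_single_step.mlpackage")
out = jd.predict({
    "encoder_step": np.zeros((1, 512, 1), dtype=np.float32),
    "decoder_step": np.zeros((1, 640, 1), dtype=np.float32),
})
print(out["token_id"], out["token_prob"])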
160ms/convert_streaming_encoder.py
ADDED
@@ -0,0 +1,193 @@
import torch
import torch.nn as nn
import coremltools as ct
import numpy as np
import typer
import argparse
import json
import shutil
from pathlib import Path
from typing import Tuple, List, Optional

from nemo.collections.asr.models import EncDecRNNTBPEModel

app = typer.Typer()

class LoopbackEncoderWrapper(nn.Module):
    """
    Wraps the entire Parakeet encoder (PreEncode + Conformer) for CoreML loopback streaming.

    Inputs:
    - audio_signal: [B, D, T] (mel spectrogram chunk)
    - audio_length: [B]
    - pre_cache: [B, D, pre_cache_size] (previous audio context)
    - cache_last_channel: [layers, B, cache_size, hidden]
    - cache_last_time: [layers, B, hidden, time_cache]
    - cache_last_channel_len: [B]

    Outputs:
    - encoded_output: [B, D_out, T_out]
    - encoded_length: [B]
    - new_pre_cache: [B, D, pre_cache_size]
    - new_cache_last_channel
    - new_cache_last_time
    - new_cache_last_channel_len
    """
    def __init__(self, encoder, pre_cache_size=16):
        super().__init__()
        self.encoder = encoder
        self.pre_cache_size = pre_cache_size

    def forward(
        self,
        audio_signal: torch.Tensor,
        audio_length: torch.Tensor,
        pre_cache: torch.Tensor,
        cache_last_channel: torch.Tensor,
        cache_last_time: torch.Tensor,
        cache_last_channel_len: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:

        # 1. Prepend pre_cache to audio_signal.
        # audio_signal: [B, D, T]
        # pre_cache: [B, D, T_cache]
        full_input = torch.cat([pre_cache, audio_signal], dim=2)
        full_length = audio_length + self.pre_cache_size

        # 2. Extract the NEW pre_cache (last N frames of full_input).
        # Done BEFORE processing because we want the raw audio context.
        new_pre_cache = full_input[:, :, -self.pre_cache_size:]

        # 3. Process with the encoder, passing NeMo's cache tensors through.
        encoded, encoded_len, new_cache_channel, new_cache_time, new_cache_len = self.encoder.cache_aware_stream_step(
            processed_signal=full_input,
            processed_signal_length=full_length,
            cache_last_channel=cache_last_channel,
            cache_last_time=cache_last_time,
            cache_last_channel_len=cache_last_channel_len
        )

        # 4. No extra output slicing is done here. In standard cache-aware
        # streaming the input is just the new chunk and the convolutions look at
        # cached past context; because this wrapper is stateless across calls,
        # that past context must be supplied explicitly, so passing
        # (pre_cache + chunk) is correct, and cache_aware_stream_step is relied
        # on to emit only the valid output frames.

        # Cast lengths to Int32 for CoreML.
        encoded_len_32 = encoded_len.to(dtype=torch.int32)
        new_channel_len_32 = new_cache_len.to(dtype=torch.int32)

        return encoded, encoded_len_32, new_pre_cache, new_cache_channel, new_cache_time, new_channel_len_32

def _coreml_convert(
    traced_model,
    inputs,
    outputs,
    compute_units=ct.ComputeUnit.CPU_ONLY
):
    return ct.convert(
        traced_model,
        inputs=inputs,
        outputs=outputs,
        compute_units=compute_units,
        minimum_deployment_target=ct.target.macOS14,
    )

def main():
    model_id: str = "nvidia/parakeet_realtime_eou_120m-v1"
    output_dir: str = "temp_swift_models/StreamingLoopback"
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"Loading model: {model_id}...")
    asr_model = EncDecRNNTBPEModel.from_pretrained(model_name=model_id)
    asr_model.eval()

    parser = argparse.ArgumentParser()
    parser.add_argument("--chunk-frames", type=int, default=17, help="Number of frames in the input chunk (e.g. 17 for 160ms, 129 for 1.28s)")
    args = parser.parse_args()

    encoder = asr_model.encoder

    # --- Configuration ---
    # 160ms chunk = 16 frames (but the preprocessor produces 17 with padding/centering)
    # 1.28s chunk = 128 frames (the preprocessor produces 129)
    chunk_size_in = args.chunk_frames
    mel_dim = 128
    hidden_dim = encoder.d_model  # 512
    num_layers = len(encoder.layers)  # 17

    # Cache sizes
    cache_channel_size = 70
    cache_time_size = 8
    pre_cache_size = 16

    print(f"Config: Chunk={chunk_size_in}, Mel={mel_dim}, Hidden={hidden_dim}, Layers={num_layers}")
    print(f"Cache: Channel={cache_channel_size}, Time={cache_time_size}, Pre={pre_cache_size}")

    # --- Wrapper ---
    wrapper = LoopbackEncoderWrapper(encoder, pre_cache_size=pre_cache_size)
    wrapper.eval()

    # --- Test Inputs (for Tracing) ---
    batch_size = 1
    test_mel = torch.randn(batch_size, mel_dim, chunk_size_in)
    test_mel_len = torch.tensor([chunk_size_in], dtype=torch.int32)
    test_pre_cache = torch.zeros(batch_size, mel_dim, pre_cache_size)

    # Initial cache (zeros)
    test_cache_channel = torch.zeros(num_layers, batch_size, cache_channel_size, hidden_dim)
    test_cache_time = torch.zeros(num_layers, batch_size, hidden_dim, cache_time_size)
    test_cache_len = torch.zeros(batch_size, dtype=torch.int32)

    print("Tracing model...")
    traced_model = torch.jit.trace(
        wrapper,
        (test_mel, test_mel_len, test_pre_cache, test_cache_channel, test_cache_time, test_cache_len),
        strict=False
    )

    # --- CoreML Conversion ---
    print("Converting to CoreML...")

    inputs = [
        ct.TensorType(name="audio_signal", shape=(1, 128, chunk_size_in), dtype=np.float32),
        ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
        ct.TensorType(name="pre_cache", shape=(1, 128, pre_cache_size), dtype=np.float32),
        ct.TensorType(name="cache_last_channel", shape=(num_layers, 1, cache_channel_size, hidden_dim), dtype=np.float32),
        ct.TensorType(name="cache_last_time", shape=(num_layers, 1, hidden_dim, cache_time_size), dtype=np.float32),
        ct.TensorType(name="cache_last_channel_len", shape=(1,), dtype=np.int32),
    ]

    outputs = [
        ct.TensorType(name="encoded_output", dtype=np.float32),
        ct.TensorType(name="encoded_length", dtype=np.int32),
        ct.TensorType(name="new_pre_cache", dtype=np.float32),
        ct.TensorType(name="new_cache_last_channel", dtype=np.float32),
        ct.TensorType(name="new_cache_last_time", dtype=np.float32),
        ct.TensorType(name="new_cache_last_channel_len", dtype=np.int32),
    ]

    mlmodel = _coreml_convert(traced_model, inputs, outputs)

    save_path = output_path / "streaming_encoder.mlpackage"
    mlmodel.save(str(save_path))
    print(f"Saved: {save_path}")

    # The preprocessor, decoder, and joint are exported separately
    # (see convert_parakeet_eou.py); this script covers only the encoder loopback.

if __name__ == "__main__":
    main()
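The loopback design means the host owns all encoder state: every predict call returns fresh caches that are fed straight back into the next call. A host-side loop sketch (illustrative only; `mel_chunks` is a hypothetical iterable of [1, 128, chunk_frames] float32 mel arrays, the path matches the script's default output_dir, and the cache shapes come from the configuration above):

# Sketch of the loopback loop this export is designed for.
import numpy as np
import coremltools as ct

enc = ct.models.MLModel("temp_swift_models/StreamingLoopback/streaming_encoder.mlpackage")

state = {
    "pre_cache": np.zeros((1, 128, 16), dtype=np.float32),
    "cache_last_channel": np.zeros((17, 1, 70, 512), dtype=np.float32),
    "cache_last_time": np.zeros((17, 1, 512, 8), dtype=np.float32),
    "cache_last_channel_len": np.zeros((1,), dtype=np.int32),
}
for chunk in mel_chunks:  # hypothetical mel-chunk source
    out = enc.predict({
        "audio_signal": chunk,
        "audio_length": np.array([chunk.shape[2]], dtype=np.int32),
        **state,
    })
    # Feed the returned caches back in for the next chunk.
    state = {
        "pre_cache": out["new_pre_cache"],
        "cache_last_channel": out["new_cache_last_channel"],
        "cache_last_time": out["new_cache_last_time"],
        "cache_last_channel_len": out["new_cache_last_channel_len"].astype(np.int32),
    }
    # out["encoded_output"] holds the new encoder frames for this chunk.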
160ms/decoder.mlmodelc/analytics/coremldata.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3996975a8cbc1949159c55605b3132b39b2484f51acbd55d796d93c70de02b49
size 243
160ms/decoder.mlmodelc/coremldata.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c3ccbff963d8cf07e2be2bd56ea3384a89ea49628922c6bd95ff62e2ae57dc34
size 497
160ms/decoder.mlmodelc/metadata.json
ADDED
@@ -0,0 +1,118 @@
[
  {
    "metadataOutputVersion" : "3.0",
    "shortDescription" : "Parakeet EOU decoder (RNNT prediction network)",
    "outputSchema" : [
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
        "shortDescription" : "",
        "shape" : "[1, 640, 1]",
        "name" : "decoder",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
        "shortDescription" : "",
        "shape" : "[1, 1, 640]",
        "name" : "h_out",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
        "shortDescription" : "",
        "shape" : "[1, 1, 640]",
        "name" : "c_out",
        "type" : "MultiArray"
      }
    ],
    "storagePrecision" : "Float16",
    "modelParameters" : [

    ],
    "author" : "Fluid Inference",
    "specificationVersion" : 8,
    "mlProgramOperationTypeHistogram" : {
      "Ios17.squeeze" : 2,
      "Ios17.gather" : 1,
      "Ios17.cast" : 6,
      "Ios17.lstm" : 1,
      "Ios17.transpose" : 2,
      "Identity" : 1,
      "Ios17.expandDims" : 2
    },
    "computePrecision" : "Mixed (Float16, Float32, Int16, Int32)",
    "isUpdatable" : "0",
    "stateSchema" : [

    ],
    "availability" : {
      "macOS" : "14.0",
      "tvOS" : "17.0",
      "visionOS" : "1.0",
      "watchOS" : "10.0",
      "iOS" : "17.0",
      "macCatalyst" : "17.0"
    },
    "modelType" : {
      "name" : "MLModelType_mlProgram"
    },
    "inputSchema" : [
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Int32",
        "formattedType" : "MultiArray (Int32 1 × 1)",
        "shortDescription" : "",
        "shape" : "[1, 1]",
        "name" : "targets",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Int32",
        "formattedType" : "MultiArray (Int32 1)",
        "shortDescription" : "",
        "shape" : "[1]",
        "name" : "target_length",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
        "shortDescription" : "",
        "shape" : "[1, 1, 640]",
        "name" : "h_in",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
        "shortDescription" : "",
        "shape" : "[1, 1, 640]",
        "name" : "c_in",
        "type" : "MultiArray"
      }
    ],
    "userDefinedMetadata" : {
      "com.github.apple.coremltools.version" : "8.3.0",
      "com.github.apple.coremltools.source" : "torch==2.4.0",
      "com.github.apple.coremltools.source_dialect" : "TorchScript"
    },
    "generatedClassName" : "parakeet_eou_decoder",
    "method" : "predict"
  }
]
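Given this schema, one prediction-network step from Python looks like the sketch below (illustrative; the package path is the sibling 160ms/decoder.mlpackage in this repo, and `blank_id` is a placeholder that should be read from the exported metadata.json):

# Sketch: one decoder (prediction network) step against this schema.
import numpy as np
import coremltools as ct

dec = ct.models.MLModel("160ms/decoder.mlpackage")
blank_id = 0  # placeholder: use the blank_id recorded in metadata.json
out = dec.predict({
    "targets": np.array([[blank_id]], dtype=np.int32),
    "target_length": np.array([1], dtype=np.int32),
    "h_in": np.zeros((1, 1, 640), dtype=np.float32),
    "c_in": np.zeros((1, 1, 640), dtype=np.float32),
})
# out["decoder"]: [1, 640, 1]; out["h_out"], out["c_out"]: [1, 1, 640]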
160ms/decoder.mlmodelc/model.mil
ADDED
@@ -0,0 +1,45 @@
program(1.0)
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
{
    func main<ios17>(tensor<fp32, [1, 1, 640]> c_in, tensor<fp32, [1, 1, 640]> h_in, tensor<int32, [1]> target_length, tensor<int32, [1, 1]> targets) {
        tensor<int32, []> y_axis_0 = const()[name = tensor<string, []>("y_axis_0"), val = tensor<int32, []>(0)];
        tensor<int32, []> y_batch_dims_0 = const()[name = tensor<string, []>("y_batch_dims_0"), val = tensor<int32, []>(0)];
        tensor<bool, []> y_validate_indices_0 = const()[name = tensor<string, []>("y_validate_indices_0"), val = tensor<bool, []>(false)];
        tensor<fp16, [1027, 640]> module_prediction_embed_weight_to_fp16 = const()[name = tensor<string, []>("module_prediction_embed_weight_to_fp16"), val = tensor<fp16, [1027, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
        tensor<string, []> targets_to_int16_dtype_0 = const()[name = tensor<string, []>("targets_to_int16_dtype_0"), val = tensor<string, []>("int16")];
        tensor<int16, [1, 1]> targets_to_int16 = cast(dtype = targets_to_int16_dtype_0, x = targets)[name = tensor<string, []>("cast_8")];
        tensor<fp16, [1, 1, 640]> y_cast_fp16_cast_uint16 = gather(axis = y_axis_0, batch_dims = y_batch_dims_0, indices = targets_to_int16, validate_indices = y_validate_indices_0, x = module_prediction_embed_weight_to_fp16)[name = tensor<string, []>("y_cast_fp16_cast_uint16")];
        tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
        tensor<int32, [1]> input_lstm_h0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_h0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
        tensor<string, []> h_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("h_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
        tensor<fp16, [1, 1, 640]> h_in_to_fp16 = cast(dtype = h_in_to_fp16_dtype_0, x = h_in)[name = tensor<string, []>("cast_7")];
        tensor<fp16, [1, 640]> input_lstm_h0_squeeze_cast_fp16 = squeeze(axes = input_lstm_h0_squeeze_axes_0, x = h_in_to_fp16)[name = tensor<string, []>("input_lstm_h0_squeeze_cast_fp16")];
        tensor<int32, [1]> input_lstm_c0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_c0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
        tensor<string, []> c_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("c_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
        tensor<fp16, [1, 1, 640]> c_in_to_fp16 = cast(dtype = c_in_to_fp16_dtype_0, x = c_in)[name = tensor<string, []>("cast_6")];
        tensor<fp16, [1, 640]> input_lstm_c0_squeeze_cast_fp16 = squeeze(axes = input_lstm_c0_squeeze_axes_0, x = c_in_to_fp16)[name = tensor<string, []>("input_lstm_c0_squeeze_cast_fp16")];
        tensor<string, []> input_direction_0 = const()[name = tensor<string, []>("input_direction_0"), val = tensor<string, []>("forward")];
        tensor<bool, []> input_output_sequence_0 = const()[name = tensor<string, []>("input_output_sequence_0"), val = tensor<bool, []>(true)];
        tensor<string, []> input_recurrent_activation_0 = const()[name = tensor<string, []>("input_recurrent_activation_0"), val = tensor<string, []>("sigmoid")];
        tensor<string, []> input_cell_activation_0 = const()[name = tensor<string, []>("input_cell_activation_0"), val = tensor<string, []>("tanh")];
        tensor<string, []> input_activation_0 = const()[name = tensor<string, []>("input_activation_0"), val = tensor<string, []>("tanh")];
        tensor<fp16, [2560, 640]> concat_1_to_fp16 = const()[name = tensor<string, []>("concat_1_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1314688)))];
        tensor<fp16, [2560, 640]> concat_2_to_fp16 = const()[name = tensor<string, []>("concat_2_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4591552)))];
        tensor<fp16, [2560]> concat_0_to_fp16 = const()[name = tensor<string, []>("concat_0_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7868416)))];
        tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = y_cast_fp16_cast_uint16)[name = tensor<string, []>("transpose_2")];
        tensor<fp16, [1, 1, 640]> input_cast_fp16_0, tensor<fp16, [1, 640]> input_cast_fp16_1, tensor<fp16, [1, 640]> input_cast_fp16_2 = lstm(activation = input_activation_0, bias = concat_0_to_fp16, cell_activation = input_cell_activation_0, direction = input_direction_0, initial_c = input_lstm_c0_squeeze_cast_fp16, initial_h = input_lstm_h0_squeeze_cast_fp16, output_sequence = input_output_sequence_0, recurrent_activation = input_recurrent_activation_0, weight_hh = concat_2_to_fp16, weight_ih = concat_1_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
        tensor<int32, [1]> obj_3_axes_0 = const()[name = tensor<string, []>("obj_3_axes_0"), val = tensor<int32, [1]>([0])];
        tensor<fp16, [1, 1, 640]> obj_3_cast_fp16 = expand_dims(axes = obj_3_axes_0, x = input_cast_fp16_1)[name = tensor<string, []>("obj_3_cast_fp16")];
        tensor<string, []> obj_3_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_3_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
        tensor<int32, [1]> obj_axes_0 = const()[name = tensor<string, []>("obj_axes_0"), val = tensor<int32, [1]>([0])];
        tensor<fp16, [1, 1, 640]> obj_cast_fp16 = expand_dims(axes = obj_axes_0, x = input_cast_fp16_2)[name = tensor<string, []>("obj_cast_fp16")];
        tensor<string, []> obj_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
        tensor<int32, [3]> transpose_0_perm_0 = const()[name = tensor<string, []>("transpose_0_perm_0"), val = tensor<int32, [3]>([1, 2, 0])];
        tensor<string, []> transpose_0_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("transpose_0_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
        tensor<fp16, [1, 640, 1]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = input_cast_fp16_0)[name = tensor<string, []>("transpose_1")];
        tensor<fp32, [1, 640, 1]> decoder = cast(dtype = transpose_0_cast_fp16_to_fp32_dtype_0, x = transpose_0_cast_fp16)[name = tensor<string, []>("cast_3")];
        tensor<fp32, [1, 1, 640]> c_out = cast(dtype = obj_cast_fp16_to_fp32_dtype_0, x = obj_cast_fp16)[name = tensor<string, []>("cast_4")];
        tensor<fp32, [1, 1, 640]> h_out = cast(dtype = obj_3_cast_fp16_to_fp32_dtype_0, x = obj_3_cast_fp16)[name = tensor<string, []>("cast_5")];
        tensor<int32, [1]> target_length_tmp = identity(x = target_length)[name = tensor<string, []>("target_length_tmp")];
    } -> (decoder, h_out, c_out);
}
160ms/decoder.mlmodelc/weights/weight.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0b4cacecdcd9df79ab1e56de67230baf5a8664d2afe0bb8f3408eefa972cb2f4
size 7873600
160ms/decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:09f2dbd1f6a06faa6995f71d4b25d7c446996b6059cfac5ecc889853bdc7c6e5
size 6728
160ms/decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0b4cacecdcd9df79ab1e56de67230baf5a8664d2afe0bb8f3408eefa972cb2f4
size 7873600
160ms/decoder.mlpackage/Manifest.json
ADDED
@@ -0,0 +1,18 @@
{
  "fileFormatVersion": "1.0.0",
  "itemInfoEntries": {
    "8201D73A-2B5D-488C-9C2B-7E2E75DF700D": {
      "author": "com.apple.CoreML",
      "description": "CoreML Model Weights",
      "name": "weights",
      "path": "com.apple.CoreML/weights"
    },
    "F8EEBE8D-F17D-4556-B8DF-9BC11701B36D": {
      "author": "com.apple.CoreML",
      "description": "CoreML Model Specification",
      "name": "model.mlmodel",
      "path": "com.apple.CoreML/model.mlmodel"
    }
  },
  "rootModelIdentifier": "F8EEBE8D-F17D-4556-B8DF-9BC11701B36D"
}
160ms/individual_components.py
ADDED
@@ -0,0 +1,250 @@
#!/usr/bin/env python3
"""Export Parakeet Realtime EOU 120M RNNT components into CoreML."""
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Tuple

import coremltools as ct
import torch


@dataclass
class ExportSettings:
    output_dir: Path
    compute_units: ct.ComputeUnit
    deployment_target: Optional[ct.target]
    compute_precision: Optional[ct.precision]
    max_audio_seconds: float
    max_symbol_steps: int


class PreprocessorWrapper(torch.nn.Module):
    """Wrapper for the audio preprocessor (mel spectrogram extraction)."""

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self, audio_signal: torch.Tensor, length: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        mel, mel_length = self.module(
            input_signal=audio_signal, length=length.to(dtype=torch.long)
        )
        return mel, mel_length


class EncoderWrapper(torch.nn.Module):
    """Wrapper for the cache-aware FastConformer encoder.

    Note: For the realtime EOU model, the encoder is cache-aware, which means
    it can operate in a streaming fashion. For CoreML export, we export
    without cache state for simplicity (full-context mode).
    """

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self, features: torch.Tensor, length: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        encoded, encoded_lengths = self.module(
            audio_signal=features, length=length.to(dtype=torch.long)
        )
        # Synthesize per-frame timestamps (seconds) using the 80 ms encoder stride.
        # Shape: [B, T_enc]
        frame_times = (
            torch.arange(encoded.shape[-1], device=encoded.device, dtype=torch.float32)
            * 0.08
        )
        return encoded, encoded_lengths, frame_times


class DecoderWrapper(torch.nn.Module):
    """Wrapper for the RNNT prediction network (decoder)."""

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self,
        targets: torch.Tensor,
        target_lengths: torch.Tensor,
        h_in: torch.Tensor,
        c_in: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        state = [h_in, c_in]
        decoder_output, _, new_state = self.module(
            targets=targets.to(dtype=torch.long),
            target_length=target_lengths.to(dtype=torch.long),
            states=state,
        )
        return decoder_output, new_state[0], new_state[1]


class JointWrapper(torch.nn.Module):
    """Wrapper for the RNNT joint network.

    Note: Unlike Parakeet TDT v3, the realtime EOU model does NOT have
    duration outputs (num_extra_outputs). The joint network outputs only
    token logits over the vocabulary + blank.
    """

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor
    ) -> torch.Tensor:
        # Input: encoder_outputs [B, D, T], decoder_outputs [B, D, U]
        # Transpose to match what the projection layers expect
        encoder_outputs = encoder_outputs.transpose(1, 2)  # [B, T, D]
        decoder_outputs = decoder_outputs.transpose(1, 2)  # [B, U, D]

        # Apply projections
        enc_proj = self.module.enc(encoder_outputs)  # [B, T, joint_hidden]
        dec_proj = self.module.pred(decoder_outputs)  # [B, U, joint_hidden]

        # Explicit broadcasting along T and U to avoid converter ambiguity
        x = enc_proj.unsqueeze(2) + dec_proj.unsqueeze(1)  # [B, T, U, joint_hidden]
        x = self.module.joint_net[0](x)  # ReLU
        x = self.module.joint_net[1](x)  # Dropout (no-op in eval)
        out = self.module.joint_net[2](x)  # Linear -> logits [B, T, U, vocab+blank]
        return out


class MelEncoderWrapper(torch.nn.Module):
    """Fused wrapper: waveform -> mel -> encoder.

    Inputs:
    - audio_signal: [B, S]
    - audio_length: [B]

    Outputs:
    - encoder: [B, D, T_enc]
    - encoder_length: [B]
    - frame_times: [T_enc]
    """

    def __init__(
        self, preprocessor: PreprocessorWrapper, encoder: EncoderWrapper
    ) -> None:
        super().__init__()
        self.preprocessor = preprocessor
        self.encoder = encoder

    def forward(
        self, audio_signal: torch.Tensor, audio_length: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        mel, mel_length = self.preprocessor(audio_signal, audio_length)
        encoded, enc_len, frame_times = self.encoder(mel, mel_length.to(dtype=torch.int32))
        return encoded, enc_len, frame_times


class JointDecisionWrapper(torch.nn.Module):
    """Joint + decision head: outputs label id and label prob.

    Unlike Parakeet TDT v3, this model does NOT have duration outputs.

    Inputs:
    - encoder_outputs: [B, D, T]
    - decoder_outputs: [B, D, U]

    Returns:
    - token_id: [B, T, U] int32
    - token_prob: [B, T, U] float32
    """

    def __init__(self, joint: JointWrapper, vocab_size: int) -> None:
        super().__init__()
        self.joint = joint
        self.vocab_with_blank = int(vocab_size) + 1

    def forward(self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor):
        logits = self.joint(encoder_outputs, decoder_outputs)

        # Token selection
        token_ids = torch.argmax(logits, dim=-1).to(dtype=torch.int32)
        token_probs_all = torch.softmax(logits, dim=-1)
        # gather expects int64 (long) indices; cast only for gather
        token_prob = torch.gather(
            token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
        ).squeeze(-1)

        return token_ids, token_prob


class JointDecisionSingleStep(torch.nn.Module):
    """Single-step variant for streaming: encoder_step -> token decision.

    Inputs:
    - encoder_step: [B=1, D, T=1]
    - decoder_step: [B=1, D, U=1]

    Returns:
    - token_id: [1, 1, 1] int32
    - token_prob: [1, 1, 1] float32
    - top_k_ids: [1, 1, 1, K] int32
    - top_k_logits: [1, 1, 1, K] float32
    """

    def __init__(self, joint: JointWrapper, vocab_size: int, top_k: int = 64) -> None:
        super().__init__()
        self.joint = joint
        self.vocab_with_blank = int(vocab_size) + 1
        self.top_k = int(top_k)

    def forward(self, encoder_step: torch.Tensor, decoder_step: torch.Tensor):
        # Reuse JointWrapper, which expects [B, D, T] and [B, D, U]
        logits = self.joint(encoder_step, decoder_step)  # [1, 1, 1, V+blank]

        token_ids = torch.argmax(logits, dim=-1, keepdim=False).to(dtype=torch.int32)
        token_probs_all = torch.softmax(logits, dim=-1)
        token_prob = torch.gather(
            token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
        ).squeeze(-1)

        # Also expose top-K candidates for host-side processing
        topk_logits, topk_ids_long = torch.topk(
            logits, k=min(self.top_k, logits.shape[-1]), dim=-1
        )
        topk_ids = topk_ids_long.to(dtype=torch.int32)
        return token_ids, token_prob, topk_ids, topk_logits


def _coreml_convert(
    traced: torch.jit.ScriptModule,
    inputs,
    outputs,
    settings: ExportSettings,
    compute_units_override: Optional[ct.ComputeUnit] = None,
    compute_precision: Optional[ct.precision] = None,
) -> ct.models.MLModel:
    cu = (
        compute_units_override
        if compute_units_override is not None
        else settings.compute_units
    )
    kwargs = {
        "convert_to": "mlprogram",
        "inputs": inputs,
        "outputs": outputs,
        "compute_units": cu,
    }
    print("Converting:", traced.__class__.__name__)
    print("Conversion kwargs:", kwargs)
    if settings.deployment_target is not None:
        kwargs["minimum_deployment_target"] = settings.deployment_target

    # Priority: explicit argument > settings
    if compute_precision is not None:
        kwargs["compute_precision"] = compute_precision
    elif settings.compute_precision is not None:
        kwargs["compute_precision"] = settings.compute_precision

    return ct.convert(traced, **kwargs)
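These wrappers compose into the standard greedy RNNT loop on the host side. A pure-PyTorch sketch under the shapes this export uses (decoder state [1, 1, 640] per the decoder metadata; `decoder`, `jd_single`, `encoder_out`, and `blank_id` are assumed to come from the surrounding export code, so this is illustrative rather than part of the file):

# Sketch: greedy RNNT decoding with DecoderWrapper + JointDecisionSingleStep.
import torch

def greedy_rnnt_decode(decoder, jd_single, encoder_out, blank_id,
                       decoder_layers=1, decoder_hidden=640, max_symbols=10):
    # One decoder step per emitted token; advance to the next encoder frame on blank.
    h = torch.zeros(decoder_layers, 1, decoder_hidden)
    c = torch.zeros_like(h)
    last_token = torch.tensor([[blank_id]], dtype=torch.int32)
    length = torch.tensor([1], dtype=torch.int32)
    hypothesis = []
    for t in range(encoder_out.shape[-1]):
        enc_step = encoder_out[:, :, t : t + 1]
        for _ in range(max_symbols):
            dec_out, h_new, c_new = decoder(last_token, length, h, c)
            token_id, token_prob, _, _ = jd_single(enc_step, dec_out[:, :, :1])
            tok = int(token_id.reshape(-1)[0])
            if tok == blank_id:
                break  # move on to the next encoder frame
            hypothesis.append(tok)
            last_token = torch.tensor([[tok]], dtype=torch.int32)
            h, c = h_new, c_new  # advance prediction-network state only on emission
    return hypothesis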
160ms/joint_decision.mlmodelc/analytics/coremldata.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5bca32ad130dcad6605cc00044c752aa5b45ef57d14c17f2d1a2fa49d6cf55b5
size 243
160ms/joint_decision.mlmodelc/coremldata.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:22d4abc4625b935ee035b5f8ce7cb28d1041b9b01c12173e287bf4b5f5d99625
size 493
160ms/joint_decision.mlmodelc/metadata.json
ADDED
@@ -0,0 +1,112 @@
[
  {
    "metadataOutputVersion" : "3.0",
    "shortDescription" : "Parakeet EOU single-step joint decision",
    "outputSchema" : [
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Int32",
        "formattedType" : "MultiArray (Int32 1 × 1 × 1)",
        "shortDescription" : "",
        "shape" : "[1, 1, 1]",
        "name" : "token_id",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 1)",
        "shortDescription" : "",
        "shape" : "[1, 1, 1]",
        "name" : "token_prob",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Int32",
        "formattedType" : "MultiArray (Int32 1 × 1 × 1 × 64)",
        "shortDescription" : "",
        "shape" : "[1, 1, 1, 64]",
        "name" : "top_k_ids",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 1 × 64)",
        "shortDescription" : "",
        "shape" : "[1, 1, 1, 64]",
        "name" : "top_k_logits",
        "type" : "MultiArray"
      }
    ],
    "storagePrecision" : "Float16",
    "modelParameters" : [

    ],
    "author" : "Fluid Inference",
    "specificationVersion" : 8,
    "mlProgramOperationTypeHistogram" : {
      "Ios17.reduceArgmax" : 1,
      "Ios17.squeeze" : 1,
      "Ios17.cast" : 6,
      "Ios17.linear" : 3,
      "Ios17.transpose" : 2,
      "Ios17.add" : 1,
      "Ios16.relu" : 1,
      "Ios16.softmax" : 1,
      "Ios17.gatherAlongAxis" : 1,
      "Ios17.topk" : 1,
      "Ios17.expandDims" : 3
    },
    "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
    "isUpdatable" : "0",
    "stateSchema" : [

    ],
    "availability" : {
      "macOS" : "14.0",
      "tvOS" : "17.0",
      "visionOS" : "1.0",
      "watchOS" : "10.0",
      "iOS" : "17.0",
      "macCatalyst" : "17.0"
    },
    "modelType" : {
      "name" : "MLModelType_mlProgram"
    },
    "inputSchema" : [
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 512 × 1)",
        "shortDescription" : "",
        "shape" : "[1, 512, 1]",
        "name" : "encoder_step",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
        "shortDescription" : "",
        "shape" : "[1, 640, 1]",
        "name" : "decoder_step",
        "type" : "MultiArray"
      }
    ],
    "userDefinedMetadata" : {
      "com.github.apple.coremltools.source_dialect" : "TorchScript",
      "com.github.apple.coremltools.version" : "8.3.0",
      "com.github.apple.coremltools.source" : "torch==2.4.0"
    },
    "generatedClassName" : "parakeet_eou_joint_decision_single_step",
    "method" : "predict"
  }
]
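The top_k outputs exist so the host can make end-of-utterance decisions without recomputing the full-vocabulary softmax itself. A sketch of that check (illustrative; assumes `out` is a predict() result from this package and `eou_id` was read from the tokenizer entry the export writes into metadata.json):

# Sketch: approximate EOU probability from the top-k candidates.
import numpy as np

def eou_probability(out, eou_id):
    ids = out["top_k_ids"].reshape(-1)        # [64]
    logits = out["top_k_logits"].reshape(-1)  # [64]
    # Softmax over the top-k logits approximates the full softmax
    # when the remaining probability mass is negligible.
    p = np.exp(logits - logits.max())
    p /= p.sum()
    hits = p[ids == eou_id]
    return float(hits.sum()) if hits.size else 0.0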
160ms/joint_decision.mlmodelc/model.mil
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
+{
+    func main<ios17>(tensor<fp32, [1, 640, 1]> decoder_step, tensor<fp32, [1, 512, 1]> encoder_step) {
+        tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+        tensor<string, []> encoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
+        tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+        tensor<string, []> decoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("decoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
+        tensor<fp16, [640, 512]> joint_module_enc_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_weight_to_fp16"), val = tensor<fp16, [640, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+        tensor<fp16, [640]> joint_module_enc_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(655488)))];
+        tensor<fp16, [1, 512, 1]> encoder_step_to_fp16 = cast(dtype = encoder_step_to_fp16_dtype_0, x = encoder_step)[name = tensor<string, []>("cast_8")];
+        tensor<fp16, [1, 1, 512]> input_1_cast_fp16 = transpose(perm = input_1_perm_0, x = encoder_step_to_fp16)[name = tensor<string, []>("transpose_1")];
+        tensor<fp16, [1, 1, 640]> linear_0_cast_fp16 = linear(bias = joint_module_enc_bias_to_fp16, weight = joint_module_enc_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("linear_0_cast_fp16")];
+        tensor<fp16, [640, 640]> joint_module_pred_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_weight_to_fp16"), val = tensor<fp16, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(656832)))];
+        tensor<fp16, [640]> joint_module_pred_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1476096)))];
+        tensor<fp16, [1, 640, 1]> decoder_step_to_fp16 = cast(dtype = decoder_step_to_fp16_dtype_0, x = decoder_step)[name = tensor<string, []>("cast_7")];
+        tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = decoder_step_to_fp16)[name = tensor<string, []>("transpose_0")];
+        tensor<fp16, [1, 1, 640]> linear_1_cast_fp16 = linear(bias = joint_module_pred_bias_to_fp16, weight = joint_module_pred_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("linear_1_cast_fp16")];
+        tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
+        tensor<fp16, [1, 1, 1, 640]> var_23_cast_fp16 = expand_dims(axes = var_23_axes_0, x = linear_0_cast_fp16)[name = tensor<string, []>("op_23_cast_fp16")];
+        tensor<int32, [1]> var_24_axes_0 = const()[name = tensor<string, []>("op_24_axes_0"), val = tensor<int32, [1]>([1])];
+        tensor<fp16, [1, 1, 1, 640]> var_24_cast_fp16 = expand_dims(axes = var_24_axes_0, x = linear_1_cast_fp16)[name = tensor<string, []>("op_24_cast_fp16")];
+        tensor<fp16, [1, 1, 1, 640]> input_5_cast_fp16 = add(x = var_23_cast_fp16, y = var_24_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+        tensor<fp16, [1, 1, 1, 640]> input_7_cast_fp16 = relu(x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+        tensor<fp16, [1027, 640]> joint_module_joint_net_2_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_weight_to_fp16"), val = tensor<fp16, [1027, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1477440)))];
+        tensor<fp16, [1027]> joint_module_joint_net_2_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_bias_to_fp16"), val = tensor<fp16, [1027]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2792064)))];
+        tensor<fp16, [1, 1, 1, 1027]> linear_2_cast_fp16 = linear(bias = joint_module_joint_net_2_bias_to_fp16, weight = joint_module_joint_net_2_weight_to_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("linear_2_cast_fp16")];
+        tensor<int32, []> var_38_axis_0 = const()[name = tensor<string, []>("op_38_axis_0"), val = tensor<int32, []>(-1)];
+        tensor<bool, []> var_38_keep_dims_0 = const()[name = tensor<string, []>("op_38_keep_dims_0"), val = tensor<bool, []>(false)];
+        tensor<string, []> var_38_output_dtype_0 = const()[name = tensor<string, []>("op_38_output_dtype_0"), val = tensor<string, []>("int32")];
+        tensor<int32, [1, 1, 1]> token_id = reduce_argmax(axis = var_38_axis_0, keep_dims = var_38_keep_dims_0, output_dtype = var_38_output_dtype_0, x = linear_2_cast_fp16)[name = tensor<string, []>("op_38_cast_fp16")];
+        tensor<int32, []> var_44 = const()[name = tensor<string, []>("op_44"), val = tensor<int32, []>(-1)];
+        tensor<fp16, [1, 1, 1, 1027]> token_probs_all_cast_fp16 = softmax(axis = var_44, x = linear_2_cast_fp16)[name = tensor<string, []>("token_probs_all_cast_fp16")];
+        tensor<int32, [1]> var_53_axes_0 = const()[name = tensor<string, []>("op_53_axes_0"), val = tensor<int32, [1]>([-1])];
+        tensor<int32, [1, 1, 1, 1]> var_53 = expand_dims(axes = var_53_axes_0, x = token_id)[name = tensor<string, []>("op_53")];
+        tensor<int32, []> var_54 = const()[name = tensor<string, []>("op_54"), val = tensor<int32, []>(-1)];
+        tensor<bool, []> var_56_validate_indices_0 = const()[name = tensor<string, []>("op_56_validate_indices_0"), val = tensor<bool, []>(false)];
+        tensor<string, []> var_53_to_int16_dtype_0 = const()[name = tensor<string, []>("op_53_to_int16_dtype_0"), val = tensor<string, []>("int16")];
+        tensor<int16, [1, 1, 1, 1]> var_53_to_int16 = cast(dtype = var_53_to_int16_dtype_0, x = var_53)[name = tensor<string, []>("cast_6")];
+        tensor<fp16, [1, 1, 1, 1]> var_56_cast_fp16_cast_int16 = gather_along_axis(axis = var_54, indices = var_53_to_int16, validate_indices = var_56_validate_indices_0, x = token_probs_all_cast_fp16)[name = tensor<string, []>("op_56_cast_fp16_cast_int16")];
+        tensor<int32, [1]> var_58_axes_0 = const()[name = tensor<string, []>("op_58_axes_0"), val = tensor<int32, [1]>([-1])];
+        tensor<fp16, [1, 1, 1]> var_58_cast_fp16 = squeeze(axes = var_58_axes_0, x = var_56_cast_fp16_cast_int16)[name = tensor<string, []>("op_58_cast_fp16")];
+        tensor<string, []> var_58_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_58_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
+        tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(64)];
+        tensor<int32, []> var_63_axis_0 = const()[name = tensor<string, []>("op_63_axis_0"), val = tensor<int32, []>(-1)];
+        tensor<bool, []> var_63_ascending_0 = const()[name = tensor<string, []>("op_63_ascending_0"), val = tensor<bool, []>(false)];
+        tensor<bool, []> var_63_sort_0 = const()[name = tensor<string, []>("op_63_sort_0"), val = tensor<bool, []>(true)];
+        tensor<bool, []> var_63_return_indices_0 = const()[name = tensor<string, []>("op_63_return_indices_0"), val = tensor<bool, []>(true)];
+        tensor<string, []> var_63_cast_fp16_cast_int16_output_indices_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_cast_int16_output_indices_dtype_0"), val = tensor<string, []>("uint16")];
+        tensor<fp16, [1, 1, 1, 64]> var_63_cast_fp16_cast_int16_0, tensor<uint16, [1, 1, 1, 64]> var_63_cast_fp16_cast_int16_1 = topk(ascending = var_63_ascending_0, axis = var_63_axis_0, k = var_59, output_indices_dtype = var_63_cast_fp16_cast_int16_output_indices_dtype_0, return_indices = var_63_return_indices_0, sort = var_63_sort_0, x = linear_2_cast_fp16)[name = tensor<string, []>("op_63_cast_fp16_cast_int16")];
+        tensor<string, []> var_63_cast_fp16_cast_int16_1_to_int32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_cast_int16_1_to_int32_dtype_0"), val = tensor<string, []>("int32")];
+        tensor<string, []> var_63_cast_fp16_0_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_0_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
+        tensor<fp32, [1, 1, 1, 64]> top_k_logits = cast(dtype = var_63_cast_fp16_0_to_fp32_dtype_0, x = var_63_cast_fp16_cast_int16_0)[name = tensor<string, []>("cast_3")];
+        tensor<int32, [1, 1, 1, 64]> top_k_ids = cast(dtype = var_63_cast_fp16_cast_int16_1_to_int32_dtype_0, x = var_63_cast_fp16_cast_int16_1)[name = tensor<string, []>("cast_4")];
+        tensor<fp32, [1, 1, 1]> token_prob = cast(dtype = var_58_cast_fp16_to_fp32_dtype_0, x = var_58_cast_fp16)[name = tensor<string, []>("cast_5")];
+    } -> (token_id, token_prob, top_k_ids, top_k_logits);
+}
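The MIL above is a single greedy step of an RNN-T joint network: the 512-dim encoder frame and 640-dim prediction-network output are each projected to 640 dims, broadcast-added over the (T=1, U=1) grid, passed through ReLU and a final linear layer to 1027 logits, and the program emits the argmax token, its softmax probability, and the top-64 candidates. Below is a minimal PyTorch sketch of the same computation, reconstructed from the shapes and ops in the MIL; module and variable names are illustrative, not the original conversion code:

```python
import torch
import torch.nn as nn

class JointDecisionStep(nn.Module):
    """Sketch of the single-step joint/decision network encoded in the MIL."""

    def __init__(self, enc_dim=512, joint_dim=640, vocab_size=1027, top_k=64):
        super().__init__()
        self.enc = nn.Linear(enc_dim, joint_dim)           # [640, 512] weight in the MIL
        self.pred = nn.Linear(joint_dim, joint_dim)        # [640, 640] weight
        self.joint_net = nn.Linear(joint_dim, vocab_size)  # [1027, 640] weight
        self.top_k = top_k

    def forward(self, encoder_step, decoder_step):
        # encoder_step: [1, 512, 1], decoder_step: [1, 640, 1] (channels-first, one frame)
        f = self.enc(encoder_step.transpose(1, 2))         # -> [1, 1, 640]
        g = self.pred(decoder_step.transpose(1, 2))        # -> [1, 1, 640]
        # Broadcast-add (expand_dims axes 2 and 1 in the MIL), then the joint ReLU
        x = torch.relu(f.unsqueeze(2) + g.unsqueeze(1))    # -> [1, 1, 1, 640]
        logits = self.joint_net(x)                         # -> [1, 1, 1, 1027]
        token_id = logits.argmax(dim=-1)                   # greedy token, [1, 1, 1]
        probs = logits.softmax(dim=-1)
        token_prob = probs.gather(-1, token_id.unsqueeze(-1)).squeeze(-1)
        top_k_logits, top_k_ids = logits.topk(self.top_k, dim=-1)
        return token_id, token_prob, top_k_ids, top_k_logits
```

Returning the top-64 ids and logits alongside the greedy pick presumably lets the host application rescore or apply its own end-of-utterance thresholding without a second CoreML call.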
160ms/joint_decision.mlmodelc/weights/weight.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7039b2010a269153f5a96edf28637f921a86ef8822f248f2d6712f7a6bce84b4
+size 2794182
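A quick consistency check, arithmetic only: the six fp16 blobs referenced by the MIL above, assuming a 64-byte record before each payload plus 64 trailing bytes (inferred from the gaps between the MIL's offsets, not from a format spec), account exactly for the 2,794,182 bytes reported by this LFS pointer:

```python
# Blob shapes in MIL order: enc weight/bias, pred weight/bias, joint_net weight/bias.
blob_shapes = [(640, 512), (640,), (640, 640), (640,), (1027, 640), (1027,)]
size = 0
for shape in blob_shapes:
    count = 1
    for dim in shape:
        count *= dim
    size += 64 + 2 * count  # 64-byte record + fp16 payload (matches the MIL offsets)
size += 64                  # trailing record/padding (assumed)
assert size == 2794182, size
print("weight.bin size verified:", size)
```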
160ms/joint_decision.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25d4d7be6eeb60c7de1d3a1278a5a4700cbe34017e1a8c1cab33204ddb2e4d5e
+size 8701
160ms/joint_decision.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7039b2010a269153f5a96edf28637f921a86ef8822f248f2d6712f7a6bce84b4
+size 2794182
160ms/joint_decision.mlpackage/Manifest.json
ADDED
@@ -0,0 +1,18 @@
+{
+    "fileFormatVersion": "1.0.0",
+    "itemInfoEntries": {
+        "634E266B-4447-41D3-879E-F3611888F54B": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Weights",
+            "name": "weights",
+            "path": "com.apple.CoreML/weights"
+        },
+        "C7F40527-180B-45CD-BC12-4F054F2E5D9A": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Specification",
+            "name": "model.mlmodel",
+            "path": "com.apple.CoreML/model.mlmodel"
+        }
+    },
+    "rootModelIdentifier": "C7F40527-180B-45CD-BC12-4F054F2E5D9A"
+}
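With the mlpackage in place, a quick way to exercise the joint/decision step end-to-end is through coremltools. This is a hypothetical smoke test, not part of the repo; it assumes coremltools is installed and runs on macOS (prediction requires the CoreML runtime), with input/output names and shapes taken from the MIL `func main` signature:

```python
import numpy as np
import coremltools as ct

model = ct.models.MLModel("160ms/joint_decision.mlpackage")
outputs = model.predict({
    "encoder_step": np.zeros((1, 512, 1), dtype=np.float32),
    "decoder_step": np.zeros((1, 640, 1), dtype=np.float32),
})
print(int(outputs["token_id"].ravel()[0]),       # greedy token id
      float(outputs["token_prob"].ravel()[0]))   # its softmax probability
print(outputs["top_k_ids"].shape)                # (1, 1, 1, 64) candidates for rescoring
```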
160ms/parakeet_eou_preprocessor.mlmodelc/analytics/coremldata.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4ada8b0b99ac1d2ba7acbffacfbbf1a06cb69d30e9410d237ee0aa4c2b0ad63
+size 243