aoiandroid alexwengg committed
Commit 1b8ea0e · 0 parents

Duplicate from FluidInference/parakeet-realtime-eou-120m-coreml


Co-authored-by: Alex Weng <alexwengg@users.noreply.huggingface.co>

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the rest.
Files changed (50)
  1. .gitattributes +35 -0
  2. 1280ms/.DS_Store +0 -0
  3. 1280ms/convert_parakeet_eou.py +740 -0
  4. 1280ms/convert_streaming_encoder.py +193 -0
  5. 1280ms/decoder.mlmodelc/analytics/coremldata.bin +3 -0
  6. 1280ms/decoder.mlmodelc/coremldata.bin +3 -0
  7. 1280ms/decoder.mlmodelc/metadata.json +118 -0
  8. 1280ms/decoder.mlmodelc/model.mil +45 -0
  9. 1280ms/decoder.mlmodelc/weights/weight.bin +3 -0
  10. 1280ms/individual_components.py +250 -0
  11. 1280ms/joint_decision.mlmodelc/analytics/coremldata.bin +3 -0
  12. 1280ms/joint_decision.mlmodelc/coremldata.bin +3 -0
  13. 1280ms/joint_decision.mlmodelc/metadata.json +112 -0
  14. 1280ms/joint_decision.mlmodelc/model.mil +57 -0
  15. 1280ms/joint_decision.mlmodelc/weights/weight.bin +3 -0
  16. 1280ms/parakeet_eou_preprocessor.mlmodelc/analytics/coremldata.bin +3 -0
  17. 1280ms/parakeet_eou_preprocessor.mlmodelc/coremldata.bin +3 -0
  18. 1280ms/parakeet_eou_preprocessor.mlmodelc/metadata.json +105 -0
  19. 1280ms/parakeet_eou_preprocessor.mlmodelc/model.mil +96 -0
  20. 1280ms/parakeet_eou_preprocessor.mlmodelc/weights/weight.bin +3 -0
  21. 1280ms/streaming_encoder.mlmodelc/analytics/coremldata.bin +3 -0
  22. 1280ms/streaming_encoder.mlmodelc/coremldata.bin +3 -0
  23. 1280ms/streaming_encoder.mlmodelc/metadata.json +187 -0
  24. 1280ms/streaming_encoder.mlmodelc/model.mil +0 -0
  25. 1280ms/streaming_encoder.mlmodelc/weights/weight.bin +3 -0
  26. 1280ms/streaming_encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  27. 1280ms/streaming_encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  28. 1280ms/streaming_encoder.mlpackage/Manifest.json +18 -0
  29. 1280ms/vocab.json +1028 -0
  30. 160ms/.DS_Store +0 -0
  31. 160ms/convert_parakeet_eou.py +740 -0
  32. 160ms/convert_streaming_encoder.py +193 -0
  33. 160ms/decoder.mlmodelc/analytics/coremldata.bin +3 -0
  34. 160ms/decoder.mlmodelc/coremldata.bin +3 -0
  35. 160ms/decoder.mlmodelc/metadata.json +118 -0
  36. 160ms/decoder.mlmodelc/model.mil +45 -0
  37. 160ms/decoder.mlmodelc/weights/weight.bin +3 -0
  38. 160ms/decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  39. 160ms/decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  40. 160ms/decoder.mlpackage/Manifest.json +18 -0
  41. 160ms/individual_components.py +250 -0
  42. 160ms/joint_decision.mlmodelc/analytics/coremldata.bin +3 -0
  43. 160ms/joint_decision.mlmodelc/coremldata.bin +3 -0
  44. 160ms/joint_decision.mlmodelc/metadata.json +112 -0
  45. 160ms/joint_decision.mlmodelc/model.mil +57 -0
  46. 160ms/joint_decision.mlmodelc/weights/weight.bin +3 -0
  47. 160ms/joint_decision.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  48. 160ms/joint_decision.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  49. 160ms/joint_decision.mlpackage/Manifest.json +18 -0
  50. 160ms/parakeet_eou_preprocessor.mlmodelc/analytics/coremldata.bin +3 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
1280ms/.DS_Store ADDED
Binary file (6.15 kB)
1280ms/convert_parakeet_eou.py ADDED
@@ -0,0 +1,740 @@
#!/usr/bin/env python3
"""CLI for exporting Parakeet Realtime EOU 120M components to CoreML.

This model is a cache-aware streaming FastConformer-RNNT model optimized for
low-latency speech recognition with end-of-utterance detection.

Key differences from Parakeet TDT v3:
- Smaller model (120M vs 600M params)
- No duration outputs (standard RNNT, not TDT)
- Cache-aware streaming encoder (17 layers, attention context [70,1])
- Special <EOU> token for end-of-utterance detection
- Optimized for 80-160ms latency

Reference: https://huggingface.co/nvidia/parakeet_realtime_eou_120m-v1
"""
from __future__ import annotations

import json
from dataclasses import asdict
from pathlib import Path
from typing import Dict, Optional, Tuple

import coremltools as ct
import numpy as np
import soundfile as sf
import torch
import typer

import nemo.collections.asr as nemo_asr

from individual_components import (
    DecoderWrapper,
    EncoderWrapper,
    ExportSettings,
    JointWrapper,
    JointDecisionWrapper,
    JointDecisionSingleStep,
    PreprocessorWrapper,
    MelEncoderWrapper,
    _coreml_convert,
)


def apply_stft_patch():
    # Monkey patch coremltools.stft to handle extra arguments from newer torch versions
    try:
        import coremltools.converters.mil.frontend.torch.ops as torch_ops
        _original_stft = torch_ops.stft

        def patched_stft(context, node):
            if len(node.inputs) > 8:
                node.inputs = node.inputs[:8]
            return _original_stft(context, node)

        torch_ops.stft = patched_stft
        if "stft" in torch_ops._TORCH_OPS_REGISTRY:
            torch_ops._TORCH_OPS_REGISTRY["stft"] = patched_stft
        print("Monkey patched coremltools.stft for compatibility.")
    except Exception as e:
        print(f"Warning: Could not monkey patch stft: {e}")


DEFAULT_MODEL_ID = "nvidia/parakeet_realtime_eou_120m-v1"
AUTHOR = "Fluid Inference"


def _compute_length(seconds: float, sample_rate: int) -> int:
    return int(round(seconds * sample_rate))


def _prepare_audio(
    validation_audio: Optional[Path],
    sample_rate: int,
    max_samples: int,
    seed: Optional[int],
) -> torch.Tensor:
    if validation_audio is None:
        if seed is not None:
            torch.manual_seed(seed)
        audio = torch.randn(1, max_samples, dtype=torch.float32)
        return audio

    data, sr = sf.read(str(validation_audio), dtype="float32")
    if sr != sample_rate:
        raise typer.BadParameter(
            f"Validation audio sample rate {sr} does not match model rate {sample_rate}"
        )

    if data.ndim > 1:
        data = data[:, 0]

    if data.size == 0:
        raise typer.BadParameter("Validation audio is empty")

    if data.size < max_samples:
        pad_width = max_samples - data.size
        data = np.pad(data, (0, pad_width))
    elif data.size > max_samples:
        data = data[:max_samples]

    audio = torch.from_numpy(data).unsqueeze(0).to(dtype=torch.float32)
    return audio


def _save_mlpackage(model: ct.models.MLModel, path: Path, description: str) -> None:
    try:
        model.minimum_deployment_target = ct.target.iOS17
    except Exception:
        pass
    model.short_description = description
    model.author = AUTHOR
    path.parent.mkdir(parents=True, exist_ok=True)
    model.save(str(path))


def _tensor_shape(tensor: torch.Tensor) -> Tuple[int, ...]:
    return tuple(int(dim) for dim in tensor.shape)


def _parse_compute_units(name: str) -> ct.ComputeUnit:
    """Parse a human-friendly compute units string into ct.ComputeUnit."""
    normalized = str(name).strip().upper()
    mapping = {
        "ALL": ct.ComputeUnit.ALL,
        "CPU_ONLY": ct.ComputeUnit.CPU_ONLY,
        "CPU_AND_GPU": ct.ComputeUnit.CPU_AND_GPU,
        "CPU_AND_NE": ct.ComputeUnit.CPU_AND_NE,
        "CPU_AND_NEURALENGINE": ct.ComputeUnit.CPU_AND_NE,
    }
    if normalized not in mapping:
        raise typer.BadParameter(
            f"Unknown compute units '{name}'. Choose from: " + ", ".join(mapping.keys())
        )
    return mapping[normalized]


def _parse_compute_precision(name: Optional[str]) -> Optional[ct.precision]:
    """Parse compute precision string into ct.precision or None."""
    if name is None:
        return None
    normalized = str(name).strip().upper()
    if normalized == "":
        return None
    mapping = {
        "FLOAT32": ct.precision.FLOAT32,
        "FLOAT16": ct.precision.FLOAT16,
    }
    if normalized not in mapping:
        raise typer.BadParameter(
            f"Unknown compute precision '{name}'. Choose from: "
            + ", ".join(mapping.keys())
        )
    return mapping[normalized]


app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)


@app.command()
def convert(
    nemo_path: Optional[Path] = typer.Option(
        None,
        "--nemo-path",
        exists=True,
        resolve_path=True,
        help="Path to parakeet_realtime_eou_120m-v1.nemo checkpoint (skip to auto-download)",
    ),
    model_id: str = typer.Option(
        DEFAULT_MODEL_ID,
        "--model-id",
        help="Model identifier to download when --nemo-path is omitted",
    ),
    output_dir: Path = typer.Option(
        Path("parakeet_eou_coreml"),
        help="Directory where mlpackages and metadata will be written",
    ),
    preprocessor_cu: str = typer.Option(
        "CPU_ONLY",
        "--preprocessor-cu",
        help="Compute units for preprocessor (default CPU_ONLY)",
    ),
    mel_encoder_cu: str = typer.Option(
        "CPU_ONLY",
        "--mel-encoder-cu",
        help="Compute units for fused mel+encoder (default CPU_ONLY)",
    ),
    compute_precision: Optional[str] = typer.Option(
        None,
        "--compute-precision",
        help="Export precision: FLOAT32 (default) or FLOAT16 to shrink non-quantized weights.",
    ),
    max_audio_seconds: float = typer.Option(
        15.0,
        "--max-audio-seconds",
        help="Maximum audio duration in seconds for the fixed window export",
    ),
    validation_audio: Optional[Path] = typer.Option(
        None,
        "--validation-audio",
        exists=True,
        resolve_path=True,
        help="Path to a 16kHz WAV file for tracing (uses random if not provided)",
    ),
) -> None:
    """Export all Parakeet Realtime EOU sub-modules to CoreML.

    This exports the cache-aware streaming FastConformer-RNNT model for
    low-latency speech recognition with end-of-utterance detection.
    """
    export_settings = ExportSettings(
        output_dir=output_dir,
        compute_units=ct.ComputeUnit.CPU_ONLY,
        deployment_target=ct.target.iOS17,
        compute_precision=_parse_compute_precision(compute_precision),
        max_audio_seconds=max_audio_seconds,
        max_symbol_steps=1,
    )

    typer.echo("Export configuration:")
    typer.echo(asdict(export_settings))

    output_dir.mkdir(parents=True, exist_ok=True)
    pre_cu = _parse_compute_units(preprocessor_cu)
    melenc_cu = _parse_compute_units(mel_encoder_cu)

    if nemo_path is not None:
        typer.echo(f"Loading NeMo model from {nemo_path}…")
        # Try loading as generic ASRModel first, then specific class
        try:
            asr_model = nemo_asr.models.ASRModel.restore_from(
                str(nemo_path), map_location="cpu"
            )
        except Exception:
            # Fallback to EncDecRNNTBPEModel
            asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
                str(nemo_path), map_location="cpu"
            )
        checkpoint_meta = {
            "type": "file",
            "path": str(nemo_path),
        }
    else:
        typer.echo(f"Downloading NeMo model via {model_id}…")
        # Use ASRModel.from_pretrained as recommended for this model
        try:
            asr_model = nemo_asr.models.ASRModel.from_pretrained(
                model_id, map_location="cpu"
            )
        except Exception:
            # Fallback to EncDecRNNTBPEModel
            asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
                model_id, map_location="cpu"
            )
        checkpoint_meta = {
            "type": "pretrained",
            "model_id": model_id,
        }
    asr_model.eval()

    # Print model info
    typer.echo(f"Model class: {type(asr_model).__name__}")
    typer.echo(f"Encoder class: {type(asr_model.encoder).__name__}")

    sample_rate = int(asr_model.cfg.preprocessor.sample_rate)
    max_samples = _compute_length(export_settings.max_audio_seconds, sample_rate)

    # Prepare audio for tracing
    if validation_audio is not None:
        typer.echo(f"Using validation audio: {validation_audio}")
        audio_tensor = _prepare_audio(validation_audio, sample_rate, max_samples, seed=None)
    else:
        typer.echo("Using random audio for tracing (seed=42)")
        audio_tensor = _prepare_audio(None, sample_rate, max_samples, seed=42)

    audio_length = torch.tensor([max_samples], dtype=torch.int32)

    preprocessor = PreprocessorWrapper(asr_model.preprocessor.eval())
    encoder = EncoderWrapper(asr_model.encoder.eval())
    decoder = DecoderWrapper(asr_model.decoder.eval())
    joint = JointWrapper(asr_model.joint.eval())

    decoder_export_flag = getattr(asr_model.decoder, "_rnnt_export", False)
    asr_model.decoder._rnnt_export = True

    try:
        with torch.no_grad():
            mel_ref, mel_length_ref = preprocessor(audio_tensor, audio_length)
            mel_length_ref = mel_length_ref.to(dtype=torch.int32)
            encoder_ref, encoder_length_ref, frame_times_ref = encoder(
                mel_ref, mel_length_ref
            )
            encoder_length_ref = encoder_length_ref.to(dtype=torch.int32)

        # Clone tensors to drop inference flags
        mel_ref = mel_ref.clone().detach()
        mel_length_ref = mel_length_ref.clone().detach()
        encoder_ref = encoder_ref.clone().detach()
        encoder_length_ref = encoder_length_ref.clone().detach()
        frame_times_ref = frame_times_ref.clone().detach()

        vocab_size = int(asr_model.tokenizer.vocab_size)
        decoder_hidden = int(asr_model.decoder.pred_hidden)
        decoder_layers = int(asr_model.decoder.pred_rnn_layers)

        # Check if model has extra outputs (TDT-style duration)
        num_extra = getattr(asr_model.joint, "num_extra_outputs", 0)
        typer.echo(f"Vocab size: {vocab_size}, num_extra_outputs: {num_extra}")

        targets = torch.full(
            (1, export_settings.max_symbol_steps),
            fill_value=asr_model.decoder.blank_idx,
            dtype=torch.int32,
        )
        target_lengths = torch.tensor(
            [export_settings.max_symbol_steps], dtype=torch.int32
        )
        zero_state = torch.zeros(
            decoder_layers,
            1,
            decoder_hidden,
            dtype=torch.float32,
        )

        with torch.no_grad():
            decoder_ref, h_ref, c_ref = decoder(
                targets, target_lengths, zero_state, zero_state
            )
            joint_ref = joint(encoder_ref, decoder_ref)

        decoder_ref = decoder_ref.clone()
        h_ref = h_ref.clone()
        c_ref = c_ref.clone()
        joint_ref = joint_ref.clone()

        typer.echo(f"Encoder output shape: {encoder_ref.shape}")
        typer.echo(f"Decoder output shape: {decoder_ref.shape}")
        typer.echo(f"Joint output shape: {joint_ref.shape}")

        # === Export Preprocessor ===
        typer.echo("Tracing and converting preprocessor…")
        preprocessor = preprocessor.cpu()
        audio_tensor = audio_tensor.cpu()
        audio_length = audio_length.cpu()
        traced_preprocessor = torch.jit.trace(
            preprocessor, (audio_tensor, audio_length), strict=False
        )
        traced_preprocessor.eval()
        preprocessor_inputs = [
            ct.TensorType(
                name="audio_signal",
                shape=(1, ct.RangeDim(1, max_samples)),
                dtype=np.float32,
            ),
            ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
        ]
        preprocessor_outputs = [
            ct.TensorType(name="mel", dtype=np.float32),
            ct.TensorType(name="mel_length", dtype=np.int32),
        ]
        preprocessor_model = _coreml_convert(
            traced_preprocessor,
            preprocessor_inputs,
            preprocessor_outputs,
            export_settings,
            compute_units_override=pre_cu,
        )
        preprocessor_path = output_dir / "parakeet_eou_preprocessor.mlpackage"
        _save_mlpackage(
            preprocessor_model,
            preprocessor_path,
            f"Parakeet EOU preprocessor ({max_audio_seconds}s window)",
        )

        # === Export Encoder ===
        typer.echo("Tracing and converting encoder…")
        traced_encoder = torch.jit.trace(
            encoder, (mel_ref, mel_length_ref), strict=False
        )
        traced_encoder.eval()
        encoder_inputs = [
            ct.TensorType(
                name="mel", shape=_tensor_shape(mel_ref), dtype=np.float32
            ),
            ct.TensorType(name="mel_length", shape=(1,), dtype=np.int32),
        ]
        encoder_outputs = [
            ct.TensorType(name="encoder", dtype=np.float32),
            ct.TensorType(name="encoder_length", dtype=np.int32),
            ct.TensorType(name="frame_times", dtype=np.float32),
        ]
        encoder_model = _coreml_convert(
            traced_encoder,
            encoder_inputs,
            encoder_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        encoder_path = output_dir / "parakeet_eou_encoder.mlpackage"
        _save_mlpackage(
            encoder_model,
            encoder_path,
            f"Parakeet EOU encoder ({max_audio_seconds}s window)",
        )

        # === Export Fused Mel+Encoder ===
        typer.echo("Tracing and converting fused mel+encoder…")
        mel_encoder = MelEncoderWrapper(preprocessor, encoder)
        traced_mel_encoder = torch.jit.trace(
            mel_encoder, (audio_tensor, audio_length), strict=False
        )
        traced_mel_encoder.eval()
        mel_encoder_inputs = [
            ct.TensorType(
                name="audio_signal", shape=(1, max_samples), dtype=np.float32
            ),
            ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
        ]
        mel_encoder_outputs = [
            ct.TensorType(name="encoder", dtype=np.float32),
            ct.TensorType(name="encoder_length", dtype=np.int32),
            ct.TensorType(name="frame_times", dtype=np.float32),
        ]
        mel_encoder_model = _coreml_convert(
            traced_mel_encoder,
            mel_encoder_inputs,
            mel_encoder_outputs,
            export_settings,
            compute_units_override=melenc_cu,
        )
        mel_encoder_path = output_dir / "parakeet_eou_mel_encoder.mlpackage"
        _save_mlpackage(
            mel_encoder_model,
            mel_encoder_path,
            f"Parakeet EOU fused Mel+Encoder ({max_audio_seconds}s window)",
        )

        # === Export Decoder ===
        typer.echo("Tracing and converting decoder…")
        traced_decoder = torch.jit.trace(
            decoder,
            (targets, target_lengths, zero_state, zero_state),
            strict=False,
        )
        traced_decoder.eval()
        decoder_inputs = [
            ct.TensorType(
                name="targets", shape=_tensor_shape(targets), dtype=np.int32
            ),
            ct.TensorType(name="target_length", shape=(1,), dtype=np.int32),
            ct.TensorType(
                name="h_in", shape=_tensor_shape(zero_state), dtype=np.float32
            ),
            ct.TensorType(
                name="c_in", shape=_tensor_shape(zero_state), dtype=np.float32
            ),
        ]
        decoder_outputs = [
            ct.TensorType(name="decoder", dtype=np.float32),
            ct.TensorType(name="h_out", dtype=np.float32),
            ct.TensorType(name="c_out", dtype=np.float32),
        ]
        decoder_model = _coreml_convert(
            traced_decoder,
            decoder_inputs,
            decoder_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        decoder_path = output_dir / "parakeet_eou_decoder.mlpackage"
        _save_mlpackage(
            decoder_model,
            decoder_path,
            "Parakeet EOU decoder (RNNT prediction network)",
        )

        # === Export Joint ===
        typer.echo("Tracing and converting joint…")
        traced_joint = torch.jit.trace(
            joint,
            (encoder_ref, decoder_ref),
            strict=False,
        )
        traced_joint.eval()
        joint_inputs = [
            ct.TensorType(
                name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32
            ),
            ct.TensorType(
                name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32
            ),
        ]
        joint_outputs = [
            ct.TensorType(name="logits", dtype=np.float32),
        ]
        joint_model = _coreml_convert(
            traced_joint,
            joint_inputs,
            joint_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        joint_path = output_dir / "parakeet_eou_joint.mlpackage"
        _save_mlpackage(
            joint_model,
            joint_path,
            "Parakeet EOU joint network (RNNT)",
        )

        # === Export Joint Decision Head ===
        typer.echo("Tracing and converting joint decision head…")
        joint_decision = JointDecisionWrapper(joint, vocab_size=vocab_size)
        traced_joint_decision = torch.jit.trace(
            joint_decision,
            (encoder_ref, decoder_ref),
            strict=False,
        )
        traced_joint_decision.eval()
        joint_decision_inputs = [
            ct.TensorType(
                name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32
            ),
            ct.TensorType(
                name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32
            ),
        ]
        joint_decision_outputs = [
            ct.TensorType(name="token_id", dtype=np.int32),
            ct.TensorType(name="token_prob", dtype=np.float32),
        ]
        joint_decision_model = _coreml_convert(
            traced_joint_decision,
            joint_decision_inputs,
            joint_decision_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        joint_decision_path = output_dir / "parakeet_eou_joint_decision.mlpackage"
        _save_mlpackage(
            joint_decision_model,
            joint_decision_path,
            "Parakeet EOU joint + decision head (softmax, argmax)",
        )

        # === Export Single-Step Joint Decision ===
        typer.echo("Tracing and converting single-step joint decision…")
        jd_single = JointDecisionSingleStep(joint, vocab_size=vocab_size)
        # Create single-step slices from refs
        enc_step = encoder_ref[:, :, :1].contiguous()
        dec_step = decoder_ref[:, :, :1].contiguous()
        traced_jd_single = torch.jit.trace(
            jd_single,
            (enc_step, dec_step),
            strict=False,
        )
        traced_jd_single.eval()
        jd_single_inputs = [
            ct.TensorType(
                name="encoder_step",
                shape=(1, enc_step.shape[1], 1),
                dtype=np.float32,
            ),
            ct.TensorType(
                name="decoder_step",
                shape=(1, dec_step.shape[1], 1),
                dtype=np.float32,
            ),
        ]
        jd_single_outputs = [
            ct.TensorType(name="token_id", dtype=np.int32),
            ct.TensorType(name="token_prob", dtype=np.float32),
            ct.TensorType(name="top_k_ids", dtype=np.int32),
            ct.TensorType(name="top_k_logits", dtype=np.float32),
        ]
        jd_single_model = _coreml_convert(
            traced_jd_single,
            jd_single_inputs,
            jd_single_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        jd_single_path = output_dir / "parakeet_eou_joint_decision_single_step.mlpackage"
        _save_mlpackage(
            jd_single_model,
            jd_single_path,
            "Parakeet EOU single-step joint decision (current frame)",
        )

        # === Save Metadata ===
        metadata: Dict[str, object] = {
            "model_id": model_id,
            "model_name": "parakeet_realtime_eou_120m-v1",
            "model_class": type(asr_model).__name__,
            "encoder_class": type(asr_model.encoder).__name__,
            "sample_rate": sample_rate,
            "max_audio_seconds": export_settings.max_audio_seconds,
            "max_audio_samples": max_samples,
            "max_symbol_steps": export_settings.max_symbol_steps,
            "vocab_size": vocab_size,
            "vocab_with_blank": vocab_size + 1,
            "decoder_hidden": decoder_hidden,
            "decoder_layers": decoder_layers,
            "num_extra_outputs": num_extra,
            "has_eou_token": True,
            "checkpoint": checkpoint_meta,
            "coreml": {
                "compute_units": export_settings.compute_units.name,
                "compute_precision": (
                    export_settings.compute_precision.name
                    if export_settings.compute_precision is not None
                    else "FLOAT32"
                ),
            },
            "components": {
                "preprocessor": {
                    "inputs": {
                        "audio_signal": [1, max_samples],
                        "audio_length": [1],
                    },
                    "outputs": {
                        "mel": list(_tensor_shape(mel_ref)),
                        "mel_length": [1],
                    },
                    "path": preprocessor_path.name,
                },
                "encoder": {
                    "inputs": {
                        "mel": list(_tensor_shape(mel_ref)),
                        "mel_length": [1],
                    },
                    "outputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "encoder_length": [1],
                        "frame_times": [1, _tensor_shape(encoder_ref)[2]],
                    },
                    "path": encoder_path.name,
                },
                "mel_encoder": {
                    "inputs": {
                        "audio_signal": [1, max_samples],
                        "audio_length": [1],
                    },
                    "outputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "encoder_length": [1],
                        "frame_times": [1, _tensor_shape(encoder_ref)[2]],
                    },
                    "path": mel_encoder_path.name,
                },
                "decoder": {
                    "inputs": {
                        "targets": list(_tensor_shape(targets)),
                        "target_length": [1],
                        "h_in": list(_tensor_shape(zero_state)),
                        "c_in": list(_tensor_shape(zero_state)),
                    },
                    "outputs": {
                        "decoder": list(_tensor_shape(decoder_ref)),
                        "h_out": list(_tensor_shape(h_ref)),
                        "c_out": list(_tensor_shape(c_ref)),
                    },
                    "path": decoder_path.name,
                },
                "joint": {
                    "inputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "decoder": list(_tensor_shape(decoder_ref)),
                    },
                    "outputs": {
                        "logits": list(_tensor_shape(joint_ref)),
                    },
                    "path": joint_path.name,
                },
                "joint_decision": {
                    "inputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "decoder": list(_tensor_shape(decoder_ref)),
                    },
                    "outputs": {
                        "token_id": [
                            _tensor_shape(encoder_ref)[0],
                            _tensor_shape(encoder_ref)[2],
                            _tensor_shape(decoder_ref)[2],
                        ],
                        "token_prob": [
                            _tensor_shape(encoder_ref)[0],
                            _tensor_shape(encoder_ref)[2],
                            _tensor_shape(decoder_ref)[2],
                        ],
                    },
                    "path": joint_decision_path.name,
                },
                "joint_decision_single_step": {
                    "inputs": {
                        "encoder_step": [1, _tensor_shape(encoder_ref)[1], 1],
                        "decoder_step": [1, _tensor_shape(decoder_ref)[1], 1],
                    },
                    "outputs": {
                        "token_id": [1, 1, 1],
                        "token_prob": [1, 1, 1],
                        "top_k_ids": [1, 1, 1, 64],
                        "top_k_logits": [1, 1, 1, 64],
                    },
                    "path": jd_single_path.name,
                },
            },
        }

        # Export tokenizer vocab if available
        try:
            tokenizer = asr_model.tokenizer
            vocab = {
                "blank_id": int(asr_model.decoder.blank_idx),
                "vocab_size": vocab_size,
            }
            # Try to get special tokens
            if hasattr(tokenizer, "tokenizer"):
                inner_tokenizer = tokenizer.tokenizer
                if hasattr(inner_tokenizer, "get_vocab"):
                    full_vocab = inner_tokenizer.get_vocab()
                    # Find EOU token
                    eou_token = None
                    for token, idx in full_vocab.items():
                        if "<EOU>" in token.upper() or "eou" in token.lower():
                            eou_token = {"token": token, "id": idx}
                            break
                    if eou_token:
                        vocab["eou_token"] = eou_token
            metadata["tokenizer"] = vocab
        except Exception as e:
            typer.echo(f"Warning: Could not export tokenizer info: {e}")

        metadata_path = output_dir / "metadata.json"
        metadata_path.write_text(json.dumps(metadata, indent=2))
        typer.echo(f"\nExport complete. Metadata written to {metadata_path}")
        typer.echo(f"Output directory: {output_dir}")

    finally:
        asr_model.decoder._rnnt_export = decoder_export_flag


if __name__ == "__main__":
    app()
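
For reference, a typical invocation of the exporter above looks like the following; the flag values are illustrative, not the only supported ones. Since the Typer app defines a single command, convert runs without a subcommand name, and omitting --nemo-path takes the auto-download path shown above.

python convert_parakeet_eou.py --model-id nvidia/parakeet_realtime_eou_120m-v1 --max-audio-seconds 15.0 --compute-precision FLOAT16 --output-dir parakeet_eou_coreml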
1280ms/convert_streaming_encoder.py ADDED
@@ -0,0 +1,193 @@
import argparse
from pathlib import Path
from typing import Tuple

import coremltools as ct
import numpy as np
import torch
import torch.nn as nn

from nemo.collections.asr.models import EncDecRNNTBPEModel


class LoopbackEncoderWrapper(nn.Module):
    """
    Wraps the entire Parakeet Encoder (PreEncode + Conformer) for CoreML Loopback Streaming.

    Inputs:
    - audio_signal: [B, D, T] (Mel spectrogram chunk)
    - audio_length: [B]
    - pre_cache: [B, D, pre_cache_size] (Previous audio context)
    - cache_last_channel: [layers, B, cache_size, hidden]
    - cache_last_time: [layers, B, hidden, time_cache]
    - cache_last_channel_len: [B]

    Outputs:
    - encoded_output: [B, D_out, T_out]
    - encoded_length: [B]
    - new_pre_cache: [B, D, pre_cache_size]
    - new_cache_last_channel
    - new_cache_last_time
    - new_cache_last_channel_len
    """

    def __init__(self, encoder, pre_cache_size=16):
        super().__init__()
        self.encoder = encoder
        self.pre_cache_size = pre_cache_size

    def forward(
        self,
        audio_signal: torch.Tensor,
        audio_length: torch.Tensor,
        pre_cache: torch.Tensor,
        cache_last_channel: torch.Tensor,
        cache_last_time: torch.Tensor,
        cache_last_channel_len: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        # 1. Prepend pre_cache to audio_signal
        # audio_signal: [B, D, T]; pre_cache: [B, D, T_cache]
        full_input = torch.cat([pre_cache, audio_signal], dim=2)
        full_length = audio_length + self.pre_cache_size

        # 2. Extract the NEW pre_cache (last N frames of full_input).
        # This is done BEFORE processing because we want the raw audio context.
        new_pre_cache = full_input[:, :, -self.pre_cache_size:]

        # 3. Run the cache-aware encoder step with the supplied cache state.
        encoded, encoded_len, new_cache_channel, new_cache_time, new_cache_len = self.encoder.cache_aware_stream_step(
            processed_signal=full_input,
            processed_signal_length=full_length,
            cache_last_channel=cache_last_channel,
            cache_last_time=cache_last_time,
            cache_last_channel_len=cache_last_channel_len,
        )

        # 4. Output frames: cache_aware_stream_step is expected to manage the
        # valid output window itself. In standard usage the input is only the
        # new chunk while internal convolutions look at the past; because this
        # wrapper is stateless, that past context must be supplied explicitly,
        # so passing (pre_cache + chunk) is the correct input here.

        # Cast lengths to Int32 for CoreML
        encoded_len_32 = encoded_len.to(dtype=torch.int32)
        new_channel_len_32 = new_cache_len.to(dtype=torch.int32)

        return encoded, encoded_len_32, new_pre_cache, new_cache_channel, new_cache_time, new_channel_len_32


def _coreml_convert(
    traced_model,
    inputs,
    outputs,
    compute_units=ct.ComputeUnit.CPU_ONLY,
):
    return ct.convert(
        traced_model,
        inputs=inputs,
        outputs=outputs,
        compute_units=compute_units,
        minimum_deployment_target=ct.target.macOS14,
    )


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--chunk-frames",
        type=int,
        default=17,
        help="Number of frames in the input chunk (e.g. 17 for 160ms, 129 for 1.28s)",
    )
    args = parser.parse_args()

    model_id: str = "nvidia/parakeet_realtime_eou_120m-v1"
    output_dir: str = "temp_swift_models/StreamingLoopback"
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"Loading model: {model_id}...")
    asr_model = EncDecRNNTBPEModel.from_pretrained(model_name=model_id)
    asr_model.eval()

    encoder = asr_model.encoder

    # --- Configuration ---
    # 160ms chunk = 16 frames (but preprocessor produces 17 with padding/centering)
    # 1.28s chunk = 128 frames (preprocessor produces 129)
    chunk_size_in = args.chunk_frames
    mel_dim = 128
    hidden_dim = encoder.d_model  # 512
    num_layers = len(encoder.layers)  # 17

    # Cache sizes
    cache_channel_size = 70
    cache_time_size = 8
    pre_cache_size = 16

    print(f"Config: Chunk={chunk_size_in}, Mel={mel_dim}, Hidden={hidden_dim}, Layers={num_layers}")
    print(f"Cache: Channel={cache_channel_size}, Time={cache_time_size}, Pre={pre_cache_size}")

    # --- Wrapper ---
    wrapper = LoopbackEncoderWrapper(encoder, pre_cache_size=pre_cache_size)
    wrapper.eval()

    # --- Test Inputs (for Tracing) ---
    batch_size = 1
    test_mel = torch.randn(batch_size, mel_dim, chunk_size_in)
    test_mel_len = torch.tensor([chunk_size_in], dtype=torch.int32)
    test_pre_cache = torch.zeros(batch_size, mel_dim, pre_cache_size)

    # Initial Cache (Zeros)
    test_cache_channel = torch.zeros(num_layers, batch_size, cache_channel_size, hidden_dim)
    test_cache_time = torch.zeros(num_layers, batch_size, hidden_dim, cache_time_size)
    test_cache_len = torch.zeros(batch_size, dtype=torch.int32)

    print("Tracing model...")
    traced_model = torch.jit.trace(
        wrapper,
        (test_mel, test_mel_len, test_pre_cache, test_cache_channel, test_cache_time, test_cache_len),
        strict=False,
    )

    # --- CoreML Conversion ---
    print("Converting to CoreML...")

    inputs = [
        ct.TensorType(name="audio_signal", shape=(1, 128, chunk_size_in), dtype=np.float32),
        ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
        ct.TensorType(name="pre_cache", shape=(1, 128, pre_cache_size), dtype=np.float32),
        ct.TensorType(name="cache_last_channel", shape=(num_layers, 1, cache_channel_size, hidden_dim), dtype=np.float32),
        ct.TensorType(name="cache_last_time", shape=(num_layers, 1, hidden_dim, cache_time_size), dtype=np.float32),
        ct.TensorType(name="cache_last_channel_len", shape=(1,), dtype=np.int32),
    ]

    outputs = [
        ct.TensorType(name="encoded_output", dtype=np.float32),
        ct.TensorType(name="encoded_length", dtype=np.int32),
        ct.TensorType(name="new_pre_cache", dtype=np.float32),
        ct.TensorType(name="new_cache_last_channel", dtype=np.float32),
        ct.TensorType(name="new_cache_last_time", dtype=np.float32),
        ct.TensorType(name="new_cache_last_channel_len", dtype=np.int32),
    ]

    mlmodel = _coreml_convert(traced_model, inputs, outputs)

    save_path = output_path / "streaming_encoder.mlpackage"
    mlmodel.save(str(save_path))
    print(f"Saved: {save_path}")

    # The preprocessor, decoder, and joint are exported by the sibling script;
    # this script covers only the encoder loopback.


if __name__ == "__main__":
    main()
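
To make the loopback design concrete, the sketch below shows how a host could drive the exported package: each call's six cache outputs are fed straight back in as the next call's cache inputs. This is a minimal sketch under assumptions: the streaming_encoder.mlpackage produced by this script, the default 17-frame (160 ms) chunking, and random stand-in mel chunks where real preprocessor output would go.

import coremltools as ct
import numpy as np

model = ct.models.MLModel("temp_swift_models/StreamingLoopback/streaming_encoder.mlpackage")

num_layers, mel_dim, hidden = 17, 128, 512
state = {
    "pre_cache": np.zeros((1, mel_dim, 16), dtype=np.float32),
    "cache_last_channel": np.zeros((num_layers, 1, 70, hidden), dtype=np.float32),
    "cache_last_time": np.zeros((num_layers, 1, hidden, 8), dtype=np.float32),
    "cache_last_channel_len": np.zeros((1,), dtype=np.int32),
}

for _ in range(10):  # one iteration per 160 ms mel chunk
    mel_chunk = np.random.randn(1, mel_dim, 17).astype(np.float32)  # stand-in chunk
    out = model.predict({
        "audio_signal": mel_chunk,
        "audio_length": np.array([17], dtype=np.int32),
        **state,
    })
    encoded = out["encoded_output"]  # [1, D_out, T_out] for this chunk
    # Loop the updated caches back in for the next call.
    state = {
        "pre_cache": out["new_pre_cache"].astype(np.float32),
        "cache_last_channel": out["new_cache_last_channel"].astype(np.float32),
        "cache_last_time": out["new_cache_last_time"].astype(np.float32),
        "cache_last_channel_len": out["new_cache_last_channel_len"].astype(np.int32),
    }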
1280ms/decoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3996975a8cbc1949159c55605b3132b39b2484f51acbd55d796d93c70de02b49
size 243
1280ms/decoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c3ccbff963d8cf07e2be2bd56ea3384a89ea49628922c6bd95ff62e2ae57dc34
size 497
1280ms/decoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,118 @@
[
  {
    "metadataOutputVersion" : "3.0",
    "shortDescription" : "Parakeet EOU decoder (RNNT prediction network)",
    "outputSchema" : [
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
        "shortDescription" : "",
        "shape" : "[1, 640, 1]",
        "name" : "decoder",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
        "shortDescription" : "",
        "shape" : "[1, 1, 640]",
        "name" : "h_out",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
        "shortDescription" : "",
        "shape" : "[1, 1, 640]",
        "name" : "c_out",
        "type" : "MultiArray"
      }
    ],
    "storagePrecision" : "Float16",
    "modelParameters" : [

    ],
    "author" : "Fluid Inference",
    "specificationVersion" : 8,
    "mlProgramOperationTypeHistogram" : {
      "Ios17.squeeze" : 2,
      "Ios17.gather" : 1,
      "Ios17.cast" : 6,
      "Ios17.lstm" : 1,
      "Ios17.transpose" : 2,
      "Identity" : 1,
      "Ios17.expandDims" : 2
    },
    "computePrecision" : "Mixed (Float16, Float32, Int16, Int32)",
    "isUpdatable" : "0",
    "stateSchema" : [

    ],
    "availability" : {
      "macOS" : "14.0",
      "tvOS" : "17.0",
      "visionOS" : "1.0",
      "watchOS" : "10.0",
      "iOS" : "17.0",
      "macCatalyst" : "17.0"
    },
    "modelType" : {
      "name" : "MLModelType_mlProgram"
    },
    "inputSchema" : [
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Int32",
        "formattedType" : "MultiArray (Int32 1 × 1)",
        "shortDescription" : "",
        "shape" : "[1, 1]",
        "name" : "targets",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Int32",
        "formattedType" : "MultiArray (Int32 1)",
        "shortDescription" : "",
        "shape" : "[1]",
        "name" : "target_length",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
        "shortDescription" : "",
        "shape" : "[1, 1, 640]",
        "name" : "h_in",
        "type" : "MultiArray"
      },
      {
        "hasShapeFlexibility" : "0",
        "isOptional" : "0",
        "dataType" : "Float32",
        "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
        "shortDescription" : "",
        "shape" : "[1, 1, 640]",
        "name" : "c_in",
        "type" : "MultiArray"
      }
    ],
    "userDefinedMetadata" : {
      "com.github.apple.coremltools.version" : "8.3.0",
      "com.github.apple.coremltools.source" : "torch==2.4.0",
      "com.github.apple.coremltools.source_dialect" : "TorchScript"
    },
    "generatedClassName" : "parakeet_eou_decoder",
    "method" : "predict"
  }
]
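
As a quick sanity check of the schema above, the compiled .mlmodelc can be loaded directly with coremltools on macOS. A minimal sketch, assuming a recent coremltools release (CompiledMLModel is available from version 7 on); the token id 0 is an arbitrary stand-in, not a meaningful label:

import coremltools as ct
import numpy as np

decoder = ct.models.CompiledMLModel("1280ms/decoder.mlmodelc")
out = decoder.predict({
    "targets": np.array([[0]], dtype=np.int32),        # any valid token id
    "target_length": np.array([1], dtype=np.int32),
    "h_in": np.zeros((1, 1, 640), dtype=np.float32),   # zero LSTM state
    "c_in": np.zeros((1, 1, 640), dtype=np.float32),
})
print(out["decoder"].shape)  # expect (1, 640, 1) per the schema above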
1280ms/decoder.mlmodelc/model.mil ADDED
@@ -0,0 +1,45 @@
program(1.0)
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
{
    func main<ios17>(tensor<fp32, [1, 1, 640]> c_in, tensor<fp32, [1, 1, 640]> h_in, tensor<int32, [1]> target_length, tensor<int32, [1, 1]> targets) {
        tensor<int32, []> y_axis_0 = const()[name = tensor<string, []>("y_axis_0"), val = tensor<int32, []>(0)];
        tensor<int32, []> y_batch_dims_0 = const()[name = tensor<string, []>("y_batch_dims_0"), val = tensor<int32, []>(0)];
        tensor<bool, []> y_validate_indices_0 = const()[name = tensor<string, []>("y_validate_indices_0"), val = tensor<bool, []>(false)];
        tensor<fp16, [1027, 640]> module_prediction_embed_weight_to_fp16 = const()[name = tensor<string, []>("module_prediction_embed_weight_to_fp16"), val = tensor<fp16, [1027, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
        tensor<string, []> targets_to_int16_dtype_0 = const()[name = tensor<string, []>("targets_to_int16_dtype_0"), val = tensor<string, []>("int16")];
        tensor<int16, [1, 1]> targets_to_int16 = cast(dtype = targets_to_int16_dtype_0, x = targets)[name = tensor<string, []>("cast_8")];
        tensor<fp16, [1, 1, 640]> y_cast_fp16_cast_uint16 = gather(axis = y_axis_0, batch_dims = y_batch_dims_0, indices = targets_to_int16, validate_indices = y_validate_indices_0, x = module_prediction_embed_weight_to_fp16)[name = tensor<string, []>("y_cast_fp16_cast_uint16")];
        tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
        tensor<int32, [1]> input_lstm_h0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_h0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
        tensor<string, []> h_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("h_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
        tensor<fp16, [1, 1, 640]> h_in_to_fp16 = cast(dtype = h_in_to_fp16_dtype_0, x = h_in)[name = tensor<string, []>("cast_7")];
        tensor<fp16, [1, 640]> input_lstm_h0_squeeze_cast_fp16 = squeeze(axes = input_lstm_h0_squeeze_axes_0, x = h_in_to_fp16)[name = tensor<string, []>("input_lstm_h0_squeeze_cast_fp16")];
        tensor<int32, [1]> input_lstm_c0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_c0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
        tensor<string, []> c_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("c_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
        tensor<fp16, [1, 1, 640]> c_in_to_fp16 = cast(dtype = c_in_to_fp16_dtype_0, x = c_in)[name = tensor<string, []>("cast_6")];
        tensor<fp16, [1, 640]> input_lstm_c0_squeeze_cast_fp16 = squeeze(axes = input_lstm_c0_squeeze_axes_0, x = c_in_to_fp16)[name = tensor<string, []>("input_lstm_c0_squeeze_cast_fp16")];
        tensor<string, []> input_direction_0 = const()[name = tensor<string, []>("input_direction_0"), val = tensor<string, []>("forward")];
        tensor<bool, []> input_output_sequence_0 = const()[name = tensor<string, []>("input_output_sequence_0"), val = tensor<bool, []>(true)];
        tensor<string, []> input_recurrent_activation_0 = const()[name = tensor<string, []>("input_recurrent_activation_0"), val = tensor<string, []>("sigmoid")];
        tensor<string, []> input_cell_activation_0 = const()[name = tensor<string, []>("input_cell_activation_0"), val = tensor<string, []>("tanh")];
        tensor<string, []> input_activation_0 = const()[name = tensor<string, []>("input_activation_0"), val = tensor<string, []>("tanh")];
        tensor<fp16, [2560, 640]> concat_1_to_fp16 = const()[name = tensor<string, []>("concat_1_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1314688)))];
        tensor<fp16, [2560, 640]> concat_2_to_fp16 = const()[name = tensor<string, []>("concat_2_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4591552)))];
        tensor<fp16, [2560]> concat_0_to_fp16 = const()[name = tensor<string, []>("concat_0_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7868416)))];
        tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = y_cast_fp16_cast_uint16)[name = tensor<string, []>("transpose_2")];
        tensor<fp16, [1, 1, 640]> input_cast_fp16_0, tensor<fp16, [1, 640]> input_cast_fp16_1, tensor<fp16, [1, 640]> input_cast_fp16_2 = lstm(activation = input_activation_0, bias = concat_0_to_fp16, cell_activation = input_cell_activation_0, direction = input_direction_0, initial_c = input_lstm_c0_squeeze_cast_fp16, initial_h = input_lstm_h0_squeeze_cast_fp16, output_sequence = input_output_sequence_0, recurrent_activation = input_recurrent_activation_0, weight_hh = concat_2_to_fp16, weight_ih = concat_1_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
        tensor<int32, [1]> obj_3_axes_0 = const()[name = tensor<string, []>("obj_3_axes_0"), val = tensor<int32, [1]>([0])];
        tensor<fp16, [1, 1, 640]> obj_3_cast_fp16 = expand_dims(axes = obj_3_axes_0, x = input_cast_fp16_1)[name = tensor<string, []>("obj_3_cast_fp16")];
        tensor<string, []> obj_3_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_3_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
        tensor<int32, [1]> obj_axes_0 = const()[name = tensor<string, []>("obj_axes_0"), val = tensor<int32, [1]>([0])];
        tensor<fp16, [1, 1, 640]> obj_cast_fp16 = expand_dims(axes = obj_axes_0, x = input_cast_fp16_2)[name = tensor<string, []>("obj_cast_fp16")];
        tensor<string, []> obj_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
        tensor<int32, [3]> transpose_0_perm_0 = const()[name = tensor<string, []>("transpose_0_perm_0"), val = tensor<int32, [3]>([1, 2, 0])];
        tensor<string, []> transpose_0_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("transpose_0_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
        tensor<fp16, [1, 640, 1]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = input_cast_fp16_0)[name = tensor<string, []>("transpose_1")];
        tensor<fp32, [1, 640, 1]> decoder = cast(dtype = transpose_0_cast_fp16_to_fp32_dtype_0, x = transpose_0_cast_fp16)[name = tensor<string, []>("cast_3")];
        tensor<fp32, [1, 1, 640]> c_out = cast(dtype = obj_cast_fp16_to_fp32_dtype_0, x = obj_cast_fp16)[name = tensor<string, []>("cast_4")];
        tensor<fp32, [1, 1, 640]> h_out = cast(dtype = obj_3_cast_fp16_to_fp32_dtype_0, x = obj_3_cast_fp16)[name = tensor<string, []>("cast_5")];
        tensor<int32, [1]> target_length_tmp = identity(x = target_length)[name = tensor<string, []>("target_length_tmp")];
    } -> (decoder, h_out, c_out);
}
1280ms/decoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0b4cacecdcd9df79ab1e56de67230baf5a8664d2afe0bb8f3408eefa972cb2f4
size 7873600
1280ms/individual_components.py ADDED
@@ -0,0 +1,250 @@
1
+ #!/usr/bin/env python3
2
+ """Export Parakeet Realtime EOU 120M RNNT components into CoreML."""
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Optional, Tuple
8
+
9
+ import coremltools as ct
10
+ import torch
11
+
12
+
13
+ @dataclass
14
+ class ExportSettings:
15
+ output_dir: Path
16
+ compute_units: ct.ComputeUnit
17
+ deployment_target: Optional[ct.target]
18
+ compute_precision: Optional[ct.precision]
19
+ max_audio_seconds: float
20
+ max_symbol_steps: int
21
+
22
+
23
+ class PreprocessorWrapper(torch.nn.Module):
24
+ """Wrapper for the audio preprocessor (mel spectrogram extraction)."""
25
+
26
+ def __init__(self, module: torch.nn.Module) -> None:
27
+ super().__init__()
28
+ self.module = module
29
+
30
+ def forward(
31
+ self, audio_signal: torch.Tensor, length: torch.Tensor
32
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
33
+ mel, mel_length = self.module(
34
+ input_signal=audio_signal, length=length.to(dtype=torch.long)
35
+ )
36
+ return mel, mel_length
37
+
38
+
39
+ class EncoderWrapper(torch.nn.Module):
40
+ """Wrapper for the cache-aware FastConformer encoder.
41
+
42
+ Note: For the realtime EOU model, the encoder is cache-aware which means
43
+ it can operate in a streaming fashion. For CoreML export, we export
44
+ without cache state for simplicity (full-context mode).
45
+ """
46
+
47
+ def __init__(self, module: torch.nn.Module) -> None:
48
+ super().__init__()
49
+ self.module = module
50
+
51
+ def forward(
52
+ self, features: torch.Tensor, length: torch.Tensor
53
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
54
+ encoded, encoded_lengths = self.module(
55
+ audio_signal=features, length=length.to(dtype=torch.long)
56
+ )
57
+ # Synthesize per-frame timestamps (seconds) using the 80 ms encoder stride.
58
+ # Shape: [B, T_enc]
59
+ frame_times = (
60
+ torch.arange(encoded.shape[-1], device=encoded.device, dtype=torch.float32)
61
+ * 0.08
62
+ )
63
+ return encoded, encoded_lengths, frame_times
64
+
65
+
66
+ class DecoderWrapper(torch.nn.Module):
67
+ """Wrapper for the RNNT prediction network (decoder)."""
68
+
69
+ def __init__(self, module: torch.nn.Module) -> None:
70
+ super().__init__()
71
+ self.module = module
72
+
73
+ def forward(
74
+ self,
75
+ targets: torch.Tensor,
76
+ target_lengths: torch.Tensor,
77
+ h_in: torch.Tensor,
78
+ c_in: torch.Tensor,
79
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
80
+ state = [h_in, c_in]
81
+ decoder_output, _, new_state = self.module(
82
+ targets=targets.to(dtype=torch.long),
83
+ target_length=target_lengths.to(dtype=torch.long),
84
+ states=state,
85
+ )
86
+ return decoder_output, new_state[0], new_state[1]
87
+
88
+
89
+ class JointWrapper(torch.nn.Module):
90
+ """Wrapper for the RNNT joint network.
91
+
92
+ Note: Unlike Parakeet TDT v3, the realtime EOU model does NOT have
93
+ duration outputs (num_extra_outputs). The joint network outputs only
94
+ token logits over the vocabulary + blank.
95
+ """
96
+
97
+ def __init__(self, module: torch.nn.Module) -> None:
98
+ super().__init__()
99
+ self.module = module
100
+
101
+ def forward(
102
+ self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor
103
+ ) -> torch.Tensor:
104
+ # Input: encoder_outputs [B, D, T], decoder_outputs [B, D, U]
105
+ # Transpose to match what projection layers expect
106
+ encoder_outputs = encoder_outputs.transpose(1, 2) # [B, T, D]
107
+ decoder_outputs = decoder_outputs.transpose(1, 2) # [B, U, D]
108
+
109
+ # Apply projections
110
+ enc_proj = self.module.enc(encoder_outputs) # [B, T, joint_hidden]
111
+ dec_proj = self.module.pred(decoder_outputs) # [B, U, joint_hidden]
112
+
113
+ # Explicit broadcasting along T and U to avoid converter ambiguity
114
+ x = enc_proj.unsqueeze(2) + dec_proj.unsqueeze(1) # [B, T, U, joint_hidden]
115
+ x = self.module.joint_net[0](x) # ReLU
116
+ x = self.module.joint_net[1](x) # Dropout (no-op in eval)
117
+ out = self.module.joint_net[2](x) # Linear -> logits [B, T, U, vocab+blank]
118
+ return out
119
+
120
+
121
+ class MelEncoderWrapper(torch.nn.Module):
122
+ """Fused wrapper: waveform -> mel -> encoder.
123
+
124
+ Inputs:
125
+ - audio_signal: [B, S]
126
+ - audio_length: [B]
127
+
128
+ Outputs:
129
+ - encoder: [B, D, T_enc]
130
+ - encoder_length: [B]
131
+ - frame_times: [T_enc]
132
+ """
133
+
134
+ def __init__(
135
+ self, preprocessor: PreprocessorWrapper, encoder: EncoderWrapper
136
+ ) -> None:
137
+ super().__init__()
138
+ self.preprocessor = preprocessor
139
+ self.encoder = encoder
140
+
141
+ def forward(
142
+ self, audio_signal: torch.Tensor, audio_length: torch.Tensor
143
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
144
+ mel, mel_length = self.preprocessor(audio_signal, audio_length)
145
+ encoded, enc_len, frame_times = self.encoder(mel, mel_length.to(dtype=torch.int32))
146
+ return encoded, enc_len, frame_times
147
+
148
+
+ class JointDecisionWrapper(torch.nn.Module):
+     """Joint + decision head: outputs label id and label prob.
+
+     Unlike Parakeet TDT v3, this model does NOT have duration outputs.
+
+     Inputs:
+         - encoder_outputs: [B, D, T]
+         - decoder_outputs: [B, D, U]
+
+     Returns:
+         - token_id: [B, T, U] int32
+         - token_prob: [B, T, U] float32
+     """
+
+     def __init__(self, joint: JointWrapper, vocab_size: int) -> None:
+         super().__init__()
+         self.joint = joint
+         self.vocab_with_blank = int(vocab_size) + 1
+
+     def forward(self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor):
+         logits = self.joint(encoder_outputs, decoder_outputs)
+
+         # Token selection
+         token_ids = torch.argmax(logits, dim=-1).to(dtype=torch.int32)
+         token_probs_all = torch.softmax(logits, dim=-1)
+         # gather expects int64 (long) indices; cast only for the gather
+         token_prob = torch.gather(
+             token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
+         ).squeeze(-1)
+
+         return token_ids, token_prob
+
+
+ class JointDecisionSingleStep(torch.nn.Module):
+     """Single-step variant for streaming: encoder_step -> token decision.
+
+     Inputs:
+         - encoder_step: [B=1, D, T=1]
+         - decoder_step: [B=1, D, U=1]
+
+     Returns:
+         - token_id: [1, 1, 1] int32
+         - token_prob: [1, 1, 1] float32
+         - top_k_ids: [1, 1, 1, K] int32
+         - top_k_logits: [1, 1, 1, K] float32
+     """
+
+     def __init__(self, joint: JointWrapper, vocab_size: int, top_k: int = 64) -> None:
+         super().__init__()
+         self.joint = joint
+         self.vocab_with_blank = int(vocab_size) + 1
+         self.top_k = int(top_k)
+
+     def forward(self, encoder_step: torch.Tensor, decoder_step: torch.Tensor):
+         # Reuse JointWrapper, which expects [B, D, T] and [B, D, U]
+         logits = self.joint(encoder_step, decoder_step)  # [1, 1, 1, V+blank]
+
+         token_ids = torch.argmax(logits, dim=-1, keepdim=False).to(dtype=torch.int32)
+         token_probs_all = torch.softmax(logits, dim=-1)
+         token_prob = torch.gather(
+             token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
+         ).squeeze(-1)
+
+         # Also expose top-K candidates for host-side processing
+         topk_logits, topk_ids_long = torch.topk(
+             logits, k=min(self.top_k, logits.shape[-1]), dim=-1
+         )
+         topk_ids = topk_ids_long.to(dtype=torch.int32)
+         return token_ids, token_prob, topk_ids, topk_logits
+
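On the host side, a greedy RNNT loop calls this single-step head once per encoder frame and only advances the decoder state on non-blank emissions. A minimal sketch under assumptions: `decoder` and `joint_step` are the DecoderWrapper / JointDecisionSingleStep modules above, `blank_id` is the last logit index by the usual RNNT convention, and `h`/`c` carry the prediction-network LSTM state:

    import torch

    def greedy_decode(encoder_out, decoder, joint_step, blank_id, h, c, start_token):
        emitted = []
        dec_out, h_next, c_next = decoder(
            torch.tensor([[start_token]]), torch.tensor([1]), h, c
        )
        for t in range(encoder_out.shape[-1]):           # encoder_out: [1, D, T]
            enc_step = encoder_out[:, :, t : t + 1]      # [1, D, 1]
            for _ in range(10):                          # cap symbols per frame
                token_id, _, _, _ = joint_step(enc_step, dec_out[:, :, :1])
                tok = int(token_id)
                if tok == blank_id:
                    break                                # blank: keep state, next frame
                emitted.append(tok)
                h, c = h_next, c_next                    # commit state on a real token
                dec_out, h_next, c_next = decoder(
                    torch.tensor([[tok]]), torch.tensor([1]), h, c
                )
        return emitted, h, c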
+
+ def _coreml_convert(
+     traced: torch.jit.ScriptModule,
+     inputs,
+     outputs,
+     settings: ExportSettings,
+     compute_units_override: Optional[ct.ComputeUnit] = None,
+     compute_precision: Optional[ct.precision] = None,
+ ) -> ct.models.MLModel:
+     cu = (
+         compute_units_override
+         if compute_units_override is not None
+         else settings.compute_units
+     )
+     kwargs = {
+         "convert_to": "mlprogram",
+         "inputs": inputs,
+         "outputs": outputs,
+         "compute_units": cu,
+     }
+     if settings.deployment_target is not None:
+         kwargs["minimum_deployment_target"] = settings.deployment_target
+
+     # Precision priority: explicit argument > settings
+     if compute_precision is not None:
+         kwargs["compute_precision"] = compute_precision
+     elif settings.compute_precision is not None:
+         kwargs["compute_precision"] = settings.compute_precision
+
+     # Log only after kwargs is fully assembled so the printout matches the call
+     print("Converting:", traced.__class__.__name__)
+     print("Conversion kwargs:", kwargs)
+     return ct.convert(traced, **kwargs)
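An illustrative call (hypothetical: the ExportSettings construction is assumed from the fields the function reads, while the tensor names and shapes match the joint_decision metadata emitted below):

    import coremltools as ct

    settings = ExportSettings(
        compute_units=ct.ComputeUnit.CPU_AND_NE,
        deployment_target=ct.target.iOS17,
        compute_precision=ct.precision.FLOAT16,
    )
    mlmodel = _coreml_convert(
        traced_joint_step,  # torch.jit.trace of JointDecisionSingleStep
        inputs=[
            ct.TensorType(name="encoder_step", shape=(1, 512, 1)),
            ct.TensorType(name="decoder_step", shape=(1, 640, 1)),
        ],
        outputs=[
            ct.TensorType(name="token_id"),
            ct.TensorType(name="token_prob"),
            ct.TensorType(name="top_k_ids"),
            ct.TensorType(name="top_k_logits"),
        ],
        settings=settings,
    )
    mlmodel.save("joint_decision.mlpackage")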
1280ms/joint_decision.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5bca32ad130dcad6605cc00044c752aa5b45ef57d14c17f2d1a2fa49d6cf55b5
+ size 243
1280ms/joint_decision.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22d4abc4625b935ee035b5f8ce7cb28d1041b9b01c12173e287bf4b5f5d99625
+ size 493
1280ms/joint_decision.mlmodelc/metadata.json ADDED
@@ -0,0 +1,112 @@
+ [
+   {
+     "metadataOutputVersion" : "3.0",
+     "shortDescription" : "Parakeet EOU single-step joint decision",
+     "outputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1 × 1 × 1)",
+         "shortDescription" : "",
+         "shape" : "[1, 1, 1]",
+         "name" : "token_id",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 1 × 1)",
+         "shortDescription" : "",
+         "shape" : "[1, 1, 1]",
+         "name" : "token_prob",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1 × 1 × 1 × 64)",
+         "shortDescription" : "",
+         "shape" : "[1, 1, 1, 64]",
+         "name" : "top_k_ids",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 1 × 1 × 64)",
+         "shortDescription" : "",
+         "shape" : "[1, 1, 1, 64]",
+         "name" : "top_k_logits",
+         "type" : "MultiArray"
+       }
+     ],
+     "storagePrecision" : "Float16",
+     "modelParameters" : [
+
+     ],
+     "author" : "Fluid Inference",
+     "specificationVersion" : 8,
+     "mlProgramOperationTypeHistogram" : {
+       "Ios17.reduceArgmax" : 1,
+       "Ios17.squeeze" : 1,
+       "Ios17.cast" : 6,
+       "Ios17.linear" : 3,
+       "Ios17.transpose" : 2,
+       "Ios17.add" : 1,
+       "Ios16.relu" : 1,
+       "Ios16.softmax" : 1,
+       "Ios17.gatherAlongAxis" : 1,
+       "Ios17.topk" : 1,
+       "Ios17.expandDims" : 3
+     },
+     "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
+     "isUpdatable" : "0",
+     "stateSchema" : [
+
+     ],
+     "availability" : {
+       "macOS" : "14.0",
+       "tvOS" : "17.0",
+       "visionOS" : "1.0",
+       "watchOS" : "10.0",
+       "iOS" : "17.0",
+       "macCatalyst" : "17.0"
+     },
+     "modelType" : {
+       "name" : "MLModelType_mlProgram"
+     },
+     "inputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 512 × 1)",
+         "shortDescription" : "",
+         "shape" : "[1, 512, 1]",
+         "name" : "encoder_step",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
+         "shortDescription" : "",
+         "shape" : "[1, 640, 1]",
+         "name" : "decoder_step",
+         "type" : "MultiArray"
+       }
+     ],
+     "userDefinedMetadata" : {
+       "com.github.apple.coremltools.source_dialect" : "TorchScript",
+       "com.github.apple.coremltools.version" : "8.3.0",
+       "com.github.apple.coremltools.source" : "torch==2.4.0"
+     },
+     "generatedClassName" : "parakeet_eou_joint_decision_single_step",
+     "method" : "predict"
+   }
+ ]
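The schema above maps directly onto a coremltools predict call (a sketch; CompiledMLModel loads .mlmodelc bundles in recent coremltools, while the .mlpackage variants load via ct.models.MLModel):

    import coremltools as ct
    import numpy as np

    model = ct.models.CompiledMLModel("1280ms/joint_decision.mlmodelc")
    out = model.predict({
        "encoder_step": np.zeros((1, 512, 1), dtype=np.float32),
        "decoder_step": np.zeros((1, 640, 1), dtype=np.float32),
    })
    print(out["token_id"].shape, out["top_k_logits"].shape)  # (1, 1, 1) and (1, 1, 1, 64)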
1280ms/joint_decision.mlmodelc/model.mil ADDED
@@ -0,0 +1,57 @@
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, 640, 1]> decoder_step, tensor<fp32, [1, 512, 1]> encoder_step) {
5
+ tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
6
+ tensor<string, []> encoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
7
+ tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
8
+ tensor<string, []> decoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("decoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
9
+ tensor<fp16, [640, 512]> joint_module_enc_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_weight_to_fp16"), val = tensor<fp16, [640, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
10
+ tensor<fp16, [640]> joint_module_enc_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(655488)))];
11
+ tensor<fp16, [1, 512, 1]> encoder_step_to_fp16 = cast(dtype = encoder_step_to_fp16_dtype_0, x = encoder_step)[name = tensor<string, []>("cast_8")];
12
+ tensor<fp16, [1, 1, 512]> input_1_cast_fp16 = transpose(perm = input_1_perm_0, x = encoder_step_to_fp16)[name = tensor<string, []>("transpose_1")];
13
+ tensor<fp16, [1, 1, 640]> linear_0_cast_fp16 = linear(bias = joint_module_enc_bias_to_fp16, weight = joint_module_enc_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("linear_0_cast_fp16")];
14
+ tensor<fp16, [640, 640]> joint_module_pred_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_weight_to_fp16"), val = tensor<fp16, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(656832)))];
15
+ tensor<fp16, [640]> joint_module_pred_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1476096)))];
16
+ tensor<fp16, [1, 640, 1]> decoder_step_to_fp16 = cast(dtype = decoder_step_to_fp16_dtype_0, x = decoder_step)[name = tensor<string, []>("cast_7")];
17
+ tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = decoder_step_to_fp16)[name = tensor<string, []>("transpose_0")];
18
+ tensor<fp16, [1, 1, 640]> linear_1_cast_fp16 = linear(bias = joint_module_pred_bias_to_fp16, weight = joint_module_pred_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("linear_1_cast_fp16")];
19
+ tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
20
+ tensor<fp16, [1, 1, 1, 640]> var_23_cast_fp16 = expand_dims(axes = var_23_axes_0, x = linear_0_cast_fp16)[name = tensor<string, []>("op_23_cast_fp16")];
21
+ tensor<int32, [1]> var_24_axes_0 = const()[name = tensor<string, []>("op_24_axes_0"), val = tensor<int32, [1]>([1])];
22
+ tensor<fp16, [1, 1, 1, 640]> var_24_cast_fp16 = expand_dims(axes = var_24_axes_0, x = linear_1_cast_fp16)[name = tensor<string, []>("op_24_cast_fp16")];
23
+ tensor<fp16, [1, 1, 1, 640]> input_5_cast_fp16 = add(x = var_23_cast_fp16, y = var_24_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
24
+ tensor<fp16, [1, 1, 1, 640]> input_7_cast_fp16 = relu(x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
25
+ tensor<fp16, [1027, 640]> joint_module_joint_net_2_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_weight_to_fp16"), val = tensor<fp16, [1027, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1477440)))];
26
+ tensor<fp16, [1027]> joint_module_joint_net_2_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_bias_to_fp16"), val = tensor<fp16, [1027]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2792064)))];
27
+ tensor<fp16, [1, 1, 1, 1027]> linear_2_cast_fp16 = linear(bias = joint_module_joint_net_2_bias_to_fp16, weight = joint_module_joint_net_2_weight_to_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("linear_2_cast_fp16")];
28
+ tensor<int32, []> var_38_axis_0 = const()[name = tensor<string, []>("op_38_axis_0"), val = tensor<int32, []>(-1)];
29
+ tensor<bool, []> var_38_keep_dims_0 = const()[name = tensor<string, []>("op_38_keep_dims_0"), val = tensor<bool, []>(false)];
30
+ tensor<string, []> var_38_output_dtype_0 = const()[name = tensor<string, []>("op_38_output_dtype_0"), val = tensor<string, []>("int32")];
31
+ tensor<int32, [1, 1, 1]> token_id = reduce_argmax(axis = var_38_axis_0, keep_dims = var_38_keep_dims_0, output_dtype = var_38_output_dtype_0, x = linear_2_cast_fp16)[name = tensor<string, []>("op_38_cast_fp16")];
32
+ tensor<int32, []> var_44 = const()[name = tensor<string, []>("op_44"), val = tensor<int32, []>(-1)];
33
+ tensor<fp16, [1, 1, 1, 1027]> token_probs_all_cast_fp16 = softmax(axis = var_44, x = linear_2_cast_fp16)[name = tensor<string, []>("token_probs_all_cast_fp16")];
34
+ tensor<int32, [1]> var_53_axes_0 = const()[name = tensor<string, []>("op_53_axes_0"), val = tensor<int32, [1]>([-1])];
35
+ tensor<int32, [1, 1, 1, 1]> var_53 = expand_dims(axes = var_53_axes_0, x = token_id)[name = tensor<string, []>("op_53")];
36
+ tensor<int32, []> var_54 = const()[name = tensor<string, []>("op_54"), val = tensor<int32, []>(-1)];
37
+ tensor<bool, []> var_56_validate_indices_0 = const()[name = tensor<string, []>("op_56_validate_indices_0"), val = tensor<bool, []>(false)];
38
+ tensor<string, []> var_53_to_int16_dtype_0 = const()[name = tensor<string, []>("op_53_to_int16_dtype_0"), val = tensor<string, []>("int16")];
39
+ tensor<int16, [1, 1, 1, 1]> var_53_to_int16 = cast(dtype = var_53_to_int16_dtype_0, x = var_53)[name = tensor<string, []>("cast_6")];
40
+ tensor<fp16, [1, 1, 1, 1]> var_56_cast_fp16_cast_int16 = gather_along_axis(axis = var_54, indices = var_53_to_int16, validate_indices = var_56_validate_indices_0, x = token_probs_all_cast_fp16)[name = tensor<string, []>("op_56_cast_fp16_cast_int16")];
41
+ tensor<int32, [1]> var_58_axes_0 = const()[name = tensor<string, []>("op_58_axes_0"), val = tensor<int32, [1]>([-1])];
42
+ tensor<fp16, [1, 1, 1]> var_58_cast_fp16 = squeeze(axes = var_58_axes_0, x = var_56_cast_fp16_cast_int16)[name = tensor<string, []>("op_58_cast_fp16")];
43
+ tensor<string, []> var_58_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_58_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
44
+ tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(64)];
45
+ tensor<int32, []> var_63_axis_0 = const()[name = tensor<string, []>("op_63_axis_0"), val = tensor<int32, []>(-1)];
46
+ tensor<bool, []> var_63_ascending_0 = const()[name = tensor<string, []>("op_63_ascending_0"), val = tensor<bool, []>(false)];
47
+ tensor<bool, []> var_63_sort_0 = const()[name = tensor<string, []>("op_63_sort_0"), val = tensor<bool, []>(true)];
48
+ tensor<bool, []> var_63_return_indices_0 = const()[name = tensor<string, []>("op_63_return_indices_0"), val = tensor<bool, []>(true)];
49
+ tensor<string, []> var_63_cast_fp16_cast_int16_output_indices_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_cast_int16_output_indices_dtype_0"), val = tensor<string, []>("uint16")];
50
+ tensor<fp16, [1, 1, 1, 64]> var_63_cast_fp16_cast_int16_0, tensor<uint16, [1, 1, 1, 64]> var_63_cast_fp16_cast_int16_1 = topk(ascending = var_63_ascending_0, axis = var_63_axis_0, k = var_59, output_indices_dtype = var_63_cast_fp16_cast_int16_output_indices_dtype_0, return_indices = var_63_return_indices_0, sort = var_63_sort_0, x = linear_2_cast_fp16)[name = tensor<string, []>("op_63_cast_fp16_cast_int16")];
51
+ tensor<string, []> var_63_cast_fp16_cast_int16_1_to_int32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_cast_int16_1_to_int32_dtype_0"), val = tensor<string, []>("int32")];
52
+ tensor<string, []> var_63_cast_fp16_0_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_0_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
53
+ tensor<fp32, [1, 1, 1, 64]> top_k_logits = cast(dtype = var_63_cast_fp16_0_to_fp32_dtype_0, x = var_63_cast_fp16_cast_int16_0)[name = tensor<string, []>("cast_3")];
54
+ tensor<int32, [1, 1, 1, 64]> top_k_ids = cast(dtype = var_63_cast_fp16_cast_int16_1_to_int32_dtype_0, x = var_63_cast_fp16_cast_int16_1)[name = tensor<string, []>("cast_4")];
55
+ tensor<fp32, [1, 1, 1]> token_prob = cast(dtype = var_58_cast_fp16_to_fp32_dtype_0, x = var_58_cast_fp16)[name = tensor<string, []>("cast_5")];
56
+ } -> (token_id, token_prob, top_k_ids, top_k_logits);
57
+ }
1280ms/joint_decision.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7039b2010a269153f5a96edf28637f921a86ef8822f248f2d6712f7a6bce84b4
+ size 2794182
1280ms/parakeet_eou_preprocessor.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4ada8b0b99ac1d2ba7acbffacfbbf1a06cb69d30e9410d237ee0aa4c2b0ad63
+ size 243
1280ms/parakeet_eou_preprocessor.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dc7252fa47622fe39577361233627062019a3bb740fdbb5366a7bae09df0ec5e
+ size 422
1280ms/parakeet_eou_preprocessor.mlmodelc/metadata.json ADDED
@@ -0,0 +1,105 @@
+ [
+   {
+     "metadataOutputVersion" : "3.0",
+     "shortDescription" : "Parakeet EOU preprocessor",
+     "outputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32)",
+         "shortDescription" : "",
+         "shape" : "[]",
+         "name" : "mel",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1)",
+         "shortDescription" : "",
+         "shape" : "[1]",
+         "name" : "mel_length",
+         "type" : "MultiArray"
+       }
+     ],
+     "storagePrecision" : "Float32",
+     "modelParameters" : [
+
+     ],
+     "author" : "Fluid Inference",
+     "specificationVersion" : 8,
+     "mlProgramOperationTypeHistogram" : {
+       "Range1d" : 1,
+       "Ios17.reshape" : 2,
+       "Identity" : 1,
+       "Ios17.matmul" : 1,
+       "Ios17.expandDims" : 5,
+       "Select" : 1,
+       "Ios17.add" : 3,
+       "Ios17.sliceByIndex" : 3,
+       "Ios16.reduceSum" : 1,
+       "Shape" : 1,
+       "Ios17.gather" : 1,
+       "Pad" : 1,
+       "Ios17.log" : 1,
+       "Ios17.conv" : 2,
+       "Ios17.sub" : 2,
+       "Ios17.pow" : 1,
+       "Ios17.cast" : 2,
+       "Stack" : 1,
+       "Ios17.concat" : 1,
+       "Ios17.floorDiv" : 1,
+       "Ios17.greaterEqual" : 1,
+       "Ios17.mul" : 1
+     },
+     "computePrecision" : "Mixed (Float32, Int32)",
+     "isUpdatable" : "0",
+     "stateSchema" : [
+
+     ],
+     "availability" : {
+       "macOS" : "14.0",
+       "tvOS" : "17.0",
+       "visionOS" : "1.0",
+       "watchOS" : "10.0",
+       "iOS" : "17.0",
+       "macCatalyst" : "17.0"
+     },
+     "modelType" : {
+       "name" : "MLModelType_mlProgram"
+     },
+     "inputSchema" : [
+       {
+         "dataType" : "Float32",
+         "hasShapeFlexibility" : "1",
+         "isOptional" : "0",
+         "shapeFlexibility" : "1 × 1...32000",
+         "shapeRange" : "[[1, 1], [1, 32000]]",
+         "formattedType" : "MultiArray (Float32 1 × 1)",
+         "type" : "MultiArray",
+         "shape" : "[1, 1]",
+         "name" : "audio_signal",
+         "shortDescription" : ""
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1)",
+         "shortDescription" : "",
+         "shape" : "[1]",
+         "name" : "audio_length",
+         "type" : "MultiArray"
+       }
+     ],
+     "userDefinedMetadata" : {
+       "com.github.apple.coremltools.source_dialect" : "TorchScript",
+       "com.github.apple.coremltools.source" : "torch==2.4.0",
+       "com.github.apple.coremltools.version" : "8.3.0"
+     },
+     "generatedClassName" : "parakeet_eou_preprocessor",
+     "method" : "predict"
+   }
+ ]
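The flexible audio_signal shape accepts up to 2 s of 16 kHz audio; with the 160-sample hop visible in the MIL below, 20480 samples should yield 20480 / 160 + 1 = 129 mel frames, matching the streaming encoder's [1, 128, 129] input. A sketch:

    import coremltools as ct
    import numpy as np

    pre = ct.models.CompiledMLModel("1280ms/parakeet_eou_preprocessor.mlmodelc")
    samples = 20480                                  # 1.28 s at 16 kHz, within 1...32000
    out = pre.predict({
        "audio_signal": np.random.randn(1, samples).astype(np.float32),
        "audio_length": np.array([samples], dtype=np.int32),
    })
    print(out["mel"].shape, out["mel_length"])       # expect (1, 128, 129) and [129]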
1280ms/parakeet_eou_preprocessor.mlmodelc/model.mil ADDED
@@ -0,0 +1,96 @@
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<int32, [1]> audio_length, tensor<fp32, [1, ?]> audio_signal) [FlexibleShapeInformation = tuple<tuple<tensor<string, []>, dict<tensor<string, []>, tensor<int32, [?]>>>, tuple<tensor<string, []>, dict<tensor<string, []>, list<tensor<int32, [2]>, ?>>>>((("DefaultShapes", {{"audio_signal", [1, 1]}}), ("RangeDims", {{"audio_signal", [[1, 1], [1, 32000]]}})))] {
5
+ tensor<int32, []> var_9 = const()[name = tensor<string, []>("op_9"), val = tensor<int32, []>(1)];
6
+ tensor<int32, []> var_10 = const()[name = tensor<string, []>("op_10"), val = tensor<int32, []>(160)];
7
+ tensor<int32, []> var_32 = const()[name = tensor<string, []>("op_32"), val = tensor<int32, []>(512)];
8
+ tensor<int32, [1]> var_33 = add(x = audio_length, y = var_32)[name = tensor<string, []>("op_33")];
9
+ tensor<int32, []> var_34 = const()[name = tensor<string, []>("op_34"), val = tensor<int32, []>(512)];
10
+ tensor<int32, [1]> var_35 = sub(x = var_33, y = var_34)[name = tensor<string, []>("op_35")];
11
+ tensor<int32, [1]> floor_div_0 = floor_div(x = var_35, y = var_10)[name = tensor<string, []>("floor_div_0")];
12
+ tensor<string, []> var_36_dtype_0 = const()[name = tensor<string, []>("op_36_dtype_0"), val = tensor<string, []>("fp32")];
13
+ tensor<fp32, []> var_37_promoted = const()[name = tensor<string, []>("op_37_promoted"), val = tensor<fp32, []>(0x1p+0)];
14
+ tensor<fp32, [1]> var_36 = cast(dtype = var_36_dtype_0, x = floor_div_0)[name = tensor<string, []>("cast_11")];
15
+ tensor<fp32, [1]> seq_len_1 = add(x = var_36, y = var_37_promoted)[name = tensor<string, []>("seq_len_1")];
16
+ tensor<string, []> cast_2_dtype_0 = const()[name = tensor<string, []>("cast_2_dtype_0"), val = tensor<string, []>("int32")];
17
+ tensor<int32, [2]> var_41_begin_0 = const()[name = tensor<string, []>("op_41_begin_0"), val = tensor<int32, [2]>([0, 0])];
18
+ tensor<int32, [2]> var_41_end_0 = const()[name = tensor<string, []>("op_41_end_0"), val = tensor<int32, [2]>([1, 1])];
19
+ tensor<bool, [2]> var_41_end_mask_0 = const()[name = tensor<string, []>("op_41_end_mask_0"), val = tensor<bool, [2]>([true, false])];
20
+ tensor<bool, [2]> var_41_squeeze_mask_0 = const()[name = tensor<string, []>("op_41_squeeze_mask_0"), val = tensor<bool, [2]>([false, true])];
21
+ tensor<fp32, [1]> var_41 = slice_by_index(begin = var_41_begin_0, end = var_41_end_0, end_mask = var_41_end_mask_0, squeeze_mask = var_41_squeeze_mask_0, x = audio_signal)[name = tensor<string, []>("op_41")];
22
+ tensor<int32, [1]> var_42_axes_0 = const()[name = tensor<string, []>("op_42_axes_0"), val = tensor<int32, [1]>([1])];
23
+ tensor<fp32, [1, 1]> var_42 = expand_dims(axes = var_42_axes_0, x = var_41)[name = tensor<string, []>("op_42")];
24
+ tensor<int32, [2]> var_44_begin_0 = const()[name = tensor<string, []>("op_44_begin_0"), val = tensor<int32, [2]>([0, 1])];
25
+ tensor<int32, [2]> var_44_end_0 = const()[name = tensor<string, []>("op_44_end_0"), val = tensor<int32, [2]>([1, 0])];
26
+ tensor<bool, [2]> var_44_end_mask_0 = const()[name = tensor<string, []>("op_44_end_mask_0"), val = tensor<bool, [2]>([true, true])];
27
+ tensor<fp32, [1, ?]> var_44 = slice_by_index(begin = var_44_begin_0, end = var_44_end_0, end_mask = var_44_end_mask_0, x = audio_signal)[name = tensor<string, []>("op_44")];
28
+ tensor<int32, [2]> var_46_begin_0 = const()[name = tensor<string, []>("op_46_begin_0"), val = tensor<int32, [2]>([0, 0])];
29
+ tensor<int32, [2]> var_46_end_0 = const()[name = tensor<string, []>("op_46_end_0"), val = tensor<int32, [2]>([1, -1])];
30
+ tensor<bool, [2]> var_46_end_mask_0 = const()[name = tensor<string, []>("op_46_end_mask_0"), val = tensor<bool, [2]>([true, false])];
31
+ tensor<fp32, [1, ?]> var_46 = slice_by_index(begin = var_46_begin_0, end = var_46_end_0, end_mask = var_46_end_mask_0, x = audio_signal)[name = tensor<string, []>("op_46")];
32
+ tensor<fp32, []> var_47 = const()[name = tensor<string, []>("op_47"), val = tensor<fp32, []>(0x1.f0a3d8p-1)];
33
+ tensor<fp32, [1, ?]> var_48 = mul(x = var_46, y = var_47)[name = tensor<string, []>("op_48")];
34
+ tensor<fp32, [1, ?]> var_49 = sub(x = var_44, y = var_48)[name = tensor<string, []>("op_49")];
35
+ tensor<bool, []> input_1_interleave_0 = const()[name = tensor<string, []>("input_1_interleave_0"), val = tensor<bool, []>(false)];
36
+ tensor<fp32, [1, ?]> input_1 = concat(axis = var_9, interleave = input_1_interleave_0, values = (var_42, var_49))[name = tensor<string, []>("input_1")];
37
+ tensor<int32, [3]> concat_0x = const()[name = tensor<string, []>("concat_0x"), val = tensor<int32, [3]>([1, 1, -1])];
38
+ tensor<fp32, [1, 1, ?]> input_3 = reshape(shape = concat_0x, x = input_1)[name = tensor<string, []>("input_3")];
39
+ tensor<fp32, []> const_1 = const()[name = tensor<string, []>("const_1"), val = tensor<fp32, []>(0x0p+0)];
40
+ tensor<int32, [6]> input_5_pad_0 = const()[name = tensor<string, []>("input_5_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 256, 256])];
41
+ tensor<string, []> input_5_mode_0 = const()[name = tensor<string, []>("input_5_mode_0"), val = tensor<string, []>("reflect")];
42
+ tensor<fp32, [1, 1, ?]> input_5 = pad(constant_val = const_1, mode = input_5_mode_0, pad = input_5_pad_0, x = input_3)[name = tensor<string, []>("input_5")];
43
+ tensor<int32, [2]> concat_1x = const()[name = tensor<string, []>("concat_1x"), val = tensor<int32, [2]>([1, -1])];
44
+ tensor<fp32, [1, ?]> input = reshape(shape = concat_1x, x = input_5)[name = tensor<string, []>("input")];
45
+ tensor<fp32, [257, 1, 512]> expand_dims_1 = const()[name = tensor<string, []>("expand_dims_1"), val = tensor<fp32, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
46
+ tensor<fp32, [257, 1, 512]> expand_dims_2 = const()[name = tensor<string, []>("expand_dims_2"), val = tensor<fp32, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(526464)))];
47
+ tensor<int32, [1]> expand_dims_3 = const()[name = tensor<string, []>("expand_dims_3"), val = tensor<int32, [1]>([160])];
48
+ tensor<int32, [1]> expand_dims_4_axes_0 = const()[name = tensor<string, []>("expand_dims_4_axes_0"), val = tensor<int32, [1]>([1])];
49
+ tensor<fp32, [1, 1, ?]> expand_dims_4 = expand_dims(axes = expand_dims_4_axes_0, x = input)[name = tensor<string, []>("expand_dims_4")];
50
+ tensor<string, []> conv_0_pad_type_0 = const()[name = tensor<string, []>("conv_0_pad_type_0"), val = tensor<string, []>("valid")];
51
+ tensor<int32, [2]> conv_0_pad_0 = const()[name = tensor<string, []>("conv_0_pad_0"), val = tensor<int32, [2]>([0, 0])];
52
+ tensor<int32, [1]> conv_0_dilations_0 = const()[name = tensor<string, []>("conv_0_dilations_0"), val = tensor<int32, [1]>([1])];
53
+ tensor<int32, []> conv_0_groups_0 = const()[name = tensor<string, []>("conv_0_groups_0"), val = tensor<int32, []>(1)];
54
+ tensor<fp32, [1, 257, ?]> conv_0 = conv(dilations = conv_0_dilations_0, groups = conv_0_groups_0, pad = conv_0_pad_0, pad_type = conv_0_pad_type_0, strides = expand_dims_3, weight = expand_dims_1, x = expand_dims_4)[name = tensor<string, []>("conv_0")];
55
+ tensor<string, []> conv_1_pad_type_0 = const()[name = tensor<string, []>("conv_1_pad_type_0"), val = tensor<string, []>("valid")];
56
+ tensor<int32, [2]> conv_1_pad_0 = const()[name = tensor<string, []>("conv_1_pad_0"), val = tensor<int32, [2]>([0, 0])];
57
+ tensor<int32, [1]> conv_1_dilations_0 = const()[name = tensor<string, []>("conv_1_dilations_0"), val = tensor<int32, [1]>([1])];
58
+ tensor<int32, []> conv_1_groups_0 = const()[name = tensor<string, []>("conv_1_groups_0"), val = tensor<int32, []>(1)];
59
+ tensor<fp32, [1, 257, ?]> conv_1 = conv(dilations = conv_1_dilations_0, groups = conv_1_groups_0, pad = conv_1_pad_0, pad_type = conv_1_pad_type_0, strides = expand_dims_3, weight = expand_dims_2, x = expand_dims_4)[name = tensor<string, []>("conv_1")];
60
+ tensor<int32, []> stack_0_axis_0 = const()[name = tensor<string, []>("stack_0_axis_0"), val = tensor<int32, []>(-1)];
61
+ tensor<fp32, [1, 257, ?, 2]> stack_0 = stack(axis = stack_0_axis_0, values = (conv_0, conv_1))[name = tensor<string, []>("stack_0")];
62
+ tensor<fp32, []> var_17_promoted = const()[name = tensor<string, []>("op_17_promoted"), val = tensor<fp32, []>(0x1p+1)];
63
+ tensor<fp32, [1, 257, ?, 2]> var_65 = pow(x = stack_0, y = var_17_promoted)[name = tensor<string, []>("op_65")];
64
+ tensor<int32, [1]> var_67_axes_0 = const()[name = tensor<string, []>("op_67_axes_0"), val = tensor<int32, [1]>([-1])];
65
+ tensor<bool, []> var_67_keep_dims_0 = const()[name = tensor<string, []>("op_67_keep_dims_0"), val = tensor<bool, []>(false)];
66
+ tensor<fp32, [1, 257, ?]> var_67 = reduce_sum(axes = var_67_axes_0, keep_dims = var_67_keep_dims_0, x = var_65)[name = tensor<string, []>("op_67")];
67
+ tensor<fp32, [1, 257, ?]> x_9 = identity(x = var_67)[name = tensor<string, []>("x_9")];
68
+ tensor<fp32, [1, 128, 257]> const_2 = const()[name = tensor<string, []>("const_2"), val = tensor<fp32, [1, 128, 257]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1052864)))];
69
+ tensor<bool, []> x_11_transpose_x_0 = const()[name = tensor<string, []>("x_11_transpose_x_0"), val = tensor<bool, []>(false)];
70
+ tensor<bool, []> x_11_transpose_y_0 = const()[name = tensor<string, []>("x_11_transpose_y_0"), val = tensor<bool, []>(false)];
71
+ tensor<fp32, [1, 128, ?]> x_11 = matmul(transpose_x = x_11_transpose_x_0, transpose_y = x_11_transpose_y_0, x = const_2, y = x_9)[name = tensor<string, []>("x_11")];
72
+ tensor<fp32, []> var_74 = const()[name = tensor<string, []>("op_74"), val = tensor<fp32, []>(0x1p-24)];
73
+ tensor<fp32, [1, 128, ?]> var_75 = add(x = x_11, y = var_74)[name = tensor<string, []>("op_75")];
74
+ tensor<fp32, []> x_epsilon_0 = const()[name = tensor<string, []>("x_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
75
+ tensor<fp32, [1, 128, ?]> x = log(epsilon = x_epsilon_0, x = var_75)[name = tensor<string, []>("x")];
76
+ tensor<int32, [3]> var_77_shape = shape(x = x)[name = tensor<string, []>("op_77_shape")];
77
+ tensor<int32, []> select_4 = const()[name = tensor<string, []>("select_4"), val = tensor<int32, []>(2)];
78
+ tensor<int32, []> gather_4_axis_0 = const()[name = tensor<string, []>("gather_4_axis_0"), val = tensor<int32, []>(0)];
79
+ tensor<int32, []> gather_4_batch_dims_0 = const()[name = tensor<string, []>("gather_4_batch_dims_0"), val = tensor<int32, []>(0)];
80
+ tensor<bool, []> gather_4_validate_indices_0 = const()[name = tensor<string, []>("gather_4_validate_indices_0"), val = tensor<bool, []>(false)];
81
+ tensor<int32, []> gather_4 = gather(axis = gather_4_axis_0, batch_dims = gather_4_batch_dims_0, indices = select_4, validate_indices = gather_4_validate_indices_0, x = var_77_shape)[name = tensor<string, []>("gather_4")];
82
+ tensor<int32, []> const_3 = const()[name = tensor<string, []>("const_3"), val = tensor<int32, []>(0)];
83
+ tensor<int32, []> const_4 = const()[name = tensor<string, []>("const_4"), val = tensor<int32, []>(1)];
84
+ tensor<int32, [?]> mask_1 = range_1d(end = gather_4, start = const_3, step = const_4)[name = tensor<string, []>("mask_1")];
85
+ tensor<int32, [1]> expand_dims_0_axes_0 = const()[name = tensor<string, []>("expand_dims_0_axes_0"), val = tensor<int32, [1]>([0])];
86
+ tensor<int32, [1, ?]> expand_dims_0 = expand_dims(axes = expand_dims_0_axes_0, x = mask_1)[name = tensor<string, []>("expand_dims_0")];
87
+ tensor<int32, [1]> var_82_axes_0 = const()[name = tensor<string, []>("op_82_axes_0"), val = tensor<int32, [1]>([1])];
88
+ tensor<int32, [1]> mel_length = cast(dtype = cast_2_dtype_0, x = seq_len_1)[name = tensor<string, []>("cast_10")];
89
+ tensor<int32, [1, 1]> var_82 = expand_dims(axes = var_82_axes_0, x = mel_length)[name = tensor<string, []>("op_82")];
90
+ tensor<bool, [1, ?]> mask = greater_equal(x = expand_dims_0, y = var_82)[name = tensor<string, []>("mask")];
91
+ tensor<int32, [1]> var_84_axes_0 = const()[name = tensor<string, []>("op_84_axes_0"), val = tensor<int32, [1]>([1])];
92
+ tensor<bool, [1, 1, ?]> var_84 = expand_dims(axes = var_84_axes_0, x = mask)[name = tensor<string, []>("op_84")];
93
+ tensor<fp32, []> cast_7 = const()[name = tensor<string, []>("cast_7"), val = tensor<fp32, []>(0x0p+0)];
94
+ tensor<fp32, [1, 128, ?]> mel = select(a = cast_7, b = x, cond = var_84)[name = tensor<string, []>("processed_signal")];
95
+ } -> (mel, mel_length);
96
+ }
1280ms/parakeet_eou_preprocessor.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:009bba4fde82dc55db9b55d77cf3ba5f791ce366c49f079285fe25a3b6e2291d
+ size 1184512
1280ms/streaming_encoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0a3c84022a9d2dc769d38cf8f45e93423e20734d092e3c16db11fbf6dca4004
+ size 243
1280ms/streaming_encoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:41ce3f96c3d6b3333796fc4ed82cb0c9b4ea99396b88f8eec3ba24394ba2bb78
+ size 671
1280ms/streaming_encoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,187 @@
+ [
+   {
+     "metadataOutputVersion" : "3.0",
+     "storagePrecision" : "Float16",
+     "outputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 512 × 17)",
+         "shortDescription" : "",
+         "shape" : "[1, 512, 17]",
+         "name" : "encoded_output",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1)",
+         "shortDescription" : "",
+         "shape" : "[1]",
+         "name" : "encoded_length",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 128 × 16)",
+         "shortDescription" : "",
+         "shape" : "[1, 128, 16]",
+         "name" : "new_pre_cache",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 17 × 1 × 70 × 512)",
+         "shortDescription" : "",
+         "shape" : "[17, 1, 70, 512]",
+         "name" : "new_cache_last_channel",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 17 × 1 × 512 × 8)",
+         "shortDescription" : "",
+         "shape" : "[17, 1, 512, 8]",
+         "name" : "new_cache_last_time",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1)",
+         "shortDescription" : "",
+         "shape" : "[1]",
+         "name" : "new_cache_last_channel_len",
+         "type" : "MultiArray"
+       }
+     ],
+     "modelParameters" : [
+
+     ],
+     "specificationVersion" : 8,
+     "mlProgramOperationTypeHistogram" : {
+       "Ios17.floor" : 3,
+       "Ios17.logicalAnd" : 3,
+       "Ios17.reshape" : 103,
+       "Ios16.softmax" : 17,
+       "Ios17.matmul" : 51,
+       "Ios17.transpose" : 157,
+       "Split" : 17,
+       "Ios17.expandDims" : 6,
+       "Select" : 51,
+       "Ios17.add" : 126,
+       "Tile" : 1,
+       "Ios17.sliceByIndex" : 106,
+       "Ios16.sigmoid" : 17,
+       "Pad" : 20,
+       "Ios17.logicalNot" : 2,
+       "Ios17.layerNorm" : 102,
+       "Ios17.less" : 1,
+       "Ios17.sub" : 1,
+       "Ios17.conv" : 56,
+       "Ios17.clip" : 2,
+       "Ios16.relu" : 3,
+       "Ios17.linear" : 137,
+       "Ios17.concat" : 52,
+       "Ios17.greaterEqual" : 1,
+       "Ios17.cast" : 14,
+       "Ios16.silu" : 51,
+       "Stack" : 2,
+       "Ios17.mul" : 72
+     },
+     "computePrecision" : "Mixed (Float16, Float32, Int32)",
+     "isUpdatable" : "0",
+     "stateSchema" : [
+
+     ],
+     "availability" : {
+       "macOS" : "14.0",
+       "tvOS" : "17.0",
+       "visionOS" : "1.0",
+       "watchOS" : "10.0",
+       "iOS" : "17.0",
+       "macCatalyst" : "17.0"
+     },
+     "modelType" : {
+       "name" : "MLModelType_mlProgram"
+     },
+     "userDefinedMetadata" : {
+       "com.github.apple.coremltools.source_dialect" : "TorchScript",
+       "com.github.apple.coremltools.source" : "torch==2.4.0",
+       "com.github.apple.coremltools.version" : "8.3.0"
+     },
+     "inputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 128 × 129)",
+         "shortDescription" : "",
+         "shape" : "[1, 128, 129]",
+         "name" : "audio_signal",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1)",
+         "shortDescription" : "",
+         "shape" : "[1]",
+         "name" : "audio_length",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 128 × 16)",
+         "shortDescription" : "",
+         "shape" : "[1, 128, 16]",
+         "name" : "pre_cache",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 17 × 1 × 70 × 512)",
+         "shortDescription" : "",
+         "shape" : "[17, 1, 70, 512]",
+         "name" : "cache_last_channel",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 17 × 1 × 512 × 8)",
+         "shortDescription" : "",
+         "shape" : "[17, 1, 512, 8]",
+         "name" : "cache_last_time",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1)",
+         "shortDescription" : "",
+         "shape" : "[1]",
+         "name" : "cache_last_channel_len",
+         "type" : "MultiArray"
+       }
+     ],
+     "generatedClassName" : "streaming_encoder",
+     "method" : "predict"
+   }
+ ]
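The cache tensors chain across calls: each chunk's new_* outputs become the next chunk's inputs. A minimal sketch of that loop (zero-initialized caches as the cold start are an assumption here, and `mel_chunks` is a hypothetical iterable of (1, 128, 129) float32 arrays from the preprocessor):

    import coremltools as ct
    import numpy as np

    enc = ct.models.CompiledMLModel("1280ms/streaming_encoder.mlmodelc")
    state = {
        "pre_cache": np.zeros((1, 128, 16), dtype=np.float32),
        "cache_last_channel": np.zeros((17, 1, 70, 512), dtype=np.float32),
        "cache_last_time": np.zeros((17, 1, 512, 8), dtype=np.float32),
        "cache_last_channel_len": np.zeros((1,), dtype=np.int32),
    }
    for mel_chunk in mel_chunks:
        out = enc.predict({
            "audio_signal": mel_chunk,
            "audio_length": np.array([mel_chunk.shape[-1]], dtype=np.int32),
            **state,
        })
        encoded = out["encoded_output"]              # (1, 512, 17) frames per chunk
        state = {
            "pre_cache": out["new_pre_cache"],
            "cache_last_channel": out["new_cache_last_channel"],
            "cache_last_time": out["new_cache_last_time"],
            "cache_last_channel_len": out["new_cache_last_channel_len"],
        }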
1280ms/streaming_encoder.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
1280ms/streaming_encoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c71acb590ceb2af449de5c7e3516e76057eaf4589d1f16edba774831db74b17
+ size 213179200
1280ms/streaming_encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9181d223091aa592cb656d49346e640a38ec2426de5ec2d06edbc14e92b8968b
+ size 508252
1280ms/streaming_encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c71acb590ceb2af449de5c7e3516e76057eaf4589d1f16edba774831db74b17
+ size 213179200
1280ms/streaming_encoder.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "fileFormatVersion": "1.0.0",
+   "itemInfoEntries": {
+     "468B5E19-6BA9-478C-8D2A-23953ACBD5E3": {
+       "author": "com.apple.CoreML",
+       "description": "CoreML Model Specification",
+       "name": "model.mlmodel",
+       "path": "com.apple.CoreML/model.mlmodel"
+     },
+     "F22DE286-FE1A-4BBD-A7ED-B0130595DAF3": {
+       "author": "com.apple.CoreML",
+       "description": "CoreML Model Weights",
+       "name": "weights",
+       "path": "com.apple.CoreML/weights"
+     }
+   },
+   "rootModelIdentifier": "468B5E19-6BA9-478C-8D2A-23953ACBD5E3"
+ }
1280ms/vocab.json ADDED
@@ -0,0 +1,1028 @@
1
+ {
2
+ "0": "<unk>",
3
+ "1": "▁t",
4
+ "2": "▁th",
5
+ "3": "▁a",
6
+ "4": "▁i",
7
+ "5": "▁the",
8
+ "6": "▁s",
9
+ "7": "re",
10
+ "8": "▁w",
11
+ "9": "▁o",
12
+ "10": "in",
13
+ "11": "at",
14
+ "12": "er",
15
+ "13": "nd",
16
+ "14": "ou",
17
+ "15": "▁c",
18
+ "16": "▁b",
19
+ "17": "▁h",
20
+ "18": "en",
21
+ "19": "on",
22
+ "20": "▁m",
23
+ "21": "▁f",
24
+ "22": "ing",
25
+ "23": "▁p",
26
+ "24": "▁to",
27
+ "25": "▁and",
28
+ "26": "▁d",
29
+ "27": "an",
30
+ "28": "or",
31
+ "29": "es",
32
+ "30": "▁y",
33
+ "31": "▁l",
34
+ "32": "▁of",
35
+ "33": "ll",
36
+ "34": "▁in",
37
+ "35": "ed",
38
+ "36": "it",
39
+ "37": "▁g",
40
+ "38": "is",
41
+ "39": "▁you",
42
+ "40": "▁n",
43
+ "41": "ar",
44
+ "42": "om",
45
+ "43": "as",
46
+ "44": "ve",
47
+ "45": "▁e",
48
+ "46": "ic",
49
+ "47": "▁it",
50
+ "48": "al",
51
+ "49": "us",
52
+ "50": "▁wh",
53
+ "51": "▁we",
54
+ "52": "▁be",
55
+ "53": "ion",
56
+ "54": "ow",
57
+ "55": "le",
58
+ "56": "▁is",
59
+ "57": "et",
60
+ "58": "ent",
61
+ "59": "ot",
62
+ "60": "ut",
63
+ "61": "▁re",
64
+ "62": "▁on",
65
+ "63": "ay",
66
+ "64": "▁ha",
67
+ "65": "ig",
68
+ "66": "▁so",
69
+ "67": "ct",
70
+ "68": "▁he",
71
+ "69": "▁for",
72
+ "70": "ver",
73
+ "71": "ke",
74
+ "72": "ro",
75
+ "73": "▁st",
76
+ "74": "id",
77
+ "75": "▁go",
78
+ "76": "all",
79
+ "77": "se",
80
+ "78": "ly",
81
+ "79": "▁u",
82
+ "80": "ch",
83
+ "81": "st",
84
+ "82": "ld",
85
+ "83": "▁k",
86
+ "84": "ce",
87
+ "85": "ur",
88
+ "86": "▁li",
89
+ "87": "am",
90
+ "88": "▁r",
91
+ "89": "ht",
92
+ "90": "▁j",
93
+ "91": "ith",
94
+ "92": "▁se",
95
+ "93": "ir",
96
+ "94": "▁as",
97
+ "95": "▁an",
98
+ "96": "im",
99
+ "97": "▁do",
100
+ "98": "ad",
101
+ "99": "▁was",
102
+ "100": "ight",
103
+ "101": "th",
104
+ "102": "▁are",
105
+ "103": "▁but",
106
+ "104": "▁sh",
107
+ "105": "ust",
108
+ "106": "ally",
109
+ "107": "▁not",
110
+ "108": "▁or",
111
+ "109": "▁com",
112
+ "110": "▁can",
113
+ "111": "▁me",
114
+ "112": "op",
115
+ "113": "▁mo",
116
+ "114": "▁at",
117
+ "115": "ill",
118
+ "116": "▁ch",
119
+ "117": "▁ne",
120
+ "118": "ant",
121
+ "119": "▁de",
122
+ "120": "▁kn",
123
+ "121": "▁one",
124
+ "122": "il",
125
+ "123": "ol",
126
+ "124": "▁con",
127
+ "125": "ter",
128
+ "126": "▁ab",
129
+ "127": "▁fr",
130
+ "128": "ere",
131
+ "129": "ck",
132
+ "130": "▁al",
133
+ "131": "▁all",
134
+ "132": "qu",
135
+ "133": "▁pro",
136
+ "134": "▁som",
137
+ "135": "ould",
138
+ "136": "▁tw",
139
+ "137": "ul",
140
+ "138": "ra",
141
+ "139": "od",
142
+ "140": "ers",
143
+ "141": "▁su",
144
+ "142": "ive",
145
+ "143": "▁v",
146
+ "144": "use",
147
+ "145": "ate",
148
+ "146": "ge",
149
+ "147": "if",
150
+ "148": "▁ex",
151
+ "149": "ess",
152
+ "150": "pp",
153
+ "151": "▁lo",
154
+ "152": "out",
155
+ "153": "▁if",
156
+ "154": "est",
157
+ "155": "ain",
158
+ "156": "ist",
159
+ "157": "and",
160
+ "158": "ea",
161
+ "159": "very",
162
+ "160": "art",
163
+ "161": "▁wor",
164
+ "162": "▁my",
165
+ "163": "ab",
166
+ "164": "ment",
167
+ "165": "▁bec",
168
+ "166": "un",
169
+ "167": "ity",
170
+ "168": "ri",
171
+ "169": "pe",
172
+ "170": "ions",
173
+ "171": "▁by",
174
+ "172": "ok",
175
+ "173": "our",
176
+ "174": "ort",
177
+ "175": "ind",
178
+ "176": "ink",
179
+ "177": "nt",
180
+ "178": "▁up",
181
+ "179": "um",
182
+ "180": "▁don",
183
+ "181": "▁get",
184
+ "182": "red",
185
+ "183": "▁out",
186
+ "184": "el",
187
+ "185": "ause",
188
+ "186": "res",
189
+ "187": "▁ma",
190
+ "188": "ich",
191
+ "189": "▁us",
192
+ "190": "rou",
193
+ "191": "▁int",
194
+ "192": "em",
195
+ "193": "os",
196
+ "194": "ies",
197
+ "195": "ie",
198
+ "196": "▁pl",
199
+ "197": "▁tr",
200
+ "198": "ven",
201
+ "199": "ous",
202
+ "200": "▁le",
203
+ "201": "▁two",
204
+ "202": "ard",
205
+ "203": "ine",
206
+ "204": "▁co",
207
+ "205": "een",
208
+ "206": "▁now",
209
+ "207": "ty",
210
+ "208": "her",
211
+ "209": "ack",
212
+ "210": "▁pe",
213
+ "211": "ame",
214
+ "212": "▁how",
215
+ "213": "▁who",
216
+ "214": "▁see",
217
+ "215": "▁tim",
218
+ "216": "ect",
219
+ "217": "ast",
220
+ "218": "▁our",
221
+ "219": "ci",
222
+ "220": "ree",
223
+ "221": "ople",
224
+ "222": "gh",
225
+ "223": "▁no",
226
+ "224": "▁had",
227
+ "225": "▁man",
228
+ "226": "▁qu",
229
+ "227": "▁en",
230
+ "228": "ide",
231
+ "229": "ure",
232
+ "230": "ud",
233
+ "231": "so",
234
+ "232": "▁his",
235
+ "233": "▁sa",
236
+ "234": "▁sp",
237
+ "235": "▁say",
238
+ "236": "ose",
239
+ "237": "ther",
240
+ "238": "▁act",
241
+ "239": "▁ta",
242
+ "240": "▁cl",
243
+ "241": "ings",
244
+ "242": "pt",
245
+ "243": "king",
246
+ "244": "▁any",
247
+ "245": "▁has",
248
+ "246": "▁un",
249
+ "247": "iv",
250
+ "248": "▁im",
251
+ "249": "▁ag",
252
+ "250": "▁te",
253
+ "251": "▁fe",
254
+ "252": "one",
255
+ "253": "per",
256
+ "254": "ong",
257
+ "255": "▁po",
258
+ "256": "▁ad",
259
+ "257": "ff",
260
+ "258": "ore",
261
+ "259": "itt",
262
+ "260": "ans",
263
+ "261": "iz",
264
+ "262": "eah",
265
+ "263": "reat",
266
+ "264": "act",
267
+ "265": "own",
268
+ "266": "hing",
269
+ "267": "enty",
270
+ "268": "age",
271
+ "269": "ber",
272
+ "270": "ice",
273
+ "271": "▁am",
274
+ "272": "ple",
275
+ "273": "are",
276
+ "274": "▁per",
277
+ "275": "und",
278
+ "276": "ite",
279
+ "277": "ix",
280
+ "278": "pl",
281
+ "279": "▁way",
282
+ "280": "▁did",
283
+ "281": "▁pr",
284
+ "282": "▁got",
285
+ "283": "ars",
286
+ "284": "▁she",
287
+ "285": "▁let",
288
+ "286": "ag",
289
+ "287": "▁ac",
290
+ "288": "int",
291
+ "289": "▁ar",
292
+ "290": "ry",
293
+ "291": "ign",
294
+ "292": "ish",
295
+ "293": "▁fir",
296
+ "294": "ace",
297
+ "295": "ble",
298
+ "296": "og",
299
+ "297": "ue",
300
+ "298": "▁ye",
301
+ "299": "ap",
302
+ "300": "iff",
303
+ "301": "▁ro",
304
+ "302": "▁her",
305
+ "303": "nder",
306
+ "304": "▁ok",
307
+ "305": "▁res",
308
+ "306": "▁gu",
309
+ "307": "ence",
310
+ "308": "▁may",
311
+ "309": "ated",
312
+ "310": "ip",
313
+ "311": "▁bo",
314
+ "312": "▁him",
315
+ "313": "way",
316
+ "314": "ac",
317
+ "315": "ical",
318
+ "316": "ass",
319
+ "317": "ase",
320
+ "318": "▁dis",
321
+ "319": "able",
322
+ "320": "ick",
323
+ "321": "▁app",
324
+ "322": "ance",
325
+ "323": "▁pre",
326
+ "324": "▁six",
327
+ "325": "▁off",
328
+ "326": "▁new",
329
+ "327": "ia",
330
+ "328": "orm",
331
+ "329": "ank",
332
+ "330": "▁lot",
333
+ "331": "ach",
334
+ "332": "▁fo",
335
+ "333": "inet",
336
+ "334": "ire",
337
+ "335": "ary",
338
+ "336": "ult",
339
+ "337": "▁tal",
340
+ "338": "▁mu",
341
+ "339": "▁bl",
342
+ "340": "ount",
343
+ "341": "sel",
344
+ "342": "vel",
345
+ "343": "▁br",
346
+ "344": "▁imp",
347
+ "345": "ep",
348
+ "346": "cess",
349
+ "347": "ord",
350
+ "348": "▁sc",
351
+ "349": "▁inc",
352
+ "350": "ound",
353
+ "351": "ang",
354
+ "352": "be",
355
+ "353": "ress",
356
+ "354": "uct",
357
+ "355": "▁ind",
358
+ "356": "▁af",
359
+ "357": "ving",
360
+ "358": "▁oh",
361
+ "359": "▁bet",
362
+ "360": "▁use",
363
+ "361": "ome",
364
+ "362": "ens",
365
+ "363": "ys",
366
+ "364": "▁bu",
367
+ "365": "co",
368
+ "366": "ory",
369
+ "367": "ater",
370
+ "368": "ild",
371
+ "369": "ght",
372
+ "370": "ial",
373
+ "371": "▁day",
374
+ "372": "ning",
375
+ "373": "na",
376
+ "374": "ile",
377
+ "375": "▁spe",
378
+ "376": "▁mar",
379
+ "377": "ody",
380
+ "378": "ough",
381
+ "379": "ade",
382
+ "380": "vers",
383
+ "381": "xt",
384
+ "382": "▁fl",
385
+ "383": "▁ke",
386
+ "384": "ian",
387
+ "385": "▁sy",
388
+ "386": "▁put",
389
+ "387": "fore",
390
+ "388": "ub",
391
+ "389": "▁ph",
392
+ "390": "fe",
393
+ "391": "▁em",
394
+ "392": "▁ser",
395
+ "393": "form",
396
+ "394": "ting",
397
+ "395": "te",
398
+ "396": "av",
399
+ "397": "ious",
400
+ "398": "▁rec",
401
+ "399": "ks",
402
+ "400": "▁gr",
403
+ "401": "ces",
404
+ "402": "wn",
405
+ "403": "ors",
406
+ "404": "▁jo",
407
+ "405": "ents",
408
+ "406": "▁des",
409
+ "407": "▁try",
410
+ "408": "▁equ",
411
+ "409": "▁z",
412
+ "410": "▁rem",
413
+ "411": "▁str",
414
+ "412": "self",
415
+ "413": "▁bit",
416
+ "414": "ph",
417
+ "415": "ved",
418
+ "416": "▁why",
419
+ "417": "▁bas",
420
+ "418": "▁hel",
421
+ "419": "▁rel",
422
+ "420": "ath",
423
+ "421": "ject",
424
+ "422": "ail",
425
+ "423": "▁la",
426
+ "424": "ual",
427
+ "425": "▁god",
428
+ "426": "▁nat",
429
+ "427": "erm",
430
+ "428": "day",
431
+ "429": "▁id",
432
+ "430": "ft",
433
+ "431": "▁wr",
434
+ "432": "▁min",
435
+ "433": "ates",
436
+ "434": "▁gen",
437
+ "435": "tain",
438
+ "436": "▁ob",
439
+ "437": "ull",
440
+ "438": "ict",
441
+ "439": "▁tra",
442
+ "440": "▁end",
443
+ "441": "▁hig",
444
+ "442": "▁fif",
445
+ "443": "oth",
446
+ "444": "tern",
447
+ "445": "▁its",
448
+ "446": "vent",
449
+ "447": "▁sm",
450
+ "448": "ons",
451
+ "449": "▁add",
452
+ "450": "iss",
453
+ "451": "▁bel",
454
+ "452": "ful",
455
+ "453": "get",
456
+ "454": "▁ele",
457
+ "455": "▁rep",
458
+ "456": "ak",
459
+ "457": "▁ho",
460
+ "458": "▁pos",
461
+ "459": "▁num",
462
+ "460": "ange",
463
+ "461": "ves",
464
+ "462": "ific",
465
+ "463": "urn",
466
+ "464": "ise",
467
+ "465": "▁cr",
468
+ "466": "▁um",
469
+ "467": "ward",
470
+ "468": "▁reg",
471
+ "469": "ady",
472
+ "470": "ower",
473
+ "471": "uc",
474
+ "472": "▁dec",
475
+ "473": "lic",
476
+ "474": "▁set",
477
+ "475": "▁gon",
478
+ "476": "▁op",
479
+ "477": "▁ear",
480
+ "478": "▁sub",
481
+ "479": "▁sl",
482
+ "480": "les",
483
+ "481": "stem",
484
+ "482": "cial",
485
+ "483": "olog",
486
+ "484": "atch",
487
+ "485": "ily",
488
+ "486": "body",
489
+ "487": "nds",
490
+ "488": "ular",
491
+ "489": "ren",
492
+ "490": "▁own",
493
+ "491": "▁too",
494
+ "492": "cent",
495
+ "493": "ible",
496
+ "494": "pect",
497
+ "495": "ered",
498
+ "496": "ways",
499
+ "497": "teen",
500
+ "498": "▁uh",
501
+ "499": "▁big",
502
+ "500": "▁mod",
503
+ "501": "▁att",
504
+ "502": "▁car",
505
+ "503": "gr",
506
+ "504": "▁acc",
507
+ "505": "ied",
508
+ "506": "mun",
509
+ "507": "ib",
510
+ "508": "▁mon",
511
+ "509": "▁sch",
512
+ "510": "▁pol",
513
+ "511": "▁dat",
514
+ "512": "▁fin",
515
+ "513": "▁sim",
516
+ "514": "▁inv",
517
+ "515": "▁def",
518
+ "516": "ked",
519
+ "517": "▁ent",
520
+ "518": "▁yes",
521
+ "519": "ows",
522
+ "520": "ics",
523
+ "521": "ited",
524
+ "522": "ute",
525
+ "523": "ism",
526
+ "524": "ps",
527
+ "525": "▁ed",
528
+ "526": "▁el",
529
+ "527": "ably",
530
+ "528": "ppen",
531
+ "529": "als",
532
+ "530": "▁ten",
533
+ "531": "ract",
534
+ "532": "ss",
535
+ "533": "▁ass",
536
+ "534": "▁met",
537
+ "535": "gan",
538
+ "536": "▁eng",
539
+ "537": "▁stu",
540
+ "538": "ween",
541
+ "539": "arch",
542
+ "540": "▁gl",
543
+ "541": "▁cor",
544
+ "542": "▁dr",
545
+ "543": "vern",
546
+ "544": "▁ty",
547
+ "545": "▁run",
548
+ "546": "hip",
549
+ "547": "cus",
550
+ "548": "cond",
551
+ "549": "▁ins",
552
+ "550": "irty",
553
+ "551": "▁pub",
554
+ "552": "lud",
555
+ "553": "llow",
556
+ "554": "▁cou",
557
+ "555": "ew",
558
+ "556": "iew",
559
+ "557": "▁sur",
560
+ "558": "ero",
561
+ "559": "ood",
562
+ "560": "ness",
563
+ "561": "▁fun",
564
+ "562": "▁eff",
565
+ "563": "cept",
566
+ "564": "▁ca",
567
+ "565": "▁exp",
568
+ "566": "duct",
569
+ "567": "▁sw",
570
+ "568": "ize",
571
+ "569": "ope",
572
+ "570": "▁par",
573
+ "571": "kes",
574
+ "572": "cy",
575
+ "573": "▁ev",
576
+ "574": "▁ref",
577
+ "575": "ell",
578
+ "576": "▁bus",
579
+ "577": "ug",
580
+ "578": "rib",
581
+ "579": "▁cur",
582
+ "580": "mo",
583
+ "581": "ock",
584
+ "582": "ures",
585
+ "583": "air",
586
+ "584": "▁war",
587
+ "585": "str",
588
+ "586": "▁med",
589
+ "587": "▁wa",
590
+ "588": "▁val",
591
+ "589": "▁sin",
592
+ "590": "blem",
593
+ "591": "▁fam",
594
+ "592": "li",
595
+ "593": "▁far",
596
+ "594": "▁cle",
597
+ "595": "▁col",
598
+ "596": "mon",
599
+ "597": "▁gra",
600
+ "598": "led",
601
+ "599": "ense",
602
+ "600": "tin",
603
+ "601": "ues",
604
+ "602": "its",
605
+ "603": "▁mem",
606
+ "604": "▁inf",
607
+ "605": "▁eas",
608
+ "606": "ideo",
609
+ "607": "▁top",
610
+ "608": "io",
611
+ "609": "pan",
612
+ "610": "▁hum",
613
+ "611": "▁old",
614
+ "612": "ead",
615
+ "613": "▁ord",
616
+ "614": "ric",
617
+ "615": "ants",
618
+ "616": "oy",
619
+ "617": "esn",
620
+ "618": "uck",
621
+ "619": "ason",
622
+ "620": "ced",
623
+ "621": "ool",
624
+ "622": "rat",
625
+ "623": "ouse",
626
+ "624": "▁lar",
627
+ "625": "▁art",
628
+ "626": "▁wee",
629
+ "627": "▁cer",
630
+ "628": "ized",
631
+ "629": "▁mat",
632
+ "630": "con",
633
+ "631": "erg",
634
+ "632": "land",
635
+ "633": "ines",
636
+ "634": "▁chr",
637
+ "635": "▁aut",
638
+ "636": "▁lea",
639
+ "637": "▁sou",
640
+ "638": "oney",
641
+ "639": "tty",
642
+ "640": "▁ple",
643
+ "641": "ulat",
644
+ "642": "oks",
645
+ "643": "▁few",
646
+ "644": "▁sol",
647
+ "645": "▁che",
648
+ "646": "chn",
649
+ "647": "ird",
650
+ "648": "▁bre",
651
+ "649": "▁dur",
652
+ "650": "▁wom",
653
+ "651": "me",
654
+ "652": "izat",
655
+ "653": "eric",
656
+ "654": "ote",
657
+ "655": "▁uni",
658
+ "656": "eren",
659
+ "657": "arn",
660
+ "658": "ross",
661
+ "659": "ices",
662
+ "660": "ten",
663
+ "661": "eral",
664
+ "662": "ever",
665
+ "663": "ieve",
666
+ "664": "lish",
667
+ "665": "ash",
668
+ "666": "▁opp",
669
+ "667": "alth",
670
+ "668": "ger",
671
+ "669": "▁sk",
672
+ "670": "▁red",
673
+ "671": "peri",
674
+ "672": "▁det",
675
+ "673": "▁ext",
676
+ "674": "ner",
677
+ "675": "ah",
678
+ "676": "▁var",
679
+ "677": "▁loc",
680
+ "678": "gram",
681
+ "679": "ists",
682
+ "680": "ives",
683
+ "681": "▁es",
684
+ "682": "▁nor",
685
+ "683": "tro",
686
+ "684": "ale",
687
+ "685": "▁iss",
688
+ "686": "▁pri",
689
+ "687": "gin",
690
+ "688": "az",
691
+ "689": "oc",
692
+ "690": "▁pop",
693
+ "691": "ern",
694
+ "692": "▁sit",
695
+ "693": "ket",
696
+ "694": "▁pa",
697
+ "695": "▁law",
698
+ "696": "ages",
699
+ "697": "br",
700
+ "698": "▁cam",
701
+ "699": "▁mom",
702
+ "700": "osed",
703
+ "701": "▁bro",
704
+ "702": "ne",
705
+ "703": "bs",
706
+ "704": "▁cre",
707
+ "705": "erat",
708
+ "706": "▁sec",
709
+ "707": "▁cap",
710
+ "708": "▁vis",
711
+ "709": "▁pat",
712
+ "710": "ield",
713
+ "711": "iet",
714
+ "712": "▁tri",
715
+ "713": "up",
716
+ "714": "▁bra",
717
+ "715": "ts",
718
+ "716": "▁mot",
719
+ "717": "▁unt",
720
+ "718": "put",
721
+ "719": "bo",
722
+ "720": "ork",
723
+ "721": "mer",
724
+ "722": "ital",
725
+ "723": "▁air",
726
+ "724": "ined",
727
+ "725": "▁beh",
728
+ "726": "▁adv",
729
+ "727": "▁ret",
730
+ "728": "imes",
731
+ "729": "▁tea",
732
+ "730": "ural",
733
+ "731": "sid",
734
+ "732": "ters",
735
+ "733": "▁pur",
736
+ "734": "▁sci",
737
+ "735": "bers",
738
+ "736": "ient",
739
+ "737": "ier",
740
+ "738": "cc",
741
+ "739": "sw",
742
+ "740": "▁av",
743
+ "741": "reen",
744
+ "742": "ode",
745
+ "743": "ont",
746
+ "744": "▁dra",
747
+ "745": "ann",
748
+ "746": "nect",
749
+ "747": "▁x",
750
+ "748": "▁eu",
751
+ "749": "ton",
752
+ "750": "inat",
753
+ "751": "ene",
754
+ "752": "ared",
755
+ "753": "els",
756
+ "754": "▁mor",
757
+ "755": "▁rat",
758
+ "756": "cri",
759
+ "757": "▁men",
760
+ "758": "▁ah",
761
+ "759": "ames",
762
+ "760": "▁arm",
763
+ "761": "eak",
764
+ "762": "▁pay",
765
+ "763": "▁hal",
766
+ "764": "ins",
767
+ "765": "ilit",
768
+ "766": "stit",
769
+ "767": "▁ra",
770
+ "768": "▁leg",
771
+ "769": "cl",
772
+ "770": "pr",
773
+ "771": "▁wal",
774
+ "772": "▁bad",
775
+ "773": "▁ge",
776
+ "774": "roup",
777
+ "775": "▁mus",
778
+ "776": "man",
779
+ "777": "▁gi",
780
+ "778": "eds",
781
+ "779": "▁aw",
782
+ "780": "po",
783
+ "781": "ark",
784
+ "782": "row",
785
+ "783": "▁dep",
786
+ "784": "ully",
787
+ "785": "ral",
788
+ "786": "lect",
789
+ "787": "pend",
790
+ "788": "▁sev",
791
+ "789": "ime",
792
+ "790": "gest",
793
+ "791": "here",
794
+ "792": "▁yet",
795
+ "793": "ted",
796
+ "794": "▁rev",
797
+ "795": "ds",
798
+ "796": "▁ask",
799
+ "797": "less",
800
+ "798": "▁di",
801
+ "799": "ets",
802
+ "800": "line",
803
+ "801": "▁aff",
804
+ "802": "ired",
805
+ "803": "▁est",
806
+ "804": "ken",
807
+ "805": "vid",
808
+ "806": "most",
809
+ "807": "ivid",
810
+ "808": "unch",
811
+ "809": "par",
812
+ "810": "med",
813
+ "811": "rop",
814
+ "812": "ased",
815
+ "813": "eone",
816
+ "814": "▁ve",
817
+ "815": "▁abs",
818
+ "816": "ergy",
819
+ "817": "ret",
820
+ "818": "▁saw",
821
+ "819": "▁ey",
822
+ "820": "▁cal",
823
+ "821": "uat",
824
+ "822": "▁mid",
825
+ "823": "vat",
826
+ "824": "ream",
827
+ "825": "vice",
828
+ "826": "ians",
829
+ "827": "rent",
830
+ "828": "ctor",
831
+ "829": "err",
832
+ "830": "ush",
833
+ "831": "ases",
834
+ "832": "▁suc",
835
+ "833": "erms",
836
+ "834": "ave",
837
+ "835": "angu",
838
+ "836": "ries",
839
+ "837": "▁wo",
840
+ "838": "arts",
841
+ "839": "▁fil",
842
+ "840": "▁fat",
843
+ "841": "▁cho",
844
+ "842": "orts",
845
+ "843": "▁fre",
846
+ "844": "ee",
847
+ "845": "ught",
848
+ "846": "eng",
849
+ "847": "ump",
850
+ "848": "▁bar",
851
+ "849": "ying",
852
+ "850": "ane",
853
+ "851": "▁tem",
854
+ "852": "anks",
855
+ "853": "ury",
856
+ "854": "iat",
857
+ "855": "mit",
858
+ "856": "trol",
859
+ "857": "▁net",
860
+ "858": "▁maj",
861
+ "859": "▁cra",
862
+ "860": "ling",
863
+ "861": "▁fig",
864
+ "862": "orn",
865
+ "863": "icat",
866
+ "864": "pany",
867
+ "865": "▁occ",
868
+ "866": "ott",
869
+ "867": "ands",
870
+ "868": "▁exc",
871
+ "869": "▁mr",
872
+ "870": "ency",
873
+ "871": "rope",
874
+ "872": "itch",
875
+ "873": "▁lit",
876
+ "874": "abil",
877
+ "875": "not",
878
+ "876": "ma",
879
+ "877": "▁typ",
880
+ "878": "▁opt",
881
+ "879": "ob",
882
+ "880": "ser",
883
+ "881": "ety",
884
+ "882": "ms",
885
+ "883": "peci",
886
+ "884": "aces",
887
+ "885": "aut",
888
+ "886": "▁hon",
889
+ "887": "cuss",
890
+ "888": "▁sal",
891
+ "889": "▁sor",
892
+ "890": "att",
893
+ "891": "▁lab",
894
+ "892": "▁har",
895
+ "893": "urch",
896
+ "894": "nded",
897
+ "895": "uce",
898
+ "896": "ids",
899
+ "897": "▁hy",
900
+ "898": "▁fut",
901
+ "899": "▁ste",
902
+ "900": "ours",
903
+ "901": "ems",
904
+ "902": "utes",
905
+ "903": "ng",
906
+ "904": "ta",
907
+ "905": "▁won",
908
+ "906": "▁fa",
909
+ "907": "▁env",
910
+ "908": "ards",
911
+ "909": "▁job",
912
+ "910": "ium",
913
+ "911": "▁dot",
914
+ "912": "▁obv",
915
+ "913": "ina",
916
+ "914": "side",
917
+ "915": "elve",
918
+ "916": "cu",
919
+ "917": "▁jes",
920
+ "918": "▁pot",
921
+ "919": "▁pie",
922
+ "920": "▁tre",
923
+ "921": "▁hey",
924
+ "922": "▁mag",
925
+ "923": "ron",
926
+ "924": "▁key",
927
+ "925": "swer",
928
+ "926": "▁win",
929
+ "927": "ucat",
930
+ "928": "work",
931
+ "929": "ides",
932
+ "930": "▁low",
933
+ "931": "▁vol",
934
+ "932": "▁oth",
935
+ "933": "atic",
936
+ "934": "lf",
937
+ "935": "ads",
938
+ "936": "inds",
939
+ "937": "com",
940
+ "938": "ths",
941
+ "939": "▁ver",
942
+ "940": "ised",
943
+ "941": "lo",
944
+ "942": "▁squ",
945
+ "943": "▁cut",
946
+ "944": "oked",
947
+ "945": "irit",
948
+ "946": "ateg",
949
+ "947": "ppy",
950
+ "948": "mitt",
951
+ "949": "come",
952
+ "950": "hn",
953
+ "951": "igin",
954
+ "952": "mand",
955
+ "953": "▁dam",
956
+ "954": "ho",
957
+ "955": "▁da",
958
+ "956": "▁fur",
959
+ "957": "iron",
960
+ "958": "ilar",
961
+ "959": "▁fac",
962
+ "960": "▁neg",
963
+ "961": "▁ago",
964
+ "962": "ged",
965
+ "963": "miss",
966
+ "964": "enth",
967
+ "965": "▁dou",
968
+ "966": "▁hit",
969
+ "967": "▁guy",
970
+ "968": "▁bi",
971
+ "969": "ove",
972
+ "970": "fess",
973
+ "971": "ples",
974
+ "972": "owed",
975
+ "973": "ured",
976
+ "974": "▁ris",
977
+ "975": "ints",
978
+ "976": "rew",
979
+ "977": "▁sum",
980
+ "978": "▁hu",
981
+ "979": "ploy",
982
+ "980": "ude",
983
+ "981": "ried",
984
+ "982": "▁cir",
985
+ "983": "▁dev",
986
+ "984": "ear",
987
+ "985": "▁tot",
988
+ "986": "▁ann",
989
+ "987": "duc",
990
+ "988": "ik",
991
+ "989": "pon",
992
+ "990": "sted",
993
+ "991": "▁ide",
994
+ "992": "▁'",
995
+ "993": "ipp",
996
+ "994": "▁eat",
997
+ "995": "▁dom",
998
+ "996": "▁",
999
+ "997": "e",
1000
+ "998": "t",
1001
+ "999": "o",
1002
+ "1000": "a",
1003
+ "1001": "i",
1004
+ "1002": "n",
1005
+ "1003": "s",
1006
+ "1004": "r",
1007
+ "1005": "h",
1008
+ "1006": "l",
1009
+ "1007": "d",
1010
+ "1008": "u",
1011
+ "1009": "c",
1012
+ "1010": "m",
1013
+ "1011": "y",
1014
+ "1012": "g",
1015
+ "1013": "w",
1016
+ "1014": "f",
1017
+ "1015": "p",
1018
+ "1016": "b",
1019
+ "1017": "v",
1020
+ "1018": "k",
1021
+ "1019": "'",
1022
+ "1020": "j",
1023
+ "1021": "x",
1024
+ "1022": "q",
1025
+ "1023": "z",
1026
+ "1024": "<EOU>",
1027
+ "1025": "<EOB>"
1028
+ }
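The table above is the tail of the BPE vocabulary: string token ids map to SentencePiece pieces, `▁` marks a word boundary, single characters close out the inventory, and `<EOU>` / `<EOB>` are the trailing special tokens. A minimal detokenization sketch over this file layout (the path and helper names are illustrative, not part of the export scripts):

import json

def load_vocab(path: str) -> dict:
    # vocab.json maps string ids ("0", "1", ...) to token strings.
    with open(path) as f:
        return {int(k): v for k, v in json.load(f).items()}

def detokenize(token_ids, id_to_token) -> str:
    # Skip the trailing special tokens, then join the SentencePiece
    # pieces; "▁" begins a new word.
    pieces = [id_to_token[i] for i in token_ids
              if id_to_token[i] not in ("<EOU>", "<EOB>")]
    return "".join(pieces).replace("▁", " ").strip()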
160ms/.DS_Store ADDED
Binary file (8.2 kB). View file
 
160ms/convert_parakeet_eou.py ADDED
@@ -0,0 +1,740 @@
1
+ #!/usr/bin/env python3
2
+ """CLI for exporting Parakeet Realtime EOU 120M components to CoreML.
3
+
4
+ This model is a cache-aware streaming FastConformer-RNNT model optimized for
5
+ low-latency speech recognition with end-of-utterance detection.
6
+
7
+ Key differences from Parakeet TDT v3:
8
+ - Smaller model (120M vs 600M params)
9
+ - No duration outputs (standard RNNT, not TDT)
10
+ - Cache-aware streaming encoder (17 layers, attention context [70,1])
11
+ - Special <EOU> token for end-of-utterance detection
12
+ - Optimized for 80-160ms latency
13
+
14
+ Reference: https://huggingface.co/nvidia/parakeet_realtime_eou_120m-v1
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ from dataclasses import asdict
20
+ from pathlib import Path
21
+ from typing import Dict, Optional, Tuple
22
+
23
+ import coremltools as ct
24
+ import numpy as np
25
+ import soundfile as sf
26
+ import torch
27
+ import typer
28
+
29
+ import nemo.collections.asr as nemo_asr
30
+
31
+ from individual_components import (
32
+ DecoderWrapper,
33
+ EncoderWrapper,
34
+ ExportSettings,
35
+ JointWrapper,
36
+ JointDecisionWrapper,
37
+ JointDecisionSingleStep,
38
+ PreprocessorWrapper,
39
+ MelEncoderWrapper,
40
+ _coreml_convert,
41
+ )
42
+
43
+ def apply_stft_patch():
44
+ # Monkey patch coremltools.stft to handle extra arguments from newer torch versions
45
+ try:
46
+ import coremltools.converters.mil.frontend.torch.ops as torch_ops
47
+ _original_stft = torch_ops.stft
48
+
49
+ def patched_stft(context, node):
50
+ if len(node.inputs) > 8:
51
+ node.inputs = node.inputs[:8]
52
+ return _original_stft(context, node)
53
+
54
+ torch_ops.stft = patched_stft
55
+ if "stft" in torch_ops._TORCH_OPS_REGISTRY:
56
+ torch_ops._TORCH_OPS_REGISTRY["stft"] = patched_stft
57
+ print("Monkey patched coremltools.stft for compatibility.")
58
+ except Exception as e:
59
+ print(f"Warning: Could not monkey patch stft: {e}")
60
+
61
+ DEFAULT_MODEL_ID = "nvidia/parakeet_realtime_eou_120m-v1"
62
+ AUTHOR = "Fluid Inference"
63
+
64
+
65
+ def _compute_length(seconds: float, sample_rate: int) -> int:
66
+ return int(round(seconds * sample_rate))
67
+
68
+
69
+ def _prepare_audio(
70
+ validation_audio: Optional[Path],
71
+ sample_rate: int,
72
+ max_samples: int,
73
+ seed: Optional[int],
74
+ ) -> torch.Tensor:
75
+ if validation_audio is None:
76
+ if seed is not None:
77
+ torch.manual_seed(seed)
78
+ audio = torch.randn(1, max_samples, dtype=torch.float32)
79
+ return audio
80
+
81
+ data, sr = sf.read(str(validation_audio), dtype="float32")
82
+ if sr != sample_rate:
83
+ raise typer.BadParameter(
84
+ f"Validation audio sample rate {sr} does not match model rate {sample_rate}"
85
+ )
86
+
87
+ if data.ndim > 1:
88
+ data = data[:, 0]
89
+
90
+ if data.size == 0:
91
+ raise typer.BadParameter("Validation audio is empty")
92
+
93
+ if data.size < max_samples:
94
+ pad_width = max_samples - data.size
95
+ data = np.pad(data, (0, pad_width))
96
+ elif data.size > max_samples:
97
+ data = data[:max_samples]
98
+
99
+ audio = torch.from_numpy(data).unsqueeze(0).to(dtype=torch.float32)
100
+ return audio
101
+
102
+
103
+ def _save_mlpackage(model: ct.models.MLModel, path: Path, description: str) -> None:
104
+ try:
105
+ model.minimum_deployment_target = ct.target.iOS17
106
+ except Exception:
107
+ pass
108
+ model.short_description = description
109
+ model.author = AUTHOR
110
+ path.parent.mkdir(parents=True, exist_ok=True)
111
+ model.save(str(path))
112
+
113
+
114
+ def _tensor_shape(tensor: torch.Tensor) -> Tuple[int, ...]:
115
+ return tuple(int(dim) for dim in tensor.shape)
116
+
117
+
118
+ def _parse_compute_units(name: str) -> ct.ComputeUnit:
119
+ """Parse a human-friendly compute units string into ct.ComputeUnit."""
120
+ normalized = str(name).strip().upper()
121
+ mapping = {
122
+ "ALL": ct.ComputeUnit.ALL,
123
+ "CPU_ONLY": ct.ComputeUnit.CPU_ONLY,
124
+ "CPU_AND_GPU": ct.ComputeUnit.CPU_AND_GPU,
125
+ "CPU_AND_NE": ct.ComputeUnit.CPU_AND_NE,
126
+ "CPU_AND_NEURALENGINE": ct.ComputeUnit.CPU_AND_NE,
127
+ }
128
+ if normalized not in mapping:
129
+ raise typer.BadParameter(
130
+ f"Unknown compute units '{name}'. Choose from: " + ", ".join(mapping.keys())
131
+ )
132
+ return mapping[normalized]
133
+
134
+
135
+ def _parse_compute_precision(name: Optional[str]) -> Optional[ct.precision]:
136
+ """Parse compute precision string into ct.precision or None."""
137
+ if name is None:
138
+ return None
139
+ normalized = str(name).strip().upper()
140
+ if normalized == "":
141
+ return None
142
+ mapping = {
143
+ "FLOAT32": ct.precision.FLOAT32,
144
+ "FLOAT16": ct.precision.FLOAT16,
145
+ }
146
+ if normalized not in mapping:
147
+ raise typer.BadParameter(
148
+ f"Unknown compute precision '{name}'. Choose from: "
149
+ + ", ".join(mapping.keys())
150
+ )
151
+ return mapping[normalized]
152
+
153
+
154
+ app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
155
+
156
+
157
+ @app.command()
158
+ def convert(
159
+ nemo_path: Optional[Path] = typer.Option(
160
+ None,
161
+ "--nemo-path",
162
+ exists=True,
163
+ resolve_path=True,
164
+ help="Path to parakeet_realtime_eou_120m-v1.nemo checkpoint (skip to auto-download)",
165
+ ),
166
+ model_id: str = typer.Option(
167
+ DEFAULT_MODEL_ID,
168
+ "--model-id",
169
+ help="Model identifier to download when --nemo-path is omitted",
170
+ ),
171
+ output_dir: Path = typer.Option(
172
+ Path("parakeet_eou_coreml"),
173
+ help="Directory where mlpackages and metadata will be written",
174
+ ),
175
+ preprocessor_cu: str = typer.Option(
176
+ "CPU_ONLY",
177
+ "--preprocessor-cu",
178
+ help="Compute units for preprocessor (default CPU_ONLY)",
179
+ ),
180
+ mel_encoder_cu: str = typer.Option(
181
+ "CPU_ONLY",
182
+ "--mel-encoder-cu",
183
+ help="Compute units for fused mel+encoder (default CPU_ONLY)",
184
+ ),
185
+ compute_precision: Optional[str] = typer.Option(
186
+ None,
187
+ "--compute-precision",
188
+ help="Export precision: FLOAT32 (default) or FLOAT16 to shrink non-quantized weights.",
189
+ ),
190
+ max_audio_seconds: float = typer.Option(
191
+ 15.0,
192
+ "--max-audio-seconds",
193
+ help="Maximum audio duration in seconds for the fixed window export",
194
+ ),
195
+ validation_audio: Optional[Path] = typer.Option(
196
+ None,
197
+ "--validation-audio",
198
+ exists=True,
199
+ resolve_path=True,
200
+ help="Path to a 16kHz WAV file for tracing (uses random if not provided)",
201
+ ),
202
+ ) -> None:
203
+ """Export all Parakeet Realtime EOU sub-modules to CoreML.
204
+
205
+ This exports the cache-aware streaming FastConformer-RNNT model for
206
+ low-latency speech recognition with end-of-utterance detection.
207
+ """
208
+ export_settings = ExportSettings(
209
+ output_dir=output_dir,
210
+ compute_units=ct.ComputeUnit.CPU_ONLY,
211
+ deployment_target=ct.target.iOS17,
212
+ compute_precision=_parse_compute_precision(compute_precision),
213
+ max_audio_seconds=max_audio_seconds,
214
+ max_symbol_steps=1,
215
+ )
216
+
217
+ typer.echo("Export configuration:")
218
+ typer.echo(asdict(export_settings))
219
+
220
+ output_dir.mkdir(parents=True, exist_ok=True)
221
+ pre_cu = _parse_compute_units(preprocessor_cu)
222
+ melenc_cu = _parse_compute_units(mel_encoder_cu)
223
+
224
+ if nemo_path is not None:
225
+ typer.echo(f"Loading NeMo model from {nemo_path}…")
226
+ # Try loading as generic ASRModel first, then specific class
227
+ try:
228
+ asr_model = nemo_asr.models.ASRModel.restore_from(
229
+ str(nemo_path), map_location="cpu"
230
+ )
231
+ except Exception:
232
+ # Fallback to EncDecRNNTBPEModel
233
+ asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
234
+ str(nemo_path), map_location="cpu"
235
+ )
236
+ checkpoint_meta = {
237
+ "type": "file",
238
+ "path": str(nemo_path),
239
+ }
240
+ else:
241
+ typer.echo(f"Downloading NeMo model via {model_id}…")
242
+ # Use ASRModel.from_pretrained as recommended for this model
243
+ try:
244
+ asr_model = nemo_asr.models.ASRModel.from_pretrained(
245
+ model_id, map_location="cpu"
246
+ )
247
+ except Exception:
248
+ # Fallback to EncDecRNNTBPEModel
249
+ asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
250
+ model_id, map_location="cpu"
251
+ )
252
+ checkpoint_meta = {
253
+ "type": "pretrained",
254
+ "model_id": model_id,
255
+ }
256
+ asr_model.eval()
257
+
258
+ # Print model info
259
+ typer.echo(f"Model class: {type(asr_model).__name__}")
260
+ typer.echo(f"Encoder class: {type(asr_model.encoder).__name__}")
261
+
262
+ sample_rate = int(asr_model.cfg.preprocessor.sample_rate)
263
+ max_samples = _compute_length(export_settings.max_audio_seconds, sample_rate)
264
+
265
+ # Prepare audio for tracing
266
+ if validation_audio is not None:
267
+ typer.echo(f"Using validation audio: {validation_audio}")
268
+ audio_tensor = _prepare_audio(validation_audio, sample_rate, max_samples, seed=None)
269
+ else:
270
+ typer.echo("Using random audio for tracing (seed=42)")
271
+ audio_tensor = _prepare_audio(None, sample_rate, max_samples, seed=42)
272
+
273
+ audio_length = torch.tensor([max_samples], dtype=torch.int32)
274
+
275
+ preprocessor = PreprocessorWrapper(asr_model.preprocessor.eval())
276
+ encoder = EncoderWrapper(asr_model.encoder.eval())
277
+ decoder = DecoderWrapper(asr_model.decoder.eval())
278
+ joint = JointWrapper(asr_model.joint.eval())
279
+
280
+ decoder_export_flag = getattr(asr_model.decoder, "_rnnt_export", False)
281
+ asr_model.decoder._rnnt_export = True
282
+
283
+ try:
284
+ with torch.no_grad():
285
+ mel_ref, mel_length_ref = preprocessor(audio_tensor, audio_length)
286
+ mel_length_ref = mel_length_ref.to(dtype=torch.int32)
287
+ encoder_ref, encoder_length_ref, frame_times_ref = encoder(
288
+ mel_ref, mel_length_ref
289
+ )
290
+ encoder_length_ref = encoder_length_ref.to(dtype=torch.int32)
291
+
292
+ # Clone tensors to drop inference flags
293
+ mel_ref = mel_ref.clone().detach()
294
+ mel_length_ref = mel_length_ref.clone().detach()
295
+ encoder_ref = encoder_ref.clone().detach()
296
+ encoder_length_ref = encoder_length_ref.clone().detach()
297
+ frame_times_ref = frame_times_ref.clone().detach()
298
+
299
+ vocab_size = int(asr_model.tokenizer.vocab_size)
300
+ decoder_hidden = int(asr_model.decoder.pred_hidden)
301
+ decoder_layers = int(asr_model.decoder.pred_rnn_layers)
302
+
303
+ # Check if model has extra outputs (TDT-style duration)
304
+ num_extra = getattr(asr_model.joint, "num_extra_outputs", 0)
305
+ typer.echo(f"Vocab size: {vocab_size}, num_extra_outputs: {num_extra}")
306
+
307
+ targets = torch.full(
308
+ (1, export_settings.max_symbol_steps),
309
+ fill_value=asr_model.decoder.blank_idx,
310
+ dtype=torch.int32,
311
+ )
312
+ target_lengths = torch.tensor(
313
+ [export_settings.max_symbol_steps], dtype=torch.int32
314
+ )
315
+ zero_state = torch.zeros(
316
+ decoder_layers,
317
+ 1,
318
+ decoder_hidden,
319
+ dtype=torch.float32,
320
+ )
321
+
322
+ with torch.no_grad():
323
+ decoder_ref, h_ref, c_ref = decoder(
324
+ targets, target_lengths, zero_state, zero_state
325
+ )
326
+ joint_ref = joint(encoder_ref, decoder_ref)
327
+
328
+ decoder_ref = decoder_ref.clone()
329
+ h_ref = h_ref.clone()
330
+ c_ref = c_ref.clone()
331
+ joint_ref = joint_ref.clone()
332
+
333
+ typer.echo(f"Encoder output shape: {encoder_ref.shape}")
334
+ typer.echo(f"Decoder output shape: {decoder_ref.shape}")
335
+ typer.echo(f"Joint output shape: {joint_ref.shape}")
336
+
337
+ # === Export Preprocessor ===
338
+ typer.echo("Tracing and converting preprocessor…")
339
+ preprocessor = preprocessor.cpu()
340
+ audio_tensor = audio_tensor.cpu()
341
+ audio_length = audio_length.cpu()
342
+ traced_preprocessor = torch.jit.trace(
343
+ preprocessor, (audio_tensor, audio_length), strict=False
344
+ )
345
+ traced_preprocessor.eval()
346
+ preprocessor_inputs = [
347
+ ct.TensorType(
348
+ name="audio_signal",
349
+ shape=(1, ct.RangeDim(1, max_samples)),
350
+ dtype=np.float32,
351
+ ),
352
+ ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
353
+ ]
354
+ preprocessor_outputs = [
355
+ ct.TensorType(name="mel", dtype=np.float32),
356
+ ct.TensorType(name="mel_length", dtype=np.int32),
357
+ ]
358
+ preprocessor_model = _coreml_convert(
359
+ traced_preprocessor,
360
+ preprocessor_inputs,
361
+ preprocessor_outputs,
362
+ export_settings,
363
+ compute_units_override=pre_cu,
364
+ )
365
+ preprocessor_path = output_dir / "parakeet_eou_preprocessor.mlpackage"
366
+ _save_mlpackage(
367
+ preprocessor_model,
368
+ preprocessor_path,
369
+ f"Parakeet EOU preprocessor ({max_audio_seconds}s window)",
370
+ )
371
+
372
+ # === Export Encoder ===
373
+ typer.echo("Tracing and converting encoder…")
374
+ traced_encoder = torch.jit.trace(
375
+ encoder, (mel_ref, mel_length_ref), strict=False
376
+ )
377
+ traced_encoder.eval()
378
+ encoder_inputs = [
379
+ ct.TensorType(
380
+ name="mel", shape=_tensor_shape(mel_ref), dtype=np.float32
381
+ ),
382
+ ct.TensorType(name="mel_length", shape=(1,), dtype=np.int32),
383
+ ]
384
+ encoder_outputs = [
385
+ ct.TensorType(name="encoder", dtype=np.float32),
386
+ ct.TensorType(name="encoder_length", dtype=np.int32),
387
+ ct.TensorType(name="frame_times", dtype=np.float32),
388
+ ]
389
+ encoder_model = _coreml_convert(
390
+ traced_encoder,
391
+ encoder_inputs,
392
+ encoder_outputs,
393
+ export_settings,
394
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
395
+ )
396
+ encoder_path = output_dir / "parakeet_eou_encoder.mlpackage"
397
+ _save_mlpackage(
398
+ encoder_model,
399
+ encoder_path,
400
+ f"Parakeet EOU encoder ({max_audio_seconds}s window)",
401
+ )
402
+
403
+ # === Export Fused Mel+Encoder ===
404
+ typer.echo("Tracing and converting fused mel+encoder…")
405
+ mel_encoder = MelEncoderWrapper(preprocessor, encoder)
406
+ traced_mel_encoder = torch.jit.trace(
407
+ mel_encoder, (audio_tensor, audio_length), strict=False
408
+ )
409
+ traced_mel_encoder.eval()
410
+ mel_encoder_inputs = [
411
+ ct.TensorType(
412
+ name="audio_signal", shape=(1, max_samples), dtype=np.float32
413
+ ),
414
+ ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
415
+ ]
416
+ mel_encoder_outputs = [
417
+ ct.TensorType(name="encoder", dtype=np.float32),
418
+ ct.TensorType(name="encoder_length", dtype=np.int32),
419
+ ct.TensorType(name="frame_times", dtype=np.float32),
420
+ ]
421
+ mel_encoder_model = _coreml_convert(
422
+ traced_mel_encoder,
423
+ mel_encoder_inputs,
424
+ mel_encoder_outputs,
425
+ export_settings,
426
+ compute_units_override=melenc_cu,
427
+ )
428
+ mel_encoder_path = output_dir / "parakeet_eou_mel_encoder.mlpackage"
429
+ _save_mlpackage(
430
+ mel_encoder_model,
431
+ mel_encoder_path,
432
+ f"Parakeet EOU fused Mel+Encoder ({max_audio_seconds}s window)",
433
+ )
434
+
435
+ # === Export Decoder ===
436
+ typer.echo("Tracing and converting decoder…")
437
+ traced_decoder = torch.jit.trace(
438
+ decoder,
439
+ (targets, target_lengths, zero_state, zero_state),
440
+ strict=False,
441
+ )
442
+ traced_decoder.eval()
443
+ decoder_inputs = [
444
+ ct.TensorType(
445
+ name="targets", shape=_tensor_shape(targets), dtype=np.int32
446
+ ),
447
+ ct.TensorType(name="target_length", shape=(1,), dtype=np.int32),
448
+ ct.TensorType(
449
+ name="h_in", shape=_tensor_shape(zero_state), dtype=np.float32
450
+ ),
451
+ ct.TensorType(
452
+ name="c_in", shape=_tensor_shape(zero_state), dtype=np.float32
453
+ ),
454
+ ]
455
+ decoder_outputs = [
456
+ ct.TensorType(name="decoder", dtype=np.float32),
457
+ ct.TensorType(name="h_out", dtype=np.float32),
458
+ ct.TensorType(name="c_out", dtype=np.float32),
459
+ ]
460
+ decoder_model = _coreml_convert(
461
+ traced_decoder,
462
+ decoder_inputs,
463
+ decoder_outputs,
464
+ export_settings,
465
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
466
+ )
467
+ decoder_path = output_dir / "parakeet_eou_decoder.mlpackage"
468
+ _save_mlpackage(
469
+ decoder_model,
470
+ decoder_path,
471
+ "Parakeet EOU decoder (RNNT prediction network)",
472
+ )
473
+
474
+ # === Export Joint ===
475
+ typer.echo("Tracing and converting joint…")
476
+ traced_joint = torch.jit.trace(
477
+ joint,
478
+ (encoder_ref, decoder_ref),
479
+ strict=False,
480
+ )
481
+ traced_joint.eval()
482
+ joint_inputs = [
483
+ ct.TensorType(
484
+ name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32
485
+ ),
486
+ ct.TensorType(
487
+ name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32
488
+ ),
489
+ ]
490
+ joint_outputs = [
491
+ ct.TensorType(name="logits", dtype=np.float32),
492
+ ]
493
+ joint_model = _coreml_convert(
494
+ traced_joint,
495
+ joint_inputs,
496
+ joint_outputs,
497
+ export_settings,
498
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
499
+ )
500
+ joint_path = output_dir / "parakeet_eou_joint.mlpackage"
501
+ _save_mlpackage(
502
+ joint_model,
503
+ joint_path,
504
+ "Parakeet EOU joint network (RNNT)",
505
+ )
506
+
507
+ # === Export Joint Decision Head ===
508
+ typer.echo("Tracing and converting joint decision head…")
509
+ joint_decision = JointDecisionWrapper(joint, vocab_size=vocab_size)
510
+ traced_joint_decision = torch.jit.trace(
511
+ joint_decision,
512
+ (encoder_ref, decoder_ref),
513
+ strict=False,
514
+ )
515
+ traced_joint_decision.eval()
516
+ joint_decision_inputs = [
517
+ ct.TensorType(
518
+ name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32
519
+ ),
520
+ ct.TensorType(
521
+ name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32
522
+ ),
523
+ ]
524
+ joint_decision_outputs = [
525
+ ct.TensorType(name="token_id", dtype=np.int32),
526
+ ct.TensorType(name="token_prob", dtype=np.float32),
527
+ ]
528
+ joint_decision_model = _coreml_convert(
529
+ traced_joint_decision,
530
+ joint_decision_inputs,
531
+ joint_decision_outputs,
532
+ export_settings,
533
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
534
+ )
535
+ joint_decision_path = output_dir / "parakeet_eou_joint_decision.mlpackage"
536
+ _save_mlpackage(
537
+ joint_decision_model,
538
+ joint_decision_path,
539
+ "Parakeet EOU joint + decision head (softmax, argmax)",
540
+ )
541
+
542
+ # === Export Single-Step Joint Decision ===
543
+ typer.echo("Tracing and converting single-step joint decision…")
544
+ jd_single = JointDecisionSingleStep(joint, vocab_size=vocab_size)
545
+ # Create single-step slices from refs
546
+ enc_step = encoder_ref[:, :, :1].contiguous()
547
+ dec_step = decoder_ref[:, :, :1].contiguous()
548
+ traced_jd_single = torch.jit.trace(
549
+ jd_single,
550
+ (enc_step, dec_step),
551
+ strict=False,
552
+ )
553
+ traced_jd_single.eval()
554
+ jd_single_inputs = [
555
+ ct.TensorType(
556
+ name="encoder_step",
557
+ shape=(1, enc_step.shape[1], 1),
558
+ dtype=np.float32,
559
+ ),
560
+ ct.TensorType(
561
+ name="decoder_step",
562
+ shape=(1, dec_step.shape[1], 1),
563
+ dtype=np.float32,
564
+ ),
565
+ ]
566
+ jd_single_outputs = [
567
+ ct.TensorType(name="token_id", dtype=np.int32),
568
+ ct.TensorType(name="token_prob", dtype=np.float32),
569
+ ct.TensorType(name="top_k_ids", dtype=np.int32),
570
+ ct.TensorType(name="top_k_logits", dtype=np.float32),
571
+ ]
572
+ jd_single_model = _coreml_convert(
573
+ traced_jd_single,
574
+ jd_single_inputs,
575
+ jd_single_outputs,
576
+ export_settings,
577
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
578
+ )
579
+ jd_single_path = output_dir / "parakeet_eou_joint_decision_single_step.mlpackage"
580
+ _save_mlpackage(
581
+ jd_single_model,
582
+ jd_single_path,
583
+ "Parakeet EOU single-step joint decision (current frame)",
584
+ )
585
+
586
+ # === Save Metadata ===
587
+ metadata: Dict[str, object] = {
588
+ "model_id": model_id,
589
+ "model_name": "parakeet_realtime_eou_120m-v1",
590
+ "model_class": type(asr_model).__name__,
591
+ "encoder_class": type(asr_model.encoder).__name__,
592
+ "sample_rate": sample_rate,
593
+ "max_audio_seconds": export_settings.max_audio_seconds,
594
+ "max_audio_samples": max_samples,
595
+ "max_symbol_steps": export_settings.max_symbol_steps,
596
+ "vocab_size": vocab_size,
597
+ "vocab_with_blank": vocab_size + 1,
598
+ "decoder_hidden": decoder_hidden,
599
+ "decoder_layers": decoder_layers,
600
+ "num_extra_outputs": num_extra,
601
+ "has_eou_token": True,
602
+ "checkpoint": checkpoint_meta,
603
+ "coreml": {
604
+ "compute_units": export_settings.compute_units.name,
605
+ "compute_precision": (
606
+ export_settings.compute_precision.name
607
+ if export_settings.compute_precision is not None
608
+ else "FLOAT32"
609
+ ),
610
+ },
611
+ "components": {
612
+ "preprocessor": {
613
+ "inputs": {
614
+ "audio_signal": [1, max_samples],
615
+ "audio_length": [1],
616
+ },
617
+ "outputs": {
618
+ "mel": list(_tensor_shape(mel_ref)),
619
+ "mel_length": [1],
620
+ },
621
+ "path": preprocessor_path.name,
622
+ },
623
+ "encoder": {
624
+ "inputs": {
625
+ "mel": list(_tensor_shape(mel_ref)),
626
+ "mel_length": [1],
627
+ },
628
+ "outputs": {
629
+ "encoder": list(_tensor_shape(encoder_ref)),
630
+ "encoder_length": [1],
631
+ "frame_times": [1, _tensor_shape(encoder_ref)[2]],
632
+ },
633
+ "path": encoder_path.name,
634
+ },
635
+ "mel_encoder": {
636
+ "inputs": {
637
+ "audio_signal": [1, max_samples],
638
+ "audio_length": [1],
639
+ },
640
+ "outputs": {
641
+ "encoder": list(_tensor_shape(encoder_ref)),
642
+ "encoder_length": [1],
643
+ "frame_times": [1, _tensor_shape(encoder_ref)[2]],
644
+ },
645
+ "path": mel_encoder_path.name,
646
+ },
647
+ "decoder": {
648
+ "inputs": {
649
+ "targets": list(_tensor_shape(targets)),
650
+ "target_length": [1],
651
+ "h_in": list(_tensor_shape(zero_state)),
652
+ "c_in": list(_tensor_shape(zero_state)),
653
+ },
654
+ "outputs": {
655
+ "decoder": list(_tensor_shape(decoder_ref)),
656
+ "h_out": list(_tensor_shape(h_ref)),
657
+ "c_out": list(_tensor_shape(c_ref)),
658
+ },
659
+ "path": decoder_path.name,
660
+ },
661
+ "joint": {
662
+ "inputs": {
663
+ "encoder": list(_tensor_shape(encoder_ref)),
664
+ "decoder": list(_tensor_shape(decoder_ref)),
665
+ },
666
+ "outputs": {
667
+ "logits": list(_tensor_shape(joint_ref)),
668
+ },
669
+ "path": joint_path.name,
670
+ },
671
+ "joint_decision": {
672
+ "inputs": {
673
+ "encoder": list(_tensor_shape(encoder_ref)),
674
+ "decoder": list(_tensor_shape(decoder_ref)),
675
+ },
676
+ "outputs": {
677
+ "token_id": [
678
+ _tensor_shape(encoder_ref)[0],
679
+ _tensor_shape(encoder_ref)[2],
680
+ _tensor_shape(decoder_ref)[2],
681
+ ],
682
+ "token_prob": [
683
+ _tensor_shape(encoder_ref)[0],
684
+ _tensor_shape(encoder_ref)[2],
685
+ _tensor_shape(decoder_ref)[2],
686
+ ],
687
+ },
688
+ "path": joint_decision_path.name,
689
+ },
690
+ "joint_decision_single_step": {
691
+ "inputs": {
692
+ "encoder_step": [1, _tensor_shape(encoder_ref)[1], 1],
693
+ "decoder_step": [1, _tensor_shape(decoder_ref)[1], 1],
694
+ },
695
+ "outputs": {
696
+ "token_id": [1, 1, 1],
697
+ "token_prob": [1, 1, 1],
698
+ "top_k_ids": [1, 1, 1, 64],
699
+ "top_k_logits": [1, 1, 1, 64],
700
+ },
701
+ "path": jd_single_path.name,
702
+ },
703
+ },
704
+ }
705
+
706
+ # Export tokenizer vocab if available
707
+ try:
708
+ tokenizer = asr_model.tokenizer
709
+ vocab = {
710
+ "blank_id": int(asr_model.decoder.blank_idx),
711
+ "vocab_size": vocab_size,
712
+ }
713
+ # Try to get special tokens
714
+ if hasattr(tokenizer, "tokenizer"):
715
+ inner_tokenizer = tokenizer.tokenizer
716
+ if hasattr(inner_tokenizer, "get_vocab"):
717
+ full_vocab = inner_tokenizer.get_vocab()
718
+ # Find EOU token
719
+ eou_token = None
720
+ for token, idx in full_vocab.items():
721
+ if "<EOU>" in token.upper() or "eou" in token.lower():
722
+ eou_token = {"token": token, "id": idx}
723
+ break
724
+ if eou_token:
725
+ vocab["eou_token"] = eou_token
726
+ metadata["tokenizer"] = vocab
727
+ except Exception as e:
728
+ typer.echo(f"Warning: Could not export tokenizer info: {e}")
729
+
730
+ metadata_path = output_dir / "metadata.json"
731
+ metadata_path.write_text(json.dumps(metadata, indent=2))
732
+ typer.echo(f"\nExport complete. Metadata written to {metadata_path}")
733
+ typer.echo(f"Output directory: {output_dir}")
734
+
735
+ finally:
736
+ asr_model.decoder._rnnt_export = decoder_export_flag
737
+
738
+
739
+ if __name__ == "__main__":
740
+ app()
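The exported packages are meant to be driven by a host-side greedy RNNT loop: run the decoder once per emitted token, feeding its LSTM state back in, and query the single-step joint decision per encoder frame. Below is a minimal sketch with coremltools, assuming the default output directory and the package/metadata names written above; it is illustrative host code, not part of this repository.

import json
import numpy as np
import coremltools as ct

meta = json.loads(open("parakeet_eou_coreml/metadata.json").read())
blank_id = meta["tokenizer"]["blank_id"]  # written by the tokenizer-export step above
decoder = ct.models.MLModel("parakeet_eou_coreml/parakeet_eou_decoder.mlpackage")
joint = ct.models.MLModel(
    "parakeet_eou_coreml/parakeet_eou_joint_decision_single_step.mlpackage"
)

def greedy_decode(encoder_out: np.ndarray, encoder_len: int, max_symbols: int = 5):
    # encoder_out: [1, D, T] from the mel_encoder package.
    h = np.zeros((1, 1, 640), dtype=np.float32)
    c = np.zeros((1, 1, 640), dtype=np.float32)
    last_token, tokens = blank_id, []
    for t in range(encoder_len):
        enc_step = encoder_out[:, :, t:t + 1].astype(np.float32)
        for _ in range(max_symbols):  # cap symbols emitted per frame
            dec = decoder.predict({
                "targets": np.array([[last_token]], dtype=np.int32),
                "target_length": np.array([1], dtype=np.int32),
                "h_in": h,
                "c_in": c,
            })
            jd = joint.predict({
                "encoder_step": enc_step,
                "decoder_step": dec["decoder"].astype(np.float32),
            })
            token = int(jd["token_id"].ravel()[0])
            if token == blank_id:
                break  # blank: advance to the next encoder frame
            tokens.append(token)
            last_token = token
            h, c = dec["h_out"], dec["c_out"]  # update state only on emission
    return tokens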
160ms/convert_streaming_encoder.py ADDED
@@ -0,0 +1,193 @@
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ import coremltools as ct
5
+ import numpy as np
6
+ from pathlib import Path
+ from typing import Tuple
+ import argparse
16
+ from nemo.collections.asr.models import EncDecRNNTBPEModel
17
+
+
20
+ class LoopbackEncoderWrapper(nn.Module):
21
+ """
22
+ Wraps the entire Parakeet Encoder (PreEncode + Conformer) for CoreML Loopback Streaming.
23
+
24
+ Inputs:
25
+ - audio_signal: [B, D, T] (Mel spectrogram chunk)
26
+ - audio_length: [B]
27
+ - pre_cache: [B, D, pre_cache_size] (Previous audio context)
28
+ - cache_last_channel: [layers, B, cache_size, hidden]
29
+ - cache_last_time: [layers, B, hidden, time_cache]
30
+ - cache_last_channel_len: [B]
31
+
32
+ Outputs:
33
+ - encoded_output: [B, D_out, T_out]
34
+ - encoded_length: [B]
35
+ - new_pre_cache: [B, D, pre_cache_size]
36
+ - new_cache_last_channel
37
+ - new_cache_last_time
38
+ - new_cache_last_channel_len
39
+ """
40
+ def __init__(self, encoder, pre_cache_size=16):
41
+ super().__init__()
42
+ self.encoder = encoder
43
+ self.pre_cache_size = pre_cache_size
44
+
45
+ def forward(
46
+ self,
47
+ audio_signal: torch.Tensor,
48
+ audio_length: torch.Tensor,
49
+ pre_cache: torch.Tensor,
50
+ cache_last_channel: torch.Tensor,
51
+ cache_last_time: torch.Tensor,
52
+ cache_last_channel_len: torch.Tensor
53
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
54
+
55
+ # 1. Prepend pre_cache to audio_signal
56
+ # audio_signal: [B, D, T]
57
+ # pre_cache: [B, D, T_cache]
58
+ full_input = torch.cat([pre_cache, audio_signal], dim=2)
59
+ full_length = audio_length + self.pre_cache_size
60
+
61
+ # 2. Extract NEW pre_cache (last N frames of full_input)
62
+ # Note: We do this BEFORE processing because we want the raw audio context
63
+ new_pre_cache = full_input[:, :, -self.pre_cache_size:]
64
+
65
+ # 3. Process with Encoder
66
+ # Reconstruct NeMo cache object
67
+ current_cache = [cache_last_channel, cache_last_time, cache_last_channel_len]
68
+
69
+ encoded, encoded_len, new_cache_channel, new_cache_time, new_cache_len = self.encoder.cache_aware_stream_step(
70
+ processed_signal=full_input,
71
+ processed_signal_length=full_length,
72
+ cache_last_channel=cache_last_channel,
73
+ cache_last_time=cache_last_time,
74
+ cache_last_channel_len=cache_last_channel_len
75
+ )
76
+
77
+ # 4. Output framing: NeMo's cache_aware_stream_step emits only the valid
+ # output frames for the chunk. Because this wrapper is stateless across
+ # calls, the past audio context must be supplied explicitly, so passing
+ # (pre_cache + chunk) as the input window is the correct framing.
86
+
87
+ # Cast lengths to Int32 for CoreML
88
+ encoded_len_32 = encoded_len.to(dtype=torch.int32)
89
+ new_channel_len_32 = new_cache_len.to(dtype=torch.int32)
90
+
91
+ return encoded, encoded_len_32, new_pre_cache, new_cache_channel, new_cache_time, new_channel_len_32
92
+
93
+ def _coreml_convert(
94
+ traced_model,
95
+ inputs,
96
+ outputs,
97
+ compute_units=ct.ComputeUnit.CPU_ONLY
98
+ ):
99
+ return ct.convert(
100
+ traced_model,
101
+ inputs=inputs,
102
+ outputs=outputs,
103
+ compute_units=compute_units,
104
+ minimum_deployment_target=ct.target.macOS14,
105
+ )
106
+
107
+ def main():
108
+ model_id: str = "nvidia/parakeet_realtime_eou_120m-v1"
+ output_dir: str = "temp_swift_models/StreamingLoopback"
+ output_path = Path(output_dir)
+ output_path.mkdir(parents=True, exist_ok=True)
+
+ # Parse CLI arguments before the (potentially slow) model download.
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--chunk-frames", type=int, default=17, help="Number of frames in the input chunk (e.g. 17 for 160ms, 129 for 1.28s)")
+ args = parser.parse_args()
+
+ print(f"Loading model: {model_id}...")
+ asr_model = EncDecRNNTBPEModel.from_pretrained(model_name=model_id)
+ asr_model.eval()
120
+
121
+ encoder = asr_model.encoder
122
+
123
+ # --- Configuration ---
124
+ # 160ms chunk = 16 frames (but preprocessor produces 17 with padding/centering)
125
+ # 1.28s chunk = 128 frames (preprocessor produces 129)
126
+ chunk_size_in = args.chunk_frames
127
+ mel_dim = 128
128
+ hidden_dim = encoder.d_model # 512
129
+ num_layers = len(encoder.layers) # 17
130
+
131
+ # Cache sizes
132
+ cache_channel_size = 70
133
+ cache_time_size = 8
134
+ pre_cache_size = 16
135
+
136
+ print(f"Config: Chunk={chunk_size_in}, Mel={mel_dim}, Hidden={hidden_dim}, Layers={num_layers}")
137
+ print(f"Cache: Channel={cache_channel_size}, Time={cache_time_size}, Pre={pre_cache_size}")
138
+
139
+ # --- Wrapper ---
140
+ wrapper = LoopbackEncoderWrapper(encoder, pre_cache_size=pre_cache_size)
141
+ wrapper.eval()
142
+
143
+ # --- Test Inputs (for Tracing) ---
144
+ batch_size = 1
145
+ test_mel = torch.randn(batch_size, mel_dim, chunk_size_in)
146
+ test_mel_len = torch.tensor([chunk_size_in], dtype=torch.int32)
147
+ test_pre_cache = torch.zeros(batch_size, mel_dim, pre_cache_size)
148
+
149
+ # Initial Cache (Zeros)
150
+ test_cache_channel = torch.zeros(num_layers, batch_size, cache_channel_size, hidden_dim)
151
+ test_cache_time = torch.zeros(num_layers, batch_size, hidden_dim, cache_time_size)
152
+ test_cache_len = torch.zeros(batch_size, dtype=torch.int32)
153
+
154
+ print("Tracing model...")
155
+ traced_model = torch.jit.trace(
156
+ wrapper,
157
+ (test_mel, test_mel_len, test_pre_cache, test_cache_channel, test_cache_time, test_cache_len),
158
+ strict=False
159
+ )
160
+
161
+ # --- CoreML Conversion ---
162
+ print("Converting to CoreML...")
163
+
164
+ inputs = [
165
+ ct.TensorType(name="audio_signal", shape=(1, 128, chunk_size_in), dtype=np.float32),
166
+ ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
167
+ ct.TensorType(name="pre_cache", shape=(1, 128, pre_cache_size), dtype=np.float32),
168
+ ct.TensorType(name="cache_last_channel", shape=(num_layers, 1, cache_channel_size, hidden_dim), dtype=np.float32),
169
+ ct.TensorType(name="cache_last_time", shape=(num_layers, 1, hidden_dim, cache_time_size), dtype=np.float32),
170
+ ct.TensorType(name="cache_last_channel_len", shape=(1,), dtype=np.int32),
171
+ ]
172
+
173
+ outputs = [
174
+ ct.TensorType(name="encoded_output", dtype=np.float32),
175
+ ct.TensorType(name="encoded_length", dtype=np.int32),
176
+ ct.TensorType(name="new_pre_cache", dtype=np.float32),
177
+ ct.TensorType(name="new_cache_last_channel", dtype=np.float32),
178
+ ct.TensorType(name="new_cache_last_time", dtype=np.float32),
179
+ ct.TensorType(name="new_cache_last_channel_len", dtype=np.int32),
180
+ ]
181
+
182
+ mlmodel = _coreml_convert(traced_model, inputs, outputs)
183
+
184
+ save_path = output_path / "streaming_encoder.mlpackage"
185
+ mlmodel.save(str(save_path))
186
+ print(f"Saved: {save_path}")
187
+
188
+ # The preprocessor, decoder, and joint are exported separately
+ # (see convert_parakeet_eou.py); this script covers only the encoder loopback.
191
+
192
+ if __name__ == "__main__":
193
+ main()
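At runtime the loopback encoder is stateless: the host owns pre_cache and the two conformer caches, and feeds each output straight back in with the next mel chunk. A minimal driver sketch for the 160ms configuration above; mel_chunks is a hypothetical iterable of [1, 128, 17] mel chunks, and the package path matches this script's default.

import numpy as np
import coremltools as ct

enc = ct.models.MLModel("temp_swift_models/StreamingLoopback/streaming_encoder.mlpackage")

num_layers, hidden, chunk = 17, 512, 17
pre_cache = np.zeros((1, 128, 16), dtype=np.float32)
cache_channel = np.zeros((num_layers, 1, 70, hidden), dtype=np.float32)
cache_time = np.zeros((num_layers, 1, hidden, 8), dtype=np.float32)
cache_len = np.zeros((1,), dtype=np.int32)

for mel_chunk in mel_chunks:  # hypothetical mel source, each [1, 128, 17]
    out = enc.predict({
        "audio_signal": mel_chunk.astype(np.float32),
        "audio_length": np.array([chunk], dtype=np.int32),
        "pre_cache": pre_cache,
        "cache_last_channel": cache_channel,
        "cache_last_time": cache_time,
        "cache_last_channel_len": cache_len,
    })
    encoded = out["encoded_output"]  # [1, D_out, T_out] for this chunk
    # Loop every cache output back in for the next call.
    pre_cache = out["new_pre_cache"]
    cache_channel = out["new_cache_last_channel"]
    cache_time = out["new_cache_last_time"]
    cache_len = out["new_cache_last_channel_len"].astype(np.int32)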
160ms/decoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3996975a8cbc1949159c55605b3132b39b2484f51acbd55d796d93c70de02b49
3
+ size 243
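The binary artifacts in this commit are stored as Git LFS pointers: three-line stubs recording the spec version, the blob's sha256 oid, and its byte size, while the actual bytes live in LFS storage. A quick integrity check of a fetched blob against its pointer fields (hypothetical helper, standard library only):

import hashlib

def verify_lfs_blob(blob_path: str, expected_oid: str, expected_size: int) -> None:
    # Compare the downloaded blob against the oid/size in the LFS pointer.
    data = open(blob_path, "rb").read()
    assert len(data) == expected_size, "size mismatch"
    assert hashlib.sha256(data).hexdigest() == expected_oid, "oid mismatch"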
160ms/decoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3ccbff963d8cf07e2be2bd56ea3384a89ea49628922c6bd95ff62e2ae57dc34
3
+ size 497
160ms/decoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,118 @@
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "shortDescription" : "Parakeet EOU decoder (RNNT prediction network)",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Float32",
10
+ "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
11
+ "shortDescription" : "",
12
+ "shape" : "[1, 640, 1]",
13
+ "name" : "decoder",
14
+ "type" : "MultiArray"
15
+ },
16
+ {
17
+ "hasShapeFlexibility" : "0",
18
+ "isOptional" : "0",
19
+ "dataType" : "Float32",
20
+ "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
21
+ "shortDescription" : "",
22
+ "shape" : "[1, 1, 640]",
23
+ "name" : "h_out",
24
+ "type" : "MultiArray"
25
+ },
26
+ {
27
+ "hasShapeFlexibility" : "0",
28
+ "isOptional" : "0",
29
+ "dataType" : "Float32",
30
+ "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
31
+ "shortDescription" : "",
32
+ "shape" : "[1, 1, 640]",
33
+ "name" : "c_out",
34
+ "type" : "MultiArray"
35
+ }
36
+ ],
37
+ "storagePrecision" : "Float16",
38
+ "modelParameters" : [
39
+
40
+ ],
41
+ "author" : "Fluid Inference",
42
+ "specificationVersion" : 8,
43
+ "mlProgramOperationTypeHistogram" : {
44
+ "Ios17.squeeze" : 2,
45
+ "Ios17.gather" : 1,
46
+ "Ios17.cast" : 6,
47
+ "Ios17.lstm" : 1,
48
+ "Ios17.transpose" : 2,
49
+ "Identity" : 1,
50
+ "Ios17.expandDims" : 2
51
+ },
52
+ "computePrecision" : "Mixed (Float16, Float32, Int16, Int32)",
53
+ "isUpdatable" : "0",
54
+ "stateSchema" : [
55
+
56
+ ],
57
+ "availability" : {
58
+ "macOS" : "14.0",
59
+ "tvOS" : "17.0",
60
+ "visionOS" : "1.0",
61
+ "watchOS" : "10.0",
62
+ "iOS" : "17.0",
63
+ "macCatalyst" : "17.0"
64
+ },
65
+ "modelType" : {
66
+ "name" : "MLModelType_mlProgram"
67
+ },
68
+ "inputSchema" : [
69
+ {
70
+ "hasShapeFlexibility" : "0",
71
+ "isOptional" : "0",
72
+ "dataType" : "Int32",
73
+ "formattedType" : "MultiArray (Int32 1 × 1)",
74
+ "shortDescription" : "",
75
+ "shape" : "[1, 1]",
76
+ "name" : "targets",
77
+ "type" : "MultiArray"
78
+ },
79
+ {
80
+ "hasShapeFlexibility" : "0",
81
+ "isOptional" : "0",
82
+ "dataType" : "Int32",
83
+ "formattedType" : "MultiArray (Int32 1)",
84
+ "shortDescription" : "",
85
+ "shape" : "[1]",
86
+ "name" : "target_length",
87
+ "type" : "MultiArray"
88
+ },
89
+ {
90
+ "hasShapeFlexibility" : "0",
91
+ "isOptional" : "0",
92
+ "dataType" : "Float32",
93
+ "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
94
+ "shortDescription" : "",
95
+ "shape" : "[1, 1, 640]",
96
+ "name" : "h_in",
97
+ "type" : "MultiArray"
98
+ },
99
+ {
100
+ "hasShapeFlexibility" : "0",
101
+ "isOptional" : "0",
102
+ "dataType" : "Float32",
103
+ "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
104
+ "shortDescription" : "",
105
+ "shape" : "[1, 1, 640]",
106
+ "name" : "c_in",
107
+ "type" : "MultiArray"
108
+ }
109
+ ],
110
+ "userDefinedMetadata" : {
111
+ "com.github.apple.coremltools.version" : "8.3.0",
112
+ "com.github.apple.coremltools.source" : "torch==2.4.0",
113
+ "com.github.apple.coremltools.source_dialect" : "TorchScript"
114
+ },
115
+ "generatedClassName" : "parakeet_eou_decoder",
116
+ "method" : "predict"
117
+ }
118
+ ]
160ms/decoder.mlmodelc/model.mil ADDED
@@ -0,0 +1,45 @@
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, 1, 640]> c_in, tensor<fp32, [1, 1, 640]> h_in, tensor<int32, [1]> target_length, tensor<int32, [1, 1]> targets) {
5
+ tensor<int32, []> y_axis_0 = const()[name = tensor<string, []>("y_axis_0"), val = tensor<int32, []>(0)];
6
+ tensor<int32, []> y_batch_dims_0 = const()[name = tensor<string, []>("y_batch_dims_0"), val = tensor<int32, []>(0)];
7
+ tensor<bool, []> y_validate_indices_0 = const()[name = tensor<string, []>("y_validate_indices_0"), val = tensor<bool, []>(false)];
8
+ tensor<fp16, [1027, 640]> module_prediction_embed_weight_to_fp16 = const()[name = tensor<string, []>("module_prediction_embed_weight_to_fp16"), val = tensor<fp16, [1027, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
9
+ tensor<string, []> targets_to_int16_dtype_0 = const()[name = tensor<string, []>("targets_to_int16_dtype_0"), val = tensor<string, []>("int16")];
10
+ tensor<int16, [1, 1]> targets_to_int16 = cast(dtype = targets_to_int16_dtype_0, x = targets)[name = tensor<string, []>("cast_8")];
11
+ tensor<fp16, [1, 1, 640]> y_cast_fp16_cast_uint16 = gather(axis = y_axis_0, batch_dims = y_batch_dims_0, indices = targets_to_int16, validate_indices = y_validate_indices_0, x = module_prediction_embed_weight_to_fp16)[name = tensor<string, []>("y_cast_fp16_cast_uint16")];
12
+ tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
13
+ tensor<int32, [1]> input_lstm_h0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_h0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
14
+ tensor<string, []> h_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("h_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
15
+ tensor<fp16, [1, 1, 640]> h_in_to_fp16 = cast(dtype = h_in_to_fp16_dtype_0, x = h_in)[name = tensor<string, []>("cast_7")];
16
+ tensor<fp16, [1, 640]> input_lstm_h0_squeeze_cast_fp16 = squeeze(axes = input_lstm_h0_squeeze_axes_0, x = h_in_to_fp16)[name = tensor<string, []>("input_lstm_h0_squeeze_cast_fp16")];
17
+ tensor<int32, [1]> input_lstm_c0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_c0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
18
+ tensor<string, []> c_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("c_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
19
+ tensor<fp16, [1, 1, 640]> c_in_to_fp16 = cast(dtype = c_in_to_fp16_dtype_0, x = c_in)[name = tensor<string, []>("cast_6")];
20
+ tensor<fp16, [1, 640]> input_lstm_c0_squeeze_cast_fp16 = squeeze(axes = input_lstm_c0_squeeze_axes_0, x = c_in_to_fp16)[name = tensor<string, []>("input_lstm_c0_squeeze_cast_fp16")];
21
+ tensor<string, []> input_direction_0 = const()[name = tensor<string, []>("input_direction_0"), val = tensor<string, []>("forward")];
22
+ tensor<bool, []> input_output_sequence_0 = const()[name = tensor<string, []>("input_output_sequence_0"), val = tensor<bool, []>(true)];
23
+ tensor<string, []> input_recurrent_activation_0 = const()[name = tensor<string, []>("input_recurrent_activation_0"), val = tensor<string, []>("sigmoid")];
24
+ tensor<string, []> input_cell_activation_0 = const()[name = tensor<string, []>("input_cell_activation_0"), val = tensor<string, []>("tanh")];
25
+ tensor<string, []> input_activation_0 = const()[name = tensor<string, []>("input_activation_0"), val = tensor<string, []>("tanh")];
26
+ tensor<fp16, [2560, 640]> concat_1_to_fp16 = const()[name = tensor<string, []>("concat_1_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1314688)))];
27
+ tensor<fp16, [2560, 640]> concat_2_to_fp16 = const()[name = tensor<string, []>("concat_2_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4591552)))];
28
+ tensor<fp16, [2560]> concat_0_to_fp16 = const()[name = tensor<string, []>("concat_0_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7868416)))];
29
+ tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = y_cast_fp16_cast_uint16)[name = tensor<string, []>("transpose_2")];
30
+ tensor<fp16, [1, 1, 640]> input_cast_fp16_0, tensor<fp16, [1, 640]> input_cast_fp16_1, tensor<fp16, [1, 640]> input_cast_fp16_2 = lstm(activation = input_activation_0, bias = concat_0_to_fp16, cell_activation = input_cell_activation_0, direction = input_direction_0, initial_c = input_lstm_c0_squeeze_cast_fp16, initial_h = input_lstm_h0_squeeze_cast_fp16, output_sequence = input_output_sequence_0, recurrent_activation = input_recurrent_activation_0, weight_hh = concat_2_to_fp16, weight_ih = concat_1_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
31
+ tensor<int32, [1]> obj_3_axes_0 = const()[name = tensor<string, []>("obj_3_axes_0"), val = tensor<int32, [1]>([0])];
32
+ tensor<fp16, [1, 1, 640]> obj_3_cast_fp16 = expand_dims(axes = obj_3_axes_0, x = input_cast_fp16_1)[name = tensor<string, []>("obj_3_cast_fp16")];
33
+ tensor<string, []> obj_3_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_3_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
34
+ tensor<int32, [1]> obj_axes_0 = const()[name = tensor<string, []>("obj_axes_0"), val = tensor<int32, [1]>([0])];
35
+ tensor<fp16, [1, 1, 640]> obj_cast_fp16 = expand_dims(axes = obj_axes_0, x = input_cast_fp16_2)[name = tensor<string, []>("obj_cast_fp16")];
36
+ tensor<string, []> obj_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
37
+ tensor<int32, [3]> transpose_0_perm_0 = const()[name = tensor<string, []>("transpose_0_perm_0"), val = tensor<int32, [3]>([1, 2, 0])];
38
+ tensor<string, []> transpose_0_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("transpose_0_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
39
+ tensor<fp16, [1, 640, 1]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = input_cast_fp16_0)[name = tensor<string, []>("transpose_1")];
40
+ tensor<fp32, [1, 640, 1]> decoder = cast(dtype = transpose_0_cast_fp16_to_fp32_dtype_0, x = transpose_0_cast_fp16)[name = tensor<string, []>("cast_3")];
41
+ tensor<fp32, [1, 1, 640]> c_out = cast(dtype = obj_cast_fp16_to_fp32_dtype_0, x = obj_cast_fp16)[name = tensor<string, []>("cast_4")];
42
+ tensor<fp32, [1, 1, 640]> h_out = cast(dtype = obj_3_cast_fp16_to_fp32_dtype_0, x = obj_3_cast_fp16)[name = tensor<string, []>("cast_5")];
43
+ tensor<int32, [1]> target_length_tmp = identity(x = target_length)[name = tensor<string, []>("target_length_tmp")];
44
+ } -> (decoder, h_out, c_out);
45
+ }
160ms/decoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b4cacecdcd9df79ab1e56de67230baf5a8664d2afe0bb8f3408eefa972cb2f4
3
+ size 7873600
160ms/decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09f2dbd1f6a06faa6995f71d4b25d7c446996b6059cfac5ecc889853bdc7c6e5
3
+ size 6728
160ms/decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b4cacecdcd9df79ab1e56de67230baf5a8664d2afe0bb8f3408eefa972cb2f4
3
+ size 7873600
160ms/decoder.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "8201D73A-2B5D-488C-9C2B-7E2E75DF700D": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "F8EEBE8D-F17D-4556-B8DF-9BC11701B36D": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "F8EEBE8D-F17D-4556-B8DF-9BC11701B36D"
18
+ }
160ms/individual_components.py ADDED
@@ -0,0 +1,250 @@
1
+ #!/usr/bin/env python3
2
+ """Export Parakeet Realtime EOU 120M RNNT components into CoreML."""
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Optional, Tuple
8
+
9
+ import coremltools as ct
10
+ import torch
11
+
12
+
13
+ @dataclass
14
+ class ExportSettings:
15
+ output_dir: Path
16
+ compute_units: ct.ComputeUnit
17
+ deployment_target: Optional[ct.target]
18
+ compute_precision: Optional[ct.precision]
19
+ max_audio_seconds: float
20
+ max_symbol_steps: int
21
+
22
+
23
+ class PreprocessorWrapper(torch.nn.Module):
24
+ """Wrapper for the audio preprocessor (mel spectrogram extraction)."""
25
+
26
+ def __init__(self, module: torch.nn.Module) -> None:
27
+ super().__init__()
28
+ self.module = module
29
+
30
+ def forward(
31
+ self, audio_signal: torch.Tensor, length: torch.Tensor
32
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
33
+ mel, mel_length = self.module(
34
+ input_signal=audio_signal, length=length.to(dtype=torch.long)
35
+ )
36
+ return mel, mel_length
37
+
38
+
39
+ class EncoderWrapper(torch.nn.Module):
40
+ """Wrapper for the cache-aware FastConformer encoder.
41
+
42
+ Note: For the realtime EOU model, the encoder is cache-aware which means
43
+ it can operate in a streaming fashion. For CoreML export, we export
44
+ without cache state for simplicity (full-context mode).
45
+ """
46
+
47
+ def __init__(self, module: torch.nn.Module) -> None:
48
+ super().__init__()
49
+ self.module = module
50
+
51
+ def forward(
52
+ self, features: torch.Tensor, length: torch.Tensor
53
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
54
+ encoded, encoded_lengths = self.module(
55
+ audio_signal=features, length=length.to(dtype=torch.long)
56
+ )
57
+ # Synthesize per-frame timestamps (seconds) using the 80 ms encoder stride.
58
+ # Shape: [T_enc] (one timestamp per encoder frame)
59
+ frame_times = (
60
+ torch.arange(encoded.shape[-1], device=encoded.device, dtype=torch.float32)
61
+ * 0.08
62
+ )
63
+ return encoded, encoded_lengths, frame_times
64
+
65
+
66
+ class DecoderWrapper(torch.nn.Module):
67
+ """Wrapper for the RNNT prediction network (decoder)."""
68
+
69
+ def __init__(self, module: torch.nn.Module) -> None:
70
+ super().__init__()
71
+ self.module = module
72
+
73
+ def forward(
74
+ self,
75
+ targets: torch.Tensor,
76
+ target_lengths: torch.Tensor,
77
+ h_in: torch.Tensor,
78
+ c_in: torch.Tensor,
79
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
80
+ state = [h_in, c_in]
81
+ decoder_output, _, new_state = self.module(
82
+ targets=targets.to(dtype=torch.long),
83
+ target_length=target_lengths.to(dtype=torch.long),
84
+ states=state,
85
+ )
86
+ return decoder_output, new_state[0], new_state[1]
87
+
88
+
89
+ class JointWrapper(torch.nn.Module):
+     """Wrapper for the RNNT joint network.
+
+     Note: Unlike Parakeet TDT v3, the realtime EOU model does NOT have
+     duration outputs (num_extra_outputs). The joint network outputs only
+     token logits over the vocabulary + blank.
+     """
+
+     def __init__(self, module: torch.nn.Module) -> None:
+         super().__init__()
+         self.module = module
+
+     def forward(
+         self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor
+     ) -> torch.Tensor:
+         # Inputs: encoder_outputs [B, D, T], decoder_outputs [B, D, U].
+         # Transpose to match what the projection layers expect.
+         encoder_outputs = encoder_outputs.transpose(1, 2)  # [B, T, D]
+         decoder_outputs = decoder_outputs.transpose(1, 2)  # [B, U, D]
+
+         # Apply projections
+         enc_proj = self.module.enc(encoder_outputs)  # [B, T, joint_hidden]
+         dec_proj = self.module.pred(decoder_outputs)  # [B, U, joint_hidden]
+
+         # Explicit broadcasting along T and U to avoid converter ambiguity
+         x = enc_proj.unsqueeze(2) + dec_proj.unsqueeze(1)  # [B, T, U, joint_hidden]
+         x = self.module.joint_net[0](x)  # ReLU
+         x = self.module.joint_net[1](x)  # Dropout (no-op in eval)
+         out = self.module.joint_net[2](x)  # Linear -> logits [B, T, U, vocab+blank]
+         return out
+
+
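# Aside (not part of the committed file): shape check for the explicit
# unsqueeze-and-add broadcast, using joint_hidden = 640 as in the exported
# weights and arbitrary T/U.
import torch

B, T, U, H = 1, 16, 3, 640
enc_proj = torch.zeros(B, T, H)
dec_proj = torch.zeros(B, U, H)
x = enc_proj.unsqueeze(2) + dec_proj.unsqueeze(1)
print(x.shape)  # torch.Size([1, 16, 3, 640])
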
+ class MelEncoderWrapper(torch.nn.Module):
+     """Fused wrapper: waveform -> mel -> encoder.
+
+     Inputs:
+       - audio_signal: [B, S]
+       - audio_length: [B]
+
+     Outputs:
+       - encoder: [B, D, T_enc]
+       - encoder_length: [B]
+       - frame_times: [T_enc]
+     """
+
+     def __init__(
+         self, preprocessor: PreprocessorWrapper, encoder: EncoderWrapper
+     ) -> None:
+         super().__init__()
+         self.preprocessor = preprocessor
+         self.encoder = encoder
+
+     def forward(
+         self, audio_signal: torch.Tensor, audio_length: torch.Tensor
+     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+         mel, mel_length = self.preprocessor(audio_signal, audio_length)
+         encoded, enc_len, frame_times = self.encoder(
+             mel, mel_length.to(dtype=torch.int32)
+         )
+         return encoded, enc_len, frame_times
+
+
+ class JointDecisionWrapper(torch.nn.Module):
+     """Joint + decision head: outputs label id and label prob.
+
+     Unlike Parakeet TDT v3, this model does NOT have duration outputs.
+
+     Inputs:
+       - encoder_outputs: [B, D, T]
+       - decoder_outputs: [B, D, U]
+
+     Returns:
+       - token_id: [B, T, U] int32
+       - token_prob: [B, T, U] float32
+     """
+
+     def __init__(self, joint: JointWrapper, vocab_size: int) -> None:
+         super().__init__()
+         self.joint = joint
+         self.vocab_with_blank = int(vocab_size) + 1
+
+     def forward(self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor):
+         logits = self.joint(encoder_outputs, decoder_outputs)
+
+         # Token selection
+         token_ids = torch.argmax(logits, dim=-1).to(dtype=torch.int32)
+         token_probs_all = torch.softmax(logits, dim=-1)
+         # gather expects int64 (long) indices; cast only for the gather
+         token_prob = torch.gather(
+             token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
+         ).squeeze(-1)
+
+         return token_ids, token_prob
+
+
+ class JointDecisionSingleStep(torch.nn.Module):
+     """Single-step variant for streaming: encoder_step -> token decision.
+
+     Inputs:
+       - encoder_step: [B=1, D, T=1]
+       - decoder_step: [B=1, D, U=1]
+
+     Returns:
+       - token_id: [1, 1, 1] int32
+       - token_prob: [1, 1, 1] float32
+       - top_k_ids: [1, 1, 1, K] int32
+       - top_k_logits: [1, 1, 1, K] float32
+     """
+
+     def __init__(self, joint: JointWrapper, vocab_size: int, top_k: int = 64) -> None:
+         super().__init__()
+         self.joint = joint
+         self.vocab_with_blank = int(vocab_size) + 1
+         self.top_k = int(top_k)
+
+     def forward(self, encoder_step: torch.Tensor, decoder_step: torch.Tensor):
+         # Reuse JointWrapper, which expects [B, D, T] and [B, D, U]
+         logits = self.joint(encoder_step, decoder_step)  # [1, 1, 1, V+blank]
+
+         token_ids = torch.argmax(logits, dim=-1, keepdim=False).to(dtype=torch.int32)
+         token_probs_all = torch.softmax(logits, dim=-1)
+         token_prob = torch.gather(
+             token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
+         ).squeeze(-1)
+
+         # Also expose the top-K candidates for host-side processing
+         topk_logits, topk_ids_long = torch.topk(
+             logits, k=min(self.top_k, logits.shape[-1]), dim=-1
+         )
+         topk_ids = topk_ids_long.to(dtype=torch.int32)
+         return token_ids, token_prob, topk_ids, topk_logits
+
+
+ def _coreml_convert(
+     traced: torch.jit.ScriptModule,
+     inputs,
+     outputs,
+     settings: ExportSettings,
+     compute_units_override: Optional[ct.ComputeUnit] = None,
+     compute_precision: Optional[ct.precision] = None,
+ ) -> ct.models.MLModel:
+     cu = (
+         compute_units_override
+         if compute_units_override is not None
+         else settings.compute_units
+     )
+     kwargs = {
+         "convert_to": "mlprogram",
+         "inputs": inputs,
+         "outputs": outputs,
+         "compute_units": cu,
+     }
+     if settings.deployment_target is not None:
+         kwargs["minimum_deployment_target"] = settings.deployment_target
+
+     # Priority: explicit argument > settings
+     if compute_precision is not None:
+         kwargs["compute_precision"] = compute_precision
+     elif settings.compute_precision is not None:
+         kwargs["compute_precision"] = settings.compute_precision
+
+     # Log after kwargs is fully assembled so the printout matches the call.
+     print("Converting:", traced.__class__.__name__)
+     print("Conversion kwargs:", kwargs)
+     return ct.convert(traced, **kwargs)
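For orientation before the compiled artifacts below: a minimal sketch of how these wrappers could be driven to export the 160ms joint-decision model. It assumes `asr_model` is an already-loaded NeMo RNNT EOU checkpoint, and reads the dimensions (encoder 512, decoder 640, 1026 tokens + blank) off the metadata that follows; this driver code is not part of the commit.

# Hypothetical export driver (sketch; `asr_model` is assumed, not defined here).
import coremltools as ct
import torch
from pathlib import Path

settings = ExportSettings(
    output_dir=Path("160ms"),
    compute_units=ct.ComputeUnit.ALL,
    deployment_target=ct.target.iOS17,
    compute_precision=ct.precision.FLOAT16,
    max_audio_seconds=0.16,
    max_symbol_steps=10,
)

joint = JointWrapper(asr_model.joint).eval()
single_step = JointDecisionSingleStep(joint, vocab_size=1026).eval()
traced = torch.jit.trace(
    single_step, (torch.zeros(1, 512, 1), torch.zeros(1, 640, 1))
)

mlmodel = _coreml_convert(
    traced,
    inputs=[
        ct.TensorType(name="encoder_step", shape=(1, 512, 1)),
        ct.TensorType(name="decoder_step", shape=(1, 640, 1)),
    ],
    outputs=[
        ct.TensorType(name="token_id"),
        ct.TensorType(name="token_prob"),
        ct.TensorType(name="top_k_ids"),
        ct.TensorType(name="top_k_logits"),
    ],
    settings=settings,
)
mlmodel.save(str(settings.output_dir / "joint_decision.mlpackage"))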
160ms/joint_decision.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5bca32ad130dcad6605cc00044c752aa5b45ef57d14c17f2d1a2fa49d6cf55b5
+ size 243
160ms/joint_decision.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22d4abc4625b935ee035b5f8ce7cb28d1041b9b01c12173e287bf4b5f5d99625
+ size 493
160ms/joint_decision.mlmodelc/metadata.json ADDED
@@ -0,0 +1,112 @@
+ [
+   {
+     "metadataOutputVersion" : "3.0",
+     "shortDescription" : "Parakeet EOU single-step joint decision",
+     "outputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1 × 1 × 1)",
+         "shortDescription" : "",
+         "shape" : "[1, 1, 1]",
+         "name" : "token_id",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 1 × 1)",
+         "shortDescription" : "",
+         "shape" : "[1, 1, 1]",
+         "name" : "token_prob",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Int32",
+         "formattedType" : "MultiArray (Int32 1 × 1 × 1 × 64)",
+         "shortDescription" : "",
+         "shape" : "[1, 1, 1, 64]",
+         "name" : "top_k_ids",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 1 × 1 × 64)",
+         "shortDescription" : "",
+         "shape" : "[1, 1, 1, 64]",
+         "name" : "top_k_logits",
+         "type" : "MultiArray"
+       }
+     ],
+     "storagePrecision" : "Float16",
+     "modelParameters" : [
+
+     ],
+     "author" : "Fluid Inference",
+     "specificationVersion" : 8,
+     "mlProgramOperationTypeHistogram" : {
+       "Ios17.reduceArgmax" : 1,
+       "Ios17.squeeze" : 1,
+       "Ios17.cast" : 6,
+       "Ios17.linear" : 3,
+       "Ios17.transpose" : 2,
+       "Ios17.add" : 1,
+       "Ios16.relu" : 1,
+       "Ios16.softmax" : 1,
+       "Ios17.gatherAlongAxis" : 1,
+       "Ios17.topk" : 1,
+       "Ios17.expandDims" : 3
+     },
+     "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
+     "isUpdatable" : "0",
+     "stateSchema" : [
+
+     ],
+     "availability" : {
+       "macOS" : "14.0",
+       "tvOS" : "17.0",
+       "visionOS" : "1.0",
+       "watchOS" : "10.0",
+       "iOS" : "17.0",
+       "macCatalyst" : "17.0"
+     },
+     "modelType" : {
+       "name" : "MLModelType_mlProgram"
+     },
+     "inputSchema" : [
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 512 × 1)",
+         "shortDescription" : "",
+         "shape" : "[1, 512, 1]",
+         "name" : "encoder_step",
+         "type" : "MultiArray"
+       },
+       {
+         "hasShapeFlexibility" : "0",
+         "isOptional" : "0",
+         "dataType" : "Float32",
+         "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
+         "shortDescription" : "",
+         "shape" : "[1, 640, 1]",
+         "name" : "decoder_step",
+         "type" : "MultiArray"
+       }
+     ],
+     "userDefinedMetadata" : {
+       "com.github.apple.coremltools.source_dialect" : "TorchScript",
+       "com.github.apple.coremltools.version" : "8.3.0",
+       "com.github.apple.coremltools.source" : "torch==2.4.0"
+     },
+     "generatedClassName" : "parakeet_eou_joint_decision_single_step",
+     "method" : "predict"
+   }
+ ]
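
Given the fixed single-step shapes in the schema above, a quick smoke test of the packaged model (on macOS, assuming this repo is checked out with the LFS payloads materialized) could look like:

import coremltools as ct
import numpy as np

model = ct.models.MLModel("160ms/joint_decision.mlpackage")
out = model.predict({
    "encoder_step": np.zeros((1, 512, 1), dtype=np.float32),
    "decoder_step": np.zeros((1, 640, 1), dtype=np.float32),
})
print(int(out["token_id"].ravel()[0]), float(out["token_prob"].ravel()[0]))
print(out["top_k_ids"].shape, out["top_k_logits"].shape)  # (1, 1, 1, 64) each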
160ms/joint_decision.mlmodelc/model.mil ADDED
@@ -0,0 +1,57 @@
+ program(1.0)
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
+ {
+     func main<ios17>(tensor<fp32, [1, 640, 1]> decoder_step, tensor<fp32, [1, 512, 1]> encoder_step) {
+         tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+         tensor<string, []> encoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
+         tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+         tensor<string, []> decoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("decoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
+         tensor<fp16, [640, 512]> joint_module_enc_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_weight_to_fp16"), val = tensor<fp16, [640, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+         tensor<fp16, [640]> joint_module_enc_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(655488)))];
+         tensor<fp16, [1, 512, 1]> encoder_step_to_fp16 = cast(dtype = encoder_step_to_fp16_dtype_0, x = encoder_step)[name = tensor<string, []>("cast_8")];
+         tensor<fp16, [1, 1, 512]> input_1_cast_fp16 = transpose(perm = input_1_perm_0, x = encoder_step_to_fp16)[name = tensor<string, []>("transpose_1")];
+         tensor<fp16, [1, 1, 640]> linear_0_cast_fp16 = linear(bias = joint_module_enc_bias_to_fp16, weight = joint_module_enc_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("linear_0_cast_fp16")];
+         tensor<fp16, [640, 640]> joint_module_pred_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_weight_to_fp16"), val = tensor<fp16, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(656832)))];
+         tensor<fp16, [640]> joint_module_pred_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1476096)))];
+         tensor<fp16, [1, 640, 1]> decoder_step_to_fp16 = cast(dtype = decoder_step_to_fp16_dtype_0, x = decoder_step)[name = tensor<string, []>("cast_7")];
+         tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = decoder_step_to_fp16)[name = tensor<string, []>("transpose_0")];
+         tensor<fp16, [1, 1, 640]> linear_1_cast_fp16 = linear(bias = joint_module_pred_bias_to_fp16, weight = joint_module_pred_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("linear_1_cast_fp16")];
+         tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
+         tensor<fp16, [1, 1, 1, 640]> var_23_cast_fp16 = expand_dims(axes = var_23_axes_0, x = linear_0_cast_fp16)[name = tensor<string, []>("op_23_cast_fp16")];
+         tensor<int32, [1]> var_24_axes_0 = const()[name = tensor<string, []>("op_24_axes_0"), val = tensor<int32, [1]>([1])];
+         tensor<fp16, [1, 1, 1, 640]> var_24_cast_fp16 = expand_dims(axes = var_24_axes_0, x = linear_1_cast_fp16)[name = tensor<string, []>("op_24_cast_fp16")];
+         tensor<fp16, [1, 1, 1, 640]> input_5_cast_fp16 = add(x = var_23_cast_fp16, y = var_24_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+         tensor<fp16, [1, 1, 1, 640]> input_7_cast_fp16 = relu(x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+         tensor<fp16, [1027, 640]> joint_module_joint_net_2_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_weight_to_fp16"), val = tensor<fp16, [1027, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1477440)))];
+         tensor<fp16, [1027]> joint_module_joint_net_2_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_bias_to_fp16"), val = tensor<fp16, [1027]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2792064)))];
+         tensor<fp16, [1, 1, 1, 1027]> linear_2_cast_fp16 = linear(bias = joint_module_joint_net_2_bias_to_fp16, weight = joint_module_joint_net_2_weight_to_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("linear_2_cast_fp16")];
+         tensor<int32, []> var_38_axis_0 = const()[name = tensor<string, []>("op_38_axis_0"), val = tensor<int32, []>(-1)];
+         tensor<bool, []> var_38_keep_dims_0 = const()[name = tensor<string, []>("op_38_keep_dims_0"), val = tensor<bool, []>(false)];
+         tensor<string, []> var_38_output_dtype_0 = const()[name = tensor<string, []>("op_38_output_dtype_0"), val = tensor<string, []>("int32")];
+         tensor<int32, [1, 1, 1]> token_id = reduce_argmax(axis = var_38_axis_0, keep_dims = var_38_keep_dims_0, output_dtype = var_38_output_dtype_0, x = linear_2_cast_fp16)[name = tensor<string, []>("op_38_cast_fp16")];
+         tensor<int32, []> var_44 = const()[name = tensor<string, []>("op_44"), val = tensor<int32, []>(-1)];
+         tensor<fp16, [1, 1, 1, 1027]> token_probs_all_cast_fp16 = softmax(axis = var_44, x = linear_2_cast_fp16)[name = tensor<string, []>("token_probs_all_cast_fp16")];
+         tensor<int32, [1]> var_53_axes_0 = const()[name = tensor<string, []>("op_53_axes_0"), val = tensor<int32, [1]>([-1])];
+         tensor<int32, [1, 1, 1, 1]> var_53 = expand_dims(axes = var_53_axes_0, x = token_id)[name = tensor<string, []>("op_53")];
+         tensor<int32, []> var_54 = const()[name = tensor<string, []>("op_54"), val = tensor<int32, []>(-1)];
+         tensor<bool, []> var_56_validate_indices_0 = const()[name = tensor<string, []>("op_56_validate_indices_0"), val = tensor<bool, []>(false)];
+         tensor<string, []> var_53_to_int16_dtype_0 = const()[name = tensor<string, []>("op_53_to_int16_dtype_0"), val = tensor<string, []>("int16")];
+         tensor<int16, [1, 1, 1, 1]> var_53_to_int16 = cast(dtype = var_53_to_int16_dtype_0, x = var_53)[name = tensor<string, []>("cast_6")];
+         tensor<fp16, [1, 1, 1, 1]> var_56_cast_fp16_cast_int16 = gather_along_axis(axis = var_54, indices = var_53_to_int16, validate_indices = var_56_validate_indices_0, x = token_probs_all_cast_fp16)[name = tensor<string, []>("op_56_cast_fp16_cast_int16")];
+         tensor<int32, [1]> var_58_axes_0 = const()[name = tensor<string, []>("op_58_axes_0"), val = tensor<int32, [1]>([-1])];
+         tensor<fp16, [1, 1, 1]> var_58_cast_fp16 = squeeze(axes = var_58_axes_0, x = var_56_cast_fp16_cast_int16)[name = tensor<string, []>("op_58_cast_fp16")];
+         tensor<string, []> var_58_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_58_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
+         tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(64)];
+         tensor<int32, []> var_63_axis_0 = const()[name = tensor<string, []>("op_63_axis_0"), val = tensor<int32, []>(-1)];
+         tensor<bool, []> var_63_ascending_0 = const()[name = tensor<string, []>("op_63_ascending_0"), val = tensor<bool, []>(false)];
+         tensor<bool, []> var_63_sort_0 = const()[name = tensor<string, []>("op_63_sort_0"), val = tensor<bool, []>(true)];
+         tensor<bool, []> var_63_return_indices_0 = const()[name = tensor<string, []>("op_63_return_indices_0"), val = tensor<bool, []>(true)];
+         tensor<string, []> var_63_cast_fp16_cast_int16_output_indices_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_cast_int16_output_indices_dtype_0"), val = tensor<string, []>("uint16")];
+         tensor<fp16, [1, 1, 1, 64]> var_63_cast_fp16_cast_int16_0, tensor<uint16, [1, 1, 1, 64]> var_63_cast_fp16_cast_int16_1 = topk(ascending = var_63_ascending_0, axis = var_63_axis_0, k = var_59, output_indices_dtype = var_63_cast_fp16_cast_int16_output_indices_dtype_0, return_indices = var_63_return_indices_0, sort = var_63_sort_0, x = linear_2_cast_fp16)[name = tensor<string, []>("op_63_cast_fp16_cast_int16")];
+         tensor<string, []> var_63_cast_fp16_cast_int16_1_to_int32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_cast_int16_1_to_int32_dtype_0"), val = tensor<string, []>("int32")];
+         tensor<string, []> var_63_cast_fp16_0_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_0_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
+         tensor<fp32, [1, 1, 1, 64]> top_k_logits = cast(dtype = var_63_cast_fp16_0_to_fp32_dtype_0, x = var_63_cast_fp16_cast_int16_0)[name = tensor<string, []>("cast_3")];
+         tensor<int32, [1, 1, 1, 64]> top_k_ids = cast(dtype = var_63_cast_fp16_cast_int16_1_to_int32_dtype_0, x = var_63_cast_fp16_cast_int16_1)[name = tensor<string, []>("cast_4")];
+         tensor<fp32, [1, 1, 1]> token_prob = cast(dtype = var_58_cast_fp16_to_fp32_dtype_0, x = var_58_cast_fp16)[name = tensor<string, []>("cast_5")];
+     } -> (token_id, token_prob, top_k_ids, top_k_logits);
+ }
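
Read end-to-end, the program is exactly the traced decision head: two fp16 linear projections (512→640 and 640→640), a broadcast add and ReLU, an output linear to 1027 logits (1026 tokens + blank), then argmax, softmax + gather, and top-64. A NumPy restatement for readability only; the weight arguments here are placeholders, not the blob layout:

import numpy as np

def joint_decision_step(enc_step, dec_step, W_enc, b_enc, W_pred, b_pred, W_out, b_out, k=64):
    """Mirror of the MIL graph above: project, add, ReLU, project, decide."""
    e = np.transpose(enc_step, (0, 2, 1)) @ W_enc.T + b_enc    # [1, 1, 640]
    d = np.transpose(dec_step, (0, 2, 1)) @ W_pred.T + b_pred  # [1, 1, 640]
    x = np.maximum(e[:, :, None, :] + d[:, None, :, :], 0.0)   # [1, 1, 1, 640] after ReLU
    logits = x @ W_out.T + b_out                               # [1, 1, 1, 1027]
    token_id = logits.argmax(-1)                               # [1, 1, 1]
    p = np.exp(logits - logits.max(-1, keepdims=True))
    p /= p.sum(-1, keepdims=True)                              # softmax over vocab + blank
    token_prob = np.take_along_axis(p, token_id[..., None], -1).squeeze(-1)
    top_ids = np.argsort(-logits, -1)[..., :k]                 # descending top-64
    top_logits = np.take_along_axis(logits, top_ids, -1)
    return token_id.astype(np.int32), token_prob, top_ids.astype(np.int32), top_logits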
160ms/joint_decision.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7039b2010a269153f5a96edf28637f921a86ef8822f248f2d6712f7a6bce84b4
+ size 2794182
160ms/joint_decision.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:25d4d7be6eeb60c7de1d3a1278a5a4700cbe34017e1a8c1cab33204ddb2e4d5e
+ size 8701
160ms/joint_decision.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7039b2010a269153f5a96edf28637f921a86ef8822f248f2d6712f7a6bce84b4
+ size 2794182
160ms/joint_decision.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "fileFormatVersion": "1.0.0",
+   "itemInfoEntries": {
+     "634E266B-4447-41D3-879E-F3611888F54B": {
+       "author": "com.apple.CoreML",
+       "description": "CoreML Model Weights",
+       "name": "weights",
+       "path": "com.apple.CoreML/weights"
+     },
+     "C7F40527-180B-45CD-BC12-4F054F2E5D9A": {
+       "author": "com.apple.CoreML",
+       "description": "CoreML Model Specification",
+       "name": "model.mlmodel",
+       "path": "com.apple.CoreML/model.mlmodel"
+     }
+   },
+   "rootModelIdentifier": "C7F40527-180B-45CD-BC12-4F054F2E5D9A"
+ }
160ms/parakeet_eou_preprocessor.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4ada8b0b99ac1d2ba7acbffacfbbf1a06cb69d30e9410d237ee0aa4c2b0ad63
+ size 243