Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

README.md +0 -83
Sortformer.mlmodelc/analytics/coremldata.bin +1 -1
Sortformer.mlmodelc/coremldata.bin +2 -2
Sortformer.mlmodelc/metadata.json +0 -176
Sortformer.mlmodelc/model0/coremldata.bin +2 -2
Sortformer.mlmodelc/model0/model.mil +6 -6
Sortformer.mlmodelc/model1/coremldata.bin +2 -2
Sortformer.mlmodelc/model1/model.mil +6 -6

README.md DELETED Viewed

@@ -1,83 +0,0 @@
----
-license: cc-by-4.0
-tags:
-  - speaker-diarization
-  - coreml
-  - apple-silicon
-  - neural-engine
-  - sortformer
-datasets:
-  - voxconverse
-language:
-  - en
-pipeline_tag: audio-classification
----
-# Sortformer Diarization (CoreML)
-CoreML-converted [NVIDIA Sortformer](https://arxiv.org/abs/2409.06656) model for end-to-end speaker diarization on Apple Silicon.
-Runs on Neural Engine at ~120x real-time.
-## Model Details
-- **Architecture**: Sortformer (Sort Loss + FastConformer-based)
-- **Task**: Speaker diarization (up to 4 speakers)
-- **Input**: 128-dim log-mel features
-- **Output**: Per-frame speaker activity probabilities
-- **Format**: CoreML `.mlmodelc` (compiled, ready to load)
-- **Size**: ~240 MB
-## Streaming Configuration
-| Parameter | Value |
-|-----------|-------|
-| Sample rate | 16 kHz |
-| Mel bins | 128 |
-| n_fft | 400 |
-| Hop length | 160 |
-| Chunk length | 6 frames (0.48 s at 80 ms per frame) |
-| Left context | 1 frame |
-| Right context | 7 frames |
-| Subsampling factor | 8 |
-| Max speakers | 4 |
-## Input/Output Shapes
-| Tensor | Shape | Description |
-|--------|-------|-------------|
-| `chunk` | `[1, 112, 128]` | Mel features for current chunk |
-| `chunk_lengths` | `[1]` | Valid frames in chunk |
-| `spkcache` | `[1, 188, 512]` | Speaker cache state |
-| `spkcache_lengths` | `[1]` | Valid entries in speaker cache |
-| `fifo` | `[1, 40, 512]` | FIFO buffer state |
-| `fifo_lengths` | `[1]` | Valid entries in FIFO |
-| **Output** | | |
-| `speaker_preds` | `[1, 242, 4]` | Speaker activity probabilities for the speaker cache, FIFO queue, and chunk |
-| `chunk_pre_encoder_embs` | `[1, 14, 512]` | Pre-encoder embeddings for the new chunk |
-| `chunk_pre_encoder_lengths` | `[1]` | Number of frames for the new chunk |
-## Usage
-Used by [speech-swift](https://github.com/soniqo/speech-swift) for speaker diarization:
-```bash
-audio diarize meeting.wav --engine sortformer
-```
-```swift
-let diarizer = try await SortformerDiarizer.fromPretrained()
-let result = try diarizer.diarize(audio: samples, sampleRate: 16000)
-for segment in result.segments {
-    print("Speaker \(segment.speakerId): \(segment.startTime)s - \(segment.endTime)s")
-}
-```
-## Original Model
-Converted from [NVIDIA NeMo Sortformer](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/speaker_diarization/configs.html) via coremltools.
-## License
-CC-BY-4.0

Sortformer.mlmodelc/analytics/coremldata.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b1336dfc84d1084140347209f4c2cd1d972ba998d418734a24c0a437dbbcab75
 size 202

 version https://git-lfs.github.com/spec/v1
+oid sha256:e1a18b8f51199d7598c0728aeb6ef5ca344aa2e52880033fac32a795e5b76539
 size 202

Sortformer.mlmodelc/coremldata.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1d69415aa509e8c407dfa64136402232dd02610c41d8c5729446830639373376
-size 1078

 version https://git-lfs.github.com/spec/v1
+oid sha256:29f0563d086d7ea0b8f0461d6e1330d475971fdade05809a52207abf739d9992
+size 1093

Sortformer.mlmodelc/metadata.json DELETED Viewed

@@ -1,176 +0,0 @@
-[
-  {
-    "metadataOutputVersion" : "3.0",
-    "shortDescription" : "CoreML port of Nvidia's Streaming Sortformer diarization model",
-    "outputSchema" : [
-      {
-        "hasShapeFlexibility" : "0",
-        "isOptional" : "0",
-        "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 242 × 4)",
-        "shortDescription" : "Combined speaker probabilities for the speaker cache, FIFO queue, and chunk",
-        "shape" : "[1, 242, 4]",
-        "name" : "speaker_preds",
-        "type" : "MultiArray"
-      },
-      {
-        "hasShapeFlexibility" : "0",
-        "isOptional" : "0",
-        "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 14 × 512)",
-        "shortDescription" : "Speaker embeddings for the new chunk",
-        "shape" : "[1, 14, 512]",
-        "name" : "chunk_pre_encoder_embs",
-        "type" : "MultiArray"
-      },
-      {
-        "hasShapeFlexibility" : "0",
-        "isOptional" : "0",
-        "dataType" : "Int32",
-        "formattedType" : "MultiArray (Int32 1)",
-        "shortDescription" : "Number of frames for the new chunk",
-        "shape" : "[1]",
-        "name" : "chunk_pre_encoder_lengths",
-        "type" : "MultiArray"
-      }
-    ],
-    "storagePrecision" : "Mixed (Float16, Float32)",
-    "modelParameters" : [
-    ],
-    "author" : "Benjamin Lee",
-    "specificationVersion" : 7,
-    "license" : "MIT",
-    "mlProgramOperationTypeHistogram" : {
-      "Ios16.floorDiv" : 3,
-      "Transpose" : 193,
-      "Identity" : 2,
-      "Ios16.softmax" : 35,
-      "Ios16.gatherAlongAxis" : 1,
-      "Split" : 17,
-      "Ios16.linear" : 248,
-      "Ios16.add" : 186,
-      "Concat" : 1,
-      "Ios16.greaterEqual" : 2,
-      "Tile" : 9,
-      "Select" : 51,
-      "Ios16.minimum" : 1,
-      "Ios16.sigmoid" : 18,
-      "Ios16.logicalAnd" : 2,
-      "Pad" : 34,
-      "ExpandDims" : 25,
-      "Ios16.sub" : 6,
-      "Ios16.cast" : 16,
-      "Ios16.less" : 7,
-      "Ios16.conv" : 56,
-      "Ios16.relu" : 23,
-      "Ios16.reshape" : 175,
-      "Ios16.matmul" : 87,
-      "Ios16.maximum" : 1,
-      "Ios16.layerNorm" : 121,
-      "SliceByIndex" : 34,
-      "Ios16.silu" : 51,
-      "Ios16.mul" : 119,
-      "Ios16.logicalNot" : 2
-    },
-    "computePrecision" : "Mixed (Float16, Float32, Int32)",
-    "stateSchema" : [
-    ],
-    "isUpdatable" : "0",
-    "availability" : {
-      "macOS" : "13.0",
-      "tvOS" : "16.0",
-      "visionOS" : "1.0",
-      "watchOS" : "9.0",
-      "iOS" : "16.0",
-      "macCatalyst" : "16.0"
-    },
-    "modelType" : {
-      "name" : "MLModelType_pipeline",
-      "structure" : [
-        {
-          "name" : "MLModelType_mlProgram"
-        },
-        {
-          "name" : "MLModelType_mlProgram"
-        }
-      ]
-    },
-    "inputSchema" : [
-      {
-        "hasShapeFlexibility" : "0",
-        "isOptional" : "0",
-        "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 112 × 128)",
-        "shortDescription" : "Mel spectrogram features for the new chunk",
-        "shape" : "[1, 112, 128]",
-        "name" : "chunk",
-        "type" : "MultiArray"
-      },
-      {
-        "hasShapeFlexibility" : "0",
-        "isOptional" : "0",
-        "dataType" : "Int32",
-        "formattedType" : "MultiArray (Int32 1)",
-        "shortDescription" : "Length of the new chunk",
-        "shape" : "[1]",
-        "name" : "chunk_lengths",
-        "type" : "MultiArray"
-      },
-      {
-        "hasShapeFlexibility" : "0",
-        "isOptional" : "0",
-        "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 188 × 512)",
-        "shortDescription" : "Order of Arrival Speaker Cache",
-        "shape" : "[1, 188, 512]",
-        "name" : "spkcache",
-        "type" : "MultiArray"
-      },
-      {
-        "hasShapeFlexibility" : "0",
-        "isOptional" : "0",
-        "dataType" : "Int32",
-        "formattedType" : "MultiArray (Int32 1)",
-        "shortDescription" : "Length of the speaker cache (in frames)",
-        "shape" : "[1]",
-        "name" : "spkcache_lengths",
-        "type" : "MultiArray"
-      },
-      {
-        "hasShapeFlexibility" : "0",
-        "isOptional" : "0",
-        "dataType" : "Float32",
-        "formattedType" : "MultiArray (Float32 1 × 40 × 512)",
-        "shortDescription" : "First-In-First-Out speech queue",
-        "shape" : "[1, 40, 512]",
-        "name" : "fifo",
-        "type" : "MultiArray"
-      },
-      {
-        "hasShapeFlexibility" : "0",
-        "isOptional" : "0",
-        "dataType" : "Int32",
-        "formattedType" : "MultiArray (Int32 1)",
-        "shortDescription" : "Length of the FIFO queue (in frames)",
-        "shape" : "[1]",
-        "name" : "fifo_lengths",
-        "type" : "MultiArray"
-      }
-    ],
-    "userDefinedMetadata" : {
-      "frame_duration" : "0.08",
-      "spkcache_update_period" : "31",
-      "chunk_len" : "6",
-      "subsampling_factor" : "8",
-      "chunk_right_context" : "7",
-      "mel_feature_frames" : "48",
-      "fifo_len" : "40",
-      "chunk_left_context" : "1",
-      "spkcache_len" : "188"
-    },
-    "generatedClassName" : "Sortformer",
-    "method" : "predict"
-  }
-]

Sortformer.mlmodelc/model0/coremldata.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:94cc859be2d5514e057506ea9bed05ac455fb72ebf05a52e4aeddf5dea80ceb1
-size 632

 version https://git-lfs.github.com/spec/v1
+oid sha256:1221dd5d8951643b2226e5d977d6400316d61c752cfd4b7c673646c29be386e4
+size 640

Sortformer.mlmodelc/model0/model.mil CHANGED Viewed

@@ -1,5 +1,5 @@
 program(1.0)
-[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
     func main<ios16>(tensor<fp32, [1, 112, 128]> chunk, tensor<int32, [1]> chunk_lengths, tensor<fp32, [1, 40, 512]> fifo, tensor<int32, [1]> fifo_lengths, tensor<fp32, [1, 188, 512]> spkcache, tensor<int32, [1]> spkcache_lengths) {
             tensor<fp32, [256]> model_encoder_pre_encode_conv_0_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/0-weight.bin"), offset = tensor<uint64, []>(64)))];
@@ -154,16 +154,16 @@ program(1.0)
             tensor<int32, [3]> var_229 = const()[name = tensor<string, []>("op_229"), val = tensor<int32, [3]>([1, 14, -1])];
             tensor<fp32, [1, 14, 256, 16]> var_228 = transpose(perm = var_228_perm_0, x = x)[name = tensor<string, []>("transpose_0")];
             tensor<fp32, [1, 14, 4096]> input = reshape(shape = var_229, x = var_228)[name = tensor<string, []>("input")];
-            tensor<fp32, [1, 14, 512]> chunk_pre_encoder_embs = linear(bias = model_encoder_pre_encode_out_bias, weight = model_encoder_pre_encode_out_weight, x = input)[name = tensor<string, []>("linear_0")];
             tensor<string, []> var_241_dtype_0 = const()[name = tensor<string, []>("op_241_dtype_0"), val = tensor<string, []>("int32")];
             tensor<int32, [1]> size0 = const()[name = tensor<string, []>("size0"), val = tensor<int32, [1]>([188])];
             tensor<int32, [1]> size1 = const()[name = tensor<string, []>("size1"), val = tensor<int32, [1]>([40])];
             tensor<int32, []> var_264 = const()[name = tensor<string, []>("op_264"), val = tensor<int32, []>(1)];
             tensor<bool, []> full_concat_interleave_0 = const()[name = tensor<string, []>("full_concat_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp32, [1, 242, 512]> full_concat = concat(axis = var_264, interleave = full_concat_interleave_0, values = (spkcache, fifo, chunk_pre_encoder_embs))[name = tensor<string, []>("full_concat")];
             tensor<int32, [1]> var_273 = add(x = spkcache_lengths, y = fifo_lengths)[name = tensor<string, []>("op_273")];
-            tensor<int32, [1]> chunk_pre_encoder_lengths = cast(dtype = var_241_dtype_0, x = current_lengths)[name = tensor<string, []>("cast_4")];
-            tensor<int32, [1]> pre_encoder_lengths = add(x = var_273, y = chunk_pre_encoder_lengths)[name = tensor<string, []>("total_length")];
             tensor<int32, [242]> out_pos = const()[name = tensor<string, []>("out_pos"), val = tensor<int32, [242]>([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241])];
             tensor<bool, [242]> var_284 = greater_equal(x = out_pos, y = spkcache_lengths)[name = tensor<string, []>("op_284")];
             tensor<string, []> in_seg1_or_2_dtype_0 = const()[name = tensor<string, []>("in_seg1_or_2_dtype_0"), val = tensor<string, []>("int32")];
@@ -197,5 +197,5 @@ program(1.0)
             tensor<int32, [1]> var_332_axes_0 = const()[name = tensor<string, []>("op_332_axes_0"), val = tensor<int32, [1]>([-1])];
             tensor<fp32, [1, 242, 1]> var_332 = expand_dims(axes = var_332_axes_0, x = var_330)[name = tensor<string, []>("op_332")];
             tensor<fp32, [1, 242, 512]> pre_encoder_embs = mul(x = output, y = var_332)[name = tensor<string, []>("op_333")];
-        } -> (pre_encoder_embs, pre_encoder_lengths, chunk_pre_encoder_embs, chunk_pre_encoder_lengths);
 }

 program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3405.2.1"}, {"coremlc-version", "3405.2.1"}, {"coremltools-component-torch", "2.9.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
     func main<ios16>(tensor<fp32, [1, 112, 128]> chunk, tensor<int32, [1]> chunk_lengths, tensor<fp32, [1, 40, 512]> fifo, tensor<int32, [1]> fifo_lengths, tensor<fp32, [1, 188, 512]> spkcache, tensor<int32, [1]> spkcache_lengths) {
             tensor<fp32, [256]> model_encoder_pre_encode_conv_0_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/0-weight.bin"), offset = tensor<uint64, []>(64)))];
             tensor<int32, [3]> var_229 = const()[name = tensor<string, []>("op_229"), val = tensor<int32, [3]>([1, 14, -1])];
             tensor<fp32, [1, 14, 256, 16]> var_228 = transpose(perm = var_228_perm_0, x = x)[name = tensor<string, []>("transpose_0")];
             tensor<fp32, [1, 14, 4096]> input = reshape(shape = var_229, x = var_228)[name = tensor<string, []>("input")];
+            tensor<fp32, [1, 14, 512]> chunk_pre_encoder_embs_out = linear(bias = model_encoder_pre_encode_out_bias, weight = model_encoder_pre_encode_out_weight, x = input)[name = tensor<string, []>("linear_0")];
             tensor<string, []> var_241_dtype_0 = const()[name = tensor<string, []>("op_241_dtype_0"), val = tensor<string, []>("int32")];
             tensor<int32, [1]> size0 = const()[name = tensor<string, []>("size0"), val = tensor<int32, [1]>([188])];
             tensor<int32, [1]> size1 = const()[name = tensor<string, []>("size1"), val = tensor<int32, [1]>([40])];
             tensor<int32, []> var_264 = const()[name = tensor<string, []>("op_264"), val = tensor<int32, []>(1)];
             tensor<bool, []> full_concat_interleave_0 = const()[name = tensor<string, []>("full_concat_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp32, [1, 242, 512]> full_concat = concat(axis = var_264, interleave = full_concat_interleave_0, values = (spkcache, fifo, chunk_pre_encoder_embs_out))[name = tensor<string, []>("full_concat")];
             tensor<int32, [1]> var_273 = add(x = spkcache_lengths, y = fifo_lengths)[name = tensor<string, []>("op_273")];
+            tensor<int32, [1]> chunk_pre_encoder_lengths_out = cast(dtype = var_241_dtype_0, x = current_lengths)[name = tensor<string, []>("cast_4")];
+            tensor<int32, [1]> pre_encoder_lengths = add(x = var_273, y = chunk_pre_encoder_lengths_out)[name = tensor<string, []>("total_length")];
             tensor<int32, [242]> out_pos = const()[name = tensor<string, []>("out_pos"), val = tensor<int32, [242]>([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241])];
             tensor<bool, [242]> var_284 = greater_equal(x = out_pos, y = spkcache_lengths)[name = tensor<string, []>("op_284")];
             tensor<string, []> in_seg1_or_2_dtype_0 = const()[name = tensor<string, []>("in_seg1_or_2_dtype_0"), val = tensor<string, []>("int32")];
             tensor<int32, [1]> var_332_axes_0 = const()[name = tensor<string, []>("op_332_axes_0"), val = tensor<int32, [1]>([-1])];
             tensor<fp32, [1, 242, 1]> var_332 = expand_dims(axes = var_332_axes_0, x = var_330)[name = tensor<string, []>("op_332")];
             tensor<fp32, [1, 242, 512]> pre_encoder_embs = mul(x = output, y = var_332)[name = tensor<string, []>("op_333")];
+        } -> (pre_encoder_embs, pre_encoder_lengths, chunk_pre_encoder_embs_out, chunk_pre_encoder_lengths_out);
 }

Sortformer.mlmodelc/model1/coremldata.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e613306cf96ba390ac0ebb449947c8b4dc344a6a9e3be69b8220ad1caa74056b
-size 585

 version https://git-lfs.github.com/spec/v1
+oid sha256:efacb38511f55c820a24313fb191341db87f05466ec6dd855203d8f6c126aba3
+size 605

Sortformer.mlmodelc/model1/model.mil CHANGED Viewed

@@ -1,7 +1,7 @@
 program(1.0)
-[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
-    func main<ios16>(tensor<fp32, [1, 14, 512]> chunk_pre_encoder_embs, tensor<int32, [1]> chunk_pre_encoder_lengths, tensor<fp32, [1, 242, 512]> pre_encoder_embs, tensor<int32, [1]> pre_encoder_lengths) {
             tensor<int32, []> var_30 = const()[name = tensor<string, []>("op_30"), val = tensor<int32, []>(-1)];
             tensor<string, []> pre_encoder_embs_to_fp16_dtype_0 = const()[name = tensor<string, []>("pre_encoder_embs_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
             tensor<fp16, []> var_77_to_fp16 = const()[name = tensor<string, []>("op_77_to_fp16"), val = tensor<fp16, []>(0x1.6ap+4)];
@@ -3587,8 +3587,8 @@ program(1.0)
             tensor<fp16, [1, 242, 1]> var_4765_to_fp16 = cast(dtype = var_4765_promoted_to_fp16_dtype_0, x = var_4765)[name = tensor<string, []>("cast_302")];
             tensor<fp16, [1, 242, 4]> var_4766_cast_fp16 = mul(x = _preds_cast_fp16, y = var_4765_to_fp16)[name = tensor<string, []>("op_4766_cast_fp16")];
             tensor<string, []> var_4766_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_4766_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
-            tensor<fp32, [1, 242, 4]> speaker_preds = cast(dtype = var_4766_cast_fp16_to_fp32_dtype_0, x = var_4766_cast_fp16)[name = tensor<string, []>("cast_301")];
-            tensor<fp32, [1, 14, 512]> chunk_pre_encoder_embs_tmp = identity(x = chunk_pre_encoder_embs)[name = tensor<string, []>("chunk_pre_encoder_embs_tmp")];
-            tensor<int32, [1]> chunk_pre_encoder_lengths_tmp = identity(x = chunk_pre_encoder_lengths)[name = tensor<string, []>("chunk_pre_encoder_lengths_tmp")];
-        } -> (speaker_preds, chunk_pre_encoder_embs, chunk_pre_encoder_lengths);
 }

 program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3405.2.1"}, {"coremlc-version", "3405.2.1"}, {"coremltools-component-torch", "2.9.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
 {
+    func main<ios16>(tensor<fp32, [1, 14, 512]> chunk_pre_encoder_embs_out, tensor<int32, [1]> chunk_pre_encoder_lengths_out, tensor<fp32, [1, 242, 512]> pre_encoder_embs, tensor<int32, [1]> pre_encoder_lengths) {
             tensor<int32, []> var_30 = const()[name = tensor<string, []>("op_30"), val = tensor<int32, []>(-1)];
             tensor<string, []> pre_encoder_embs_to_fp16_dtype_0 = const()[name = tensor<string, []>("pre_encoder_embs_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
             tensor<fp16, []> var_77_to_fp16 = const()[name = tensor<string, []>("op_77_to_fp16"), val = tensor<fp16, []>(0x1.6ap+4)];
             tensor<fp16, [1, 242, 1]> var_4765_to_fp16 = cast(dtype = var_4765_promoted_to_fp16_dtype_0, x = var_4765)[name = tensor<string, []>("cast_302")];
             tensor<fp16, [1, 242, 4]> var_4766_cast_fp16 = mul(x = _preds_cast_fp16, y = var_4765_to_fp16)[name = tensor<string, []>("op_4766_cast_fp16")];
             tensor<string, []> var_4766_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_4766_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
+            tensor<fp32, [1, 242, 4]> speaker_preds_out = cast(dtype = var_4766_cast_fp16_to_fp32_dtype_0, x = var_4766_cast_fp16)[name = tensor<string, []>("cast_301")];
+            tensor<fp32, [1, 14, 512]> chunk_pre_encoder_embs_tmp = identity(x = chunk_pre_encoder_embs_out)[name = tensor<string, []>("chunk_pre_encoder_embs_tmp")];
+            tensor<int32, [1]> chunk_pre_encoder_lengths_tmp = identity(x = chunk_pre_encoder_lengths_out)[name = tensor<string, []>("chunk_pre_encoder_lengths_tmp")];
+        } -> (speaker_preds_out, chunk_pre_encoder_embs_out, chunk_pre_encoder_lengths_out);
 }