aufklarer committed on
Commit
070f513
·
verified ·
1 Parent(s): c2084e9

Upload folder using huggingface_hub

Browse files
README.md DELETED
@@ -1,83 +0,0 @@
1
- ---
2
- license: cc-by-4.0
3
- tags:
4
- - speaker-diarization
5
- - coreml
6
- - apple-silicon
7
- - neural-engine
8
- - sortformer
9
- datasets:
10
- - voxconverse
11
- language:
12
- - en
13
- pipeline_tag: audio-classification
14
- ---
15
-
16
- # Sortformer Diarization (CoreML)
17
-
18
- CoreML-converted [NVIDIA Sortformer](https://arxiv.org/abs/2409.06656) model for end-to-end speaker diarization on Apple Silicon.
19
-
20
- Runs on Neural Engine at ~120x real-time.
21
-
22
- ## Model Details
23
-
24
- - **Architecture**: Sortformer (Sort Loss + FastConformer-based)
25
- - **Task**: Speaker diarization (up to 4 speakers)
26
- - **Input**: 128-dim log-mel features
27
- - **Output**: Per-frame speaker activity probabilities
28
- - **Format**: CoreML `.mlmodelc` (compiled, ready to load)
29
- - **Size**: ~240 MB
30
-
31
- ## Streaming Configuration
32
-
33
- | Parameter | Value |
34
- |-----------|-------|
35
- | Sample rate | 16 kHz |
36
- | Mel bins | 128 |
37
- | n_fft | 400 |
38
- | Hop length | 160 |
39
- | Chunk length | 6 frames (0.48 s at 80 ms per frame) |
40
- | Left context | 1 frame |
41
- | Right context | 7 frames |
42
- | Subsampling factor | 8 |
43
- | Max speakers | 4 |
44
-
45
- ## Input/Output Shapes
46
-
47
- | Tensor | Shape | Description |
48
- |--------|-------|-------------|
49
- | `chunk` | `[1, 112, 128]` | Mel features for current chunk |
50
- | `chunk_lengths` | `[1]` | Valid frames in chunk |
51
- | `spkcache` | `[1, 188, 512]` | Speaker cache state |
52
- | `spkcache_lengths` | `[1]` | Valid entries in speaker cache |
53
- | `fifo` | `[1, 40, 512]` | FIFO buffer state |
54
- | `fifo_lengths` | `[1]` | Valid entries in FIFO |
55
- | **Output** | | |
56
- | `speaker_preds` | `[1, 242, 4]` | Speaker activity probabilities |
57
- | `chunk_pre_encoder_embs` | `[1, 14, 512]` | Updated embeddings |
58
- | `out_spkcache_lengths` | `[1]` | Updated speaker cache length |
59
- | `out_fifo_lengths` | `[1]` | Updated FIFO length |
60
-
61
- ## Usage
62
-
63
- Used by [speech-swift](https://github.com/soniqo/speech-swift) for speaker diarization:
64
-
65
- ```bash
66
- audio diarize meeting.wav --engine sortformer
67
- ```
68
-
69
- ```swift
70
- let diarizer = try await SortformerDiarizer.fromPretrained()
71
- let result = try diarizer.diarize(audio: samples, sampleRate: 16000)
72
- for segment in result.segments {
73
- print("Speaker \(segment.speakerId): \(segment.startTime)s - \(segment.endTime)s")
74
- }
75
- ```
76
-
77
- ## Original Model
78
-
79
- Converted from [NVIDIA NeMo Sortformer](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/speaker_diarization/configs.html) via coremltools.
80
-
81
- ## License
82
-
83
- CC-BY-4.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Sortformer.mlmodelc/analytics/coremldata.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1336dfc84d1084140347209f4c2cd1d972ba998d418734a24c0a437dbbcab75
3
  size 202
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1a18b8f51199d7598c0728aeb6ef5ca344aa2e52880033fac32a795e5b76539
3
  size 202
Sortformer.mlmodelc/coremldata.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d69415aa509e8c407dfa64136402232dd02610c41d8c5729446830639373376
3
- size 1078
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29f0563d086d7ea0b8f0461d6e1330d475971fdade05809a52207abf739d9992
3
+ size 1093
Sortformer.mlmodelc/metadata.json DELETED
@@ -1,176 +0,0 @@
1
- [
2
- {
3
- "metadataOutputVersion" : "3.0",
4
- "shortDescription" : "CoreML port of Nvidia's Streaming Sortformer diarization model",
5
- "outputSchema" : [
6
- {
7
- "hasShapeFlexibility" : "0",
8
- "isOptional" : "0",
9
- "dataType" : "Float32",
10
- "formattedType" : "MultiArray (Float32 1 × 242 × 4)",
11
- "shortDescription" : "Combined speaker probabilities for the speaker cache, FIFO queue, and chunk",
12
- "shape" : "[1, 242, 4]",
13
- "name" : "speaker_preds",
14
- "type" : "MultiArray"
15
- },
16
- {
17
- "hasShapeFlexibility" : "0",
18
- "isOptional" : "0",
19
- "dataType" : "Float32",
20
- "formattedType" : "MultiArray (Float32 1 × 14 × 512)",
21
- "shortDescription" : "Speaker embeddings for the new chunk",
22
- "shape" : "[1, 14, 512]",
23
- "name" : "chunk_pre_encoder_embs",
24
- "type" : "MultiArray"
25
- },
26
- {
27
- "hasShapeFlexibility" : "0",
28
- "isOptional" : "0",
29
- "dataType" : "Int32",
30
- "formattedType" : "MultiArray (Int32 1)",
31
- "shortDescription" : "Number of frames for the new chunk",
32
- "shape" : "[1]",
33
- "name" : "chunk_pre_encoder_lengths",
34
- "type" : "MultiArray"
35
- }
36
- ],
37
- "storagePrecision" : "Mixed (Float16, Float32)",
38
- "modelParameters" : [
39
-
40
- ],
41
- "author" : "Benjamin Lee",
42
- "specificationVersion" : 7,
43
- "license" : "MIT",
44
- "mlProgramOperationTypeHistogram" : {
45
- "Ios16.floorDiv" : 3,
46
- "Transpose" : 193,
47
- "Identity" : 2,
48
- "Ios16.softmax" : 35,
49
- "Ios16.gatherAlongAxis" : 1,
50
- "Split" : 17,
51
- "Ios16.linear" : 248,
52
- "Ios16.add" : 186,
53
- "Concat" : 1,
54
- "Ios16.greaterEqual" : 2,
55
- "Tile" : 9,
56
- "Select" : 51,
57
- "Ios16.minimum" : 1,
58
- "Ios16.sigmoid" : 18,
59
- "Ios16.logicalAnd" : 2,
60
- "Pad" : 34,
61
- "ExpandDims" : 25,
62
- "Ios16.sub" : 6,
63
- "Ios16.cast" : 16,
64
- "Ios16.less" : 7,
65
- "Ios16.conv" : 56,
66
- "Ios16.relu" : 23,
67
- "Ios16.reshape" : 175,
68
- "Ios16.matmul" : 87,
69
- "Ios16.maximum" : 1,
70
- "Ios16.layerNorm" : 121,
71
- "SliceByIndex" : 34,
72
- "Ios16.silu" : 51,
73
- "Ios16.mul" : 119,
74
- "Ios16.logicalNot" : 2
75
- },
76
- "computePrecision" : "Mixed (Float16, Float32, Int32)",
77
- "stateSchema" : [
78
-
79
- ],
80
- "isUpdatable" : "0",
81
- "availability" : {
82
- "macOS" : "13.0",
83
- "tvOS" : "16.0",
84
- "visionOS" : "1.0",
85
- "watchOS" : "9.0",
86
- "iOS" : "16.0",
87
- "macCatalyst" : "16.0"
88
- },
89
- "modelType" : {
90
- "name" : "MLModelType_pipeline",
91
- "structure" : [
92
- {
93
- "name" : "MLModelType_mlProgram"
94
- },
95
- {
96
- "name" : "MLModelType_mlProgram"
97
- }
98
- ]
99
- },
100
- "inputSchema" : [
101
- {
102
- "hasShapeFlexibility" : "0",
103
- "isOptional" : "0",
104
- "dataType" : "Float32",
105
- "formattedType" : "MultiArray (Float32 1 × 112 × 128)",
106
- "shortDescription" : "Mel spectrogram features for the new chunk",
107
- "shape" : "[1, 112, 128]",
108
- "name" : "chunk",
109
- "type" : "MultiArray"
110
- },
111
- {
112
- "hasShapeFlexibility" : "0",
113
- "isOptional" : "0",
114
- "dataType" : "Int32",
115
- "formattedType" : "MultiArray (Int32 1)",
116
- "shortDescription" : "Length of the new chunk",
117
- "shape" : "[1]",
118
- "name" : "chunk_lengths",
119
- "type" : "MultiArray"
120
- },
121
- {
122
- "hasShapeFlexibility" : "0",
123
- "isOptional" : "0",
124
- "dataType" : "Float32",
125
- "formattedType" : "MultiArray (Float32 1 × 188 × 512)",
126
- "shortDescription" : "Order of Arrival Speaker Cache",
127
- "shape" : "[1, 188, 512]",
128
- "name" : "spkcache",
129
- "type" : "MultiArray"
130
- },
131
- {
132
- "hasShapeFlexibility" : "0",
133
- "isOptional" : "0",
134
- "dataType" : "Int32",
135
- "formattedType" : "MultiArray (Int32 1)",
136
- "shortDescription" : "Length of the speaker cache (in frames)",
137
- "shape" : "[1]",
138
- "name" : "spkcache_lengths",
139
- "type" : "MultiArray"
140
- },
141
- {
142
- "hasShapeFlexibility" : "0",
143
- "isOptional" : "0",
144
- "dataType" : "Float32",
145
- "formattedType" : "MultiArray (Float32 1 × 40 × 512)",
146
- "shortDescription" : "First-In-First-Out speech queue",
147
- "shape" : "[1, 40, 512]",
148
- "name" : "fifo",
149
- "type" : "MultiArray"
150
- },
151
- {
152
- "hasShapeFlexibility" : "0",
153
- "isOptional" : "0",
154
- "dataType" : "Int32",
155
- "formattedType" : "MultiArray (Int32 1)",
156
- "shortDescription" : "Length of the FIFO queue (in frames)",
157
- "shape" : "[1]",
158
- "name" : "fifo_lengths",
159
- "type" : "MultiArray"
160
- }
161
- ],
162
- "userDefinedMetadata" : {
163
- "frame_duration" : "0.08",
164
- "spkcache_update_period" : "31",
165
- "chunk_len" : "6",
166
- "subsampling_factor" : "8",
167
- "chunk_right_context" : "7",
168
- "mel_feature_frames" : "48",
169
- "fifo_len" : "40",
170
- "chunk_left_context" : "1",
171
- "spkcache_len" : "188"
172
- },
173
- "generatedClassName" : "Sortformer",
174
- "method" : "predict"
175
- }
176
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Sortformer.mlmodelc/model0/coremldata.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:94cc859be2d5514e057506ea9bed05ac455fb72ebf05a52e4aeddf5dea80ceb1
3
- size 632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1221dd5d8951643b2226e5d977d6400316d61c752cfd4b7c673646c29be386e4
3
+ size 640
Sortformer.mlmodelc/model0/model.mil CHANGED
@@ -1,5 +1,5 @@
1
  program(1.0)
2
- [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
3
  {
4
  func main<ios16>(tensor<fp32, [1, 112, 128]> chunk, tensor<int32, [1]> chunk_lengths, tensor<fp32, [1, 40, 512]> fifo, tensor<int32, [1]> fifo_lengths, tensor<fp32, [1, 188, 512]> spkcache, tensor<int32, [1]> spkcache_lengths) {
5
  tensor<fp32, [256]> model_encoder_pre_encode_conv_0_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/0-weight.bin"), offset = tensor<uint64, []>(64)))];
@@ -154,16 +154,16 @@ program(1.0)
154
  tensor<int32, [3]> var_229 = const()[name = tensor<string, []>("op_229"), val = tensor<int32, [3]>([1, 14, -1])];
155
  tensor<fp32, [1, 14, 256, 16]> var_228 = transpose(perm = var_228_perm_0, x = x)[name = tensor<string, []>("transpose_0")];
156
  tensor<fp32, [1, 14, 4096]> input = reshape(shape = var_229, x = var_228)[name = tensor<string, []>("input")];
157
- tensor<fp32, [1, 14, 512]> chunk_pre_encoder_embs = linear(bias = model_encoder_pre_encode_out_bias, weight = model_encoder_pre_encode_out_weight, x = input)[name = tensor<string, []>("linear_0")];
158
  tensor<string, []> var_241_dtype_0 = const()[name = tensor<string, []>("op_241_dtype_0"), val = tensor<string, []>("int32")];
159
  tensor<int32, [1]> size0 = const()[name = tensor<string, []>("size0"), val = tensor<int32, [1]>([188])];
160
  tensor<int32, [1]> size1 = const()[name = tensor<string, []>("size1"), val = tensor<int32, [1]>([40])];
161
  tensor<int32, []> var_264 = const()[name = tensor<string, []>("op_264"), val = tensor<int32, []>(1)];
162
  tensor<bool, []> full_concat_interleave_0 = const()[name = tensor<string, []>("full_concat_interleave_0"), val = tensor<bool, []>(false)];
163
- tensor<fp32, [1, 242, 512]> full_concat = concat(axis = var_264, interleave = full_concat_interleave_0, values = (spkcache, fifo, chunk_pre_encoder_embs))[name = tensor<string, []>("full_concat")];
164
  tensor<int32, [1]> var_273 = add(x = spkcache_lengths, y = fifo_lengths)[name = tensor<string, []>("op_273")];
165
- tensor<int32, [1]> chunk_pre_encoder_lengths = cast(dtype = var_241_dtype_0, x = current_lengths)[name = tensor<string, []>("cast_4")];
166
- tensor<int32, [1]> pre_encoder_lengths = add(x = var_273, y = chunk_pre_encoder_lengths)[name = tensor<string, []>("total_length")];
167
  tensor<int32, [242]> out_pos = const()[name = tensor<string, []>("out_pos"), val = tensor<int32, [242]>([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241])];
168
  tensor<bool, [242]> var_284 = greater_equal(x = out_pos, y = spkcache_lengths)[name = tensor<string, []>("op_284")];
169
  tensor<string, []> in_seg1_or_2_dtype_0 = const()[name = tensor<string, []>("in_seg1_or_2_dtype_0"), val = tensor<string, []>("int32")];
@@ -197,5 +197,5 @@ program(1.0)
197
  tensor<int32, [1]> var_332_axes_0 = const()[name = tensor<string, []>("op_332_axes_0"), val = tensor<int32, [1]>([-1])];
198
  tensor<fp32, [1, 242, 1]> var_332 = expand_dims(axes = var_332_axes_0, x = var_330)[name = tensor<string, []>("op_332")];
199
  tensor<fp32, [1, 242, 512]> pre_encoder_embs = mul(x = output, y = var_332)[name = tensor<string, []>("op_333")];
200
- } -> (pre_encoder_embs, pre_encoder_lengths, chunk_pre_encoder_embs, chunk_pre_encoder_lengths);
201
  }
 
1
  program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3405.2.1"}, {"coremlc-version", "3405.2.1"}, {"coremltools-component-torch", "2.9.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
3
  {
4
  func main<ios16>(tensor<fp32, [1, 112, 128]> chunk, tensor<int32, [1]> chunk_lengths, tensor<fp32, [1, 40, 512]> fifo, tensor<int32, [1]> fifo_lengths, tensor<fp32, [1, 188, 512]> spkcache, tensor<int32, [1]> spkcache_lengths) {
5
  tensor<fp32, [256]> model_encoder_pre_encode_conv_0_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/0-weight.bin"), offset = tensor<uint64, []>(64)))];
 
154
  tensor<int32, [3]> var_229 = const()[name = tensor<string, []>("op_229"), val = tensor<int32, [3]>([1, 14, -1])];
155
  tensor<fp32, [1, 14, 256, 16]> var_228 = transpose(perm = var_228_perm_0, x = x)[name = tensor<string, []>("transpose_0")];
156
  tensor<fp32, [1, 14, 4096]> input = reshape(shape = var_229, x = var_228)[name = tensor<string, []>("input")];
157
+ tensor<fp32, [1, 14, 512]> chunk_pre_encoder_embs_out = linear(bias = model_encoder_pre_encode_out_bias, weight = model_encoder_pre_encode_out_weight, x = input)[name = tensor<string, []>("linear_0")];
158
  tensor<string, []> var_241_dtype_0 = const()[name = tensor<string, []>("op_241_dtype_0"), val = tensor<string, []>("int32")];
159
  tensor<int32, [1]> size0 = const()[name = tensor<string, []>("size0"), val = tensor<int32, [1]>([188])];
160
  tensor<int32, [1]> size1 = const()[name = tensor<string, []>("size1"), val = tensor<int32, [1]>([40])];
161
  tensor<int32, []> var_264 = const()[name = tensor<string, []>("op_264"), val = tensor<int32, []>(1)];
162
  tensor<bool, []> full_concat_interleave_0 = const()[name = tensor<string, []>("full_concat_interleave_0"), val = tensor<bool, []>(false)];
163
+ tensor<fp32, [1, 242, 512]> full_concat = concat(axis = var_264, interleave = full_concat_interleave_0, values = (spkcache, fifo, chunk_pre_encoder_embs_out))[name = tensor<string, []>("full_concat")];
164
  tensor<int32, [1]> var_273 = add(x = spkcache_lengths, y = fifo_lengths)[name = tensor<string, []>("op_273")];
165
+ tensor<int32, [1]> chunk_pre_encoder_lengths_out = cast(dtype = var_241_dtype_0, x = current_lengths)[name = tensor<string, []>("cast_4")];
166
+ tensor<int32, [1]> pre_encoder_lengths = add(x = var_273, y = chunk_pre_encoder_lengths_out)[name = tensor<string, []>("total_length")];
167
  tensor<int32, [242]> out_pos = const()[name = tensor<string, []>("out_pos"), val = tensor<int32, [242]>([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241])];
168
  tensor<bool, [242]> var_284 = greater_equal(x = out_pos, y = spkcache_lengths)[name = tensor<string, []>("op_284")];
169
  tensor<string, []> in_seg1_or_2_dtype_0 = const()[name = tensor<string, []>("in_seg1_or_2_dtype_0"), val = tensor<string, []>("int32")];
 
197
  tensor<int32, [1]> var_332_axes_0 = const()[name = tensor<string, []>("op_332_axes_0"), val = tensor<int32, [1]>([-1])];
198
  tensor<fp32, [1, 242, 1]> var_332 = expand_dims(axes = var_332_axes_0, x = var_330)[name = tensor<string, []>("op_332")];
199
  tensor<fp32, [1, 242, 512]> pre_encoder_embs = mul(x = output, y = var_332)[name = tensor<string, []>("op_333")];
200
+ } -> (pre_encoder_embs, pre_encoder_lengths, chunk_pre_encoder_embs_out, chunk_pre_encoder_lengths_out);
201
  }
Sortformer.mlmodelc/model1/coremldata.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e613306cf96ba390ac0ebb449947c8b4dc344a6a9e3be69b8220ad1caa74056b
3
- size 585
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efacb38511f55c820a24313fb191341db87f05466ec6dd855203d8f6c126aba3
3
+ size 605
Sortformer.mlmodelc/model1/model.mil CHANGED
@@ -1,7 +1,7 @@
1
  program(1.0)
2
- [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
3
  {
4
- func main<ios16>(tensor<fp32, [1, 14, 512]> chunk_pre_encoder_embs, tensor<int32, [1]> chunk_pre_encoder_lengths, tensor<fp32, [1, 242, 512]> pre_encoder_embs, tensor<int32, [1]> pre_encoder_lengths) {
5
  tensor<int32, []> var_30 = const()[name = tensor<string, []>("op_30"), val = tensor<int32, []>(-1)];
6
  tensor<string, []> pre_encoder_embs_to_fp16_dtype_0 = const()[name = tensor<string, []>("pre_encoder_embs_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
7
  tensor<fp16, []> var_77_to_fp16 = const()[name = tensor<string, []>("op_77_to_fp16"), val = tensor<fp16, []>(0x1.6ap+4)];
@@ -3587,8 +3587,8 @@ program(1.0)
3587
  tensor<fp16, [1, 242, 1]> var_4765_to_fp16 = cast(dtype = var_4765_promoted_to_fp16_dtype_0, x = var_4765)[name = tensor<string, []>("cast_302")];
3588
  tensor<fp16, [1, 242, 4]> var_4766_cast_fp16 = mul(x = _preds_cast_fp16, y = var_4765_to_fp16)[name = tensor<string, []>("op_4766_cast_fp16")];
3589
  tensor<string, []> var_4766_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_4766_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
3590
- tensor<fp32, [1, 242, 4]> speaker_preds = cast(dtype = var_4766_cast_fp16_to_fp32_dtype_0, x = var_4766_cast_fp16)[name = tensor<string, []>("cast_301")];
3591
- tensor<fp32, [1, 14, 512]> chunk_pre_encoder_embs_tmp = identity(x = chunk_pre_encoder_embs)[name = tensor<string, []>("chunk_pre_encoder_embs_tmp")];
3592
- tensor<int32, [1]> chunk_pre_encoder_lengths_tmp = identity(x = chunk_pre_encoder_lengths)[name = tensor<string, []>("chunk_pre_encoder_lengths_tmp")];
3593
- } -> (speaker_preds, chunk_pre_encoder_embs, chunk_pre_encoder_lengths);
3594
  }
 
1
  program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3405.2.1"}, {"coremlc-version", "3405.2.1"}, {"coremltools-component-torch", "2.9.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
3
  {
4
+ func main<ios16>(tensor<fp32, [1, 14, 512]> chunk_pre_encoder_embs_out, tensor<int32, [1]> chunk_pre_encoder_lengths_out, tensor<fp32, [1, 242, 512]> pre_encoder_embs, tensor<int32, [1]> pre_encoder_lengths) {
5
  tensor<int32, []> var_30 = const()[name = tensor<string, []>("op_30"), val = tensor<int32, []>(-1)];
6
  tensor<string, []> pre_encoder_embs_to_fp16_dtype_0 = const()[name = tensor<string, []>("pre_encoder_embs_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
7
  tensor<fp16, []> var_77_to_fp16 = const()[name = tensor<string, []>("op_77_to_fp16"), val = tensor<fp16, []>(0x1.6ap+4)];
 
3587
  tensor<fp16, [1, 242, 1]> var_4765_to_fp16 = cast(dtype = var_4765_promoted_to_fp16_dtype_0, x = var_4765)[name = tensor<string, []>("cast_302")];
3588
  tensor<fp16, [1, 242, 4]> var_4766_cast_fp16 = mul(x = _preds_cast_fp16, y = var_4765_to_fp16)[name = tensor<string, []>("op_4766_cast_fp16")];
3589
  tensor<string, []> var_4766_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_4766_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
3590
+ tensor<fp32, [1, 242, 4]> speaker_preds_out = cast(dtype = var_4766_cast_fp16_to_fp32_dtype_0, x = var_4766_cast_fp16)[name = tensor<string, []>("cast_301")];
3591
+ tensor<fp32, [1, 14, 512]> chunk_pre_encoder_embs_tmp = identity(x = chunk_pre_encoder_embs_out)[name = tensor<string, []>("chunk_pre_encoder_embs_tmp")];
3592
+ tensor<int32, [1]> chunk_pre_encoder_lengths_tmp = identity(x = chunk_pre_encoder_lengths_out)[name = tensor<string, []>("chunk_pre_encoder_lengths_tmp")];
3593
+ } -> (speaker_preds_out, chunk_pre_encoder_embs_out, chunk_pre_encoder_lengths_out);
3594
  }