Upload folder using huggingface_hub
Browse files- README.md +0 -83
- Sortformer.mlmodelc/analytics/coremldata.bin +1 -1
- Sortformer.mlmodelc/coremldata.bin +2 -2
- Sortformer.mlmodelc/metadata.json +0 -176
- Sortformer.mlmodelc/model0/coremldata.bin +2 -2
- Sortformer.mlmodelc/model0/model.mil +6 -6
- Sortformer.mlmodelc/model1/coremldata.bin +2 -2
- Sortformer.mlmodelc/model1/model.mil +6 -6
README.md
DELETED
|
@@ -1,83 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: cc-by-4.0
|
| 3 |
-
tags:
|
| 4 |
-
- speaker-diarization
|
| 5 |
-
- coreml
|
| 6 |
-
- apple-silicon
|
| 7 |
-
- neural-engine
|
| 8 |
-
- sortformer
|
| 9 |
-
datasets:
|
| 10 |
-
- voxconverse
|
| 11 |
-
language:
|
| 12 |
-
- en
|
| 13 |
-
pipeline_tag: audio-classification
|
| 14 |
-
---
|
| 15 |
-
|
| 16 |
-
# Sortformer Diarization (CoreML)
|
| 17 |
-
|
| 18 |
-
CoreML-converted [NVIDIA Sortformer](https://arxiv.org/abs/2409.06656) model for end-to-end speaker diarization on Apple Silicon.
|
| 19 |
-
|
| 20 |
-
Runs on Neural Engine at ~120x real-time.
|
| 21 |
-
|
| 22 |
-
## Model Details
|
| 23 |
-
|
| 24 |
-
- **Architecture**: Sortformer (Sort Loss + FastConformer-based)
|
| 25 |
-
- **Task**: Speaker diarization (up to 4 speakers)
|
| 26 |
-
- **Input**: 128-dim log-mel features
|
| 27 |
-
- **Output**: Per-frame speaker activity probabilities
|
| 28 |
-
- **Format**: CoreML `.mlmodelc` (compiled, ready to load)
|
| 29 |
-
- **Size**: ~240 MB
|
| 30 |
-
|
| 31 |
-
## Streaming Configuration
|
| 32 |
-
|
| 33 |
-
| Parameter | Value |
|
| 34 |
-
|-----------|-------|
|
| 35 |
-
| Sample rate | 16 kHz |
|
| 36 |
-
| Mel bins | 128 |
|
| 37 |
-
| n_fft | 400 |
|
| 38 |
-
| Hop length | 160 |
|
| 39 |
-
| Chunk length | 6s |
|
| 40 |
-
| Left context | 1 chunk |
|
| 41 |
-
| Right context | 7 chunks |
|
| 42 |
-
| Subsampling factor | 8 |
|
| 43 |
-
| Max speakers | 4 |
|
| 44 |
-
|
| 45 |
-
## Input/Output Shapes
|
| 46 |
-
|
| 47 |
-
| Tensor | Shape | Description |
|
| 48 |
-
|--------|-------|-------------|
|
| 49 |
-
| `chunk` | `[1, 112, 128]` | Mel features for current chunk |
|
| 50 |
-
| `chunk_lengths` | `[1]` | Valid frames in chunk |
|
| 51 |
-
| `spkcache` | `[1, 188, 512]` | Speaker cache state |
|
| 52 |
-
| `spkcache_lengths` | `[1]` | Valid entries in speaker cache |
|
| 53 |
-
| `fifo` | `[1, 40, 512]` | FIFO buffer state |
|
| 54 |
-
| `fifo_lengths` | `[1]` | Valid entries in FIFO |
|
| 55 |
-
| **Output** | | |
|
| 56 |
-
| `speaker_preds` | `[242, 4]` | Speaker activity probabilities |
|
| 57 |
-
| `chunk_pre_encoder_embs` | `[94, 512]` | Updated embeddings |
|
| 58 |
-
| `out_spkcache_lengths` | `[1]` | Updated speaker cache length |
|
| 59 |
-
| `out_fifo_lengths` | `[1]` | Updated FIFO length |
|
| 60 |
-
|
| 61 |
-
## Usage
|
| 62 |
-
|
| 63 |
-
Used by [speech-swift](https://github.com/soniqo/speech-swift) for speaker diarization:
|
| 64 |
-
|
| 65 |
-
```bash
|
| 66 |
-
audio diarize meeting.wav --engine sortformer
|
| 67 |
-
```
|
| 68 |
-
|
| 69 |
-
```swift
|
| 70 |
-
let diarizer = try await SortformerDiarizer.fromPretrained()
|
| 71 |
-
let result = try diarizer.diarize(audio: samples, sampleRate: 16000)
|
| 72 |
-
for segment in result.segments {
|
| 73 |
-
print("Speaker \(segment.speakerId): \(segment.startTime)s - \(segment.endTime)s")
|
| 74 |
-
}
|
| 75 |
-
```
|
| 76 |
-
|
| 77 |
-
## Original Model
|
| 78 |
-
|
| 79 |
-
Converted from [NVIDIA NeMo Sortformer](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/speaker_diarization/configs.html) via coremltools.
|
| 80 |
-
|
| 81 |
-
## License
|
| 82 |
-
|
| 83 |
-
CC-BY-4.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Sortformer.mlmodelc/analytics/coremldata.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 202
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e1a18b8f51199d7598c0728aeb6ef5ca344aa2e52880033fac32a795e5b76539
|
| 3 |
size 202
|
Sortformer.mlmodelc/coremldata.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:29f0563d086d7ea0b8f0461d6e1330d475971fdade05809a52207abf739d9992
|
| 3 |
+
size 1093
|
Sortformer.mlmodelc/metadata.json
DELETED
|
@@ -1,176 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"metadataOutputVersion" : "3.0",
|
| 4 |
-
"shortDescription" : "CoreML port of Nvidia's Streaming Sortformer diarization model",
|
| 5 |
-
"outputSchema" : [
|
| 6 |
-
{
|
| 7 |
-
"hasShapeFlexibility" : "0",
|
| 8 |
-
"isOptional" : "0",
|
| 9 |
-
"dataType" : "Float32",
|
| 10 |
-
"formattedType" : "MultiArray (Float32 1 × 242 × 4)",
|
| 11 |
-
"shortDescription" : "Combined speaker probabilities for the speaker cache, FIFO queue, and chunk",
|
| 12 |
-
"shape" : "[1, 242, 4]",
|
| 13 |
-
"name" : "speaker_preds",
|
| 14 |
-
"type" : "MultiArray"
|
| 15 |
-
},
|
| 16 |
-
{
|
| 17 |
-
"hasShapeFlexibility" : "0",
|
| 18 |
-
"isOptional" : "0",
|
| 19 |
-
"dataType" : "Float32",
|
| 20 |
-
"formattedType" : "MultiArray (Float32 1 × 14 × 512)",
|
| 21 |
-
"shortDescription" : "Speaker embeddings for the new chunk",
|
| 22 |
-
"shape" : "[1, 14, 512]",
|
| 23 |
-
"name" : "chunk_pre_encoder_embs",
|
| 24 |
-
"type" : "MultiArray"
|
| 25 |
-
},
|
| 26 |
-
{
|
| 27 |
-
"hasShapeFlexibility" : "0",
|
| 28 |
-
"isOptional" : "0",
|
| 29 |
-
"dataType" : "Int32",
|
| 30 |
-
"formattedType" : "MultiArray (Int32 1)",
|
| 31 |
-
"shortDescription" : "Number of frames for the new chunk",
|
| 32 |
-
"shape" : "[1]",
|
| 33 |
-
"name" : "chunk_pre_encoder_lengths",
|
| 34 |
-
"type" : "MultiArray"
|
| 35 |
-
}
|
| 36 |
-
],
|
| 37 |
-
"storagePrecision" : "Mixed (Float16, Float32)",
|
| 38 |
-
"modelParameters" : [
|
| 39 |
-
|
| 40 |
-
],
|
| 41 |
-
"author" : "Benjamin Lee",
|
| 42 |
-
"specificationVersion" : 7,
|
| 43 |
-
"license" : "MIT",
|
| 44 |
-
"mlProgramOperationTypeHistogram" : {
|
| 45 |
-
"Ios16.floorDiv" : 3,
|
| 46 |
-
"Transpose" : 193,
|
| 47 |
-
"Identity" : 2,
|
| 48 |
-
"Ios16.softmax" : 35,
|
| 49 |
-
"Ios16.gatherAlongAxis" : 1,
|
| 50 |
-
"Split" : 17,
|
| 51 |
-
"Ios16.linear" : 248,
|
| 52 |
-
"Ios16.add" : 186,
|
| 53 |
-
"Concat" : 1,
|
| 54 |
-
"Ios16.greaterEqual" : 2,
|
| 55 |
-
"Tile" : 9,
|
| 56 |
-
"Select" : 51,
|
| 57 |
-
"Ios16.minimum" : 1,
|
| 58 |
-
"Ios16.sigmoid" : 18,
|
| 59 |
-
"Ios16.logicalAnd" : 2,
|
| 60 |
-
"Pad" : 34,
|
| 61 |
-
"ExpandDims" : 25,
|
| 62 |
-
"Ios16.sub" : 6,
|
| 63 |
-
"Ios16.cast" : 16,
|
| 64 |
-
"Ios16.less" : 7,
|
| 65 |
-
"Ios16.conv" : 56,
|
| 66 |
-
"Ios16.relu" : 23,
|
| 67 |
-
"Ios16.reshape" : 175,
|
| 68 |
-
"Ios16.matmul" : 87,
|
| 69 |
-
"Ios16.maximum" : 1,
|
| 70 |
-
"Ios16.layerNorm" : 121,
|
| 71 |
-
"SliceByIndex" : 34,
|
| 72 |
-
"Ios16.silu" : 51,
|
| 73 |
-
"Ios16.mul" : 119,
|
| 74 |
-
"Ios16.logicalNot" : 2
|
| 75 |
-
},
|
| 76 |
-
"computePrecision" : "Mixed (Float16, Float32, Int32)",
|
| 77 |
-
"stateSchema" : [
|
| 78 |
-
|
| 79 |
-
],
|
| 80 |
-
"isUpdatable" : "0",
|
| 81 |
-
"availability" : {
|
| 82 |
-
"macOS" : "13.0",
|
| 83 |
-
"tvOS" : "16.0",
|
| 84 |
-
"visionOS" : "1.0",
|
| 85 |
-
"watchOS" : "9.0",
|
| 86 |
-
"iOS" : "16.0",
|
| 87 |
-
"macCatalyst" : "16.0"
|
| 88 |
-
},
|
| 89 |
-
"modelType" : {
|
| 90 |
-
"name" : "MLModelType_pipeline",
|
| 91 |
-
"structure" : [
|
| 92 |
-
{
|
| 93 |
-
"name" : "MLModelType_mlProgram"
|
| 94 |
-
},
|
| 95 |
-
{
|
| 96 |
-
"name" : "MLModelType_mlProgram"
|
| 97 |
-
}
|
| 98 |
-
]
|
| 99 |
-
},
|
| 100 |
-
"inputSchema" : [
|
| 101 |
-
{
|
| 102 |
-
"hasShapeFlexibility" : "0",
|
| 103 |
-
"isOptional" : "0",
|
| 104 |
-
"dataType" : "Float32",
|
| 105 |
-
"formattedType" : "MultiArray (Float32 1 × 112 × 128)",
|
| 106 |
-
"shortDescription" : "Mel spectrogram features for the new chunk",
|
| 107 |
-
"shape" : "[1, 112, 128]",
|
| 108 |
-
"name" : "chunk",
|
| 109 |
-
"type" : "MultiArray"
|
| 110 |
-
},
|
| 111 |
-
{
|
| 112 |
-
"hasShapeFlexibility" : "0",
|
| 113 |
-
"isOptional" : "0",
|
| 114 |
-
"dataType" : "Int32",
|
| 115 |
-
"formattedType" : "MultiArray (Int32 1)",
|
| 116 |
-
"shortDescription" : "Length of the new chunk",
|
| 117 |
-
"shape" : "[1]",
|
| 118 |
-
"name" : "chunk_lengths",
|
| 119 |
-
"type" : "MultiArray"
|
| 120 |
-
},
|
| 121 |
-
{
|
| 122 |
-
"hasShapeFlexibility" : "0",
|
| 123 |
-
"isOptional" : "0",
|
| 124 |
-
"dataType" : "Float32",
|
| 125 |
-
"formattedType" : "MultiArray (Float32 1 × 188 × 512)",
|
| 126 |
-
"shortDescription" : "Order of Arrival Speaker Cache",
|
| 127 |
-
"shape" : "[1, 188, 512]",
|
| 128 |
-
"name" : "spkcache",
|
| 129 |
-
"type" : "MultiArray"
|
| 130 |
-
},
|
| 131 |
-
{
|
| 132 |
-
"hasShapeFlexibility" : "0",
|
| 133 |
-
"isOptional" : "0",
|
| 134 |
-
"dataType" : "Int32",
|
| 135 |
-
"formattedType" : "MultiArray (Int32 1)",
|
| 136 |
-
"shortDescription" : "Length of the speaker cache (in frames)",
|
| 137 |
-
"shape" : "[1]",
|
| 138 |
-
"name" : "spkcache_lengths",
|
| 139 |
-
"type" : "MultiArray"
|
| 140 |
-
},
|
| 141 |
-
{
|
| 142 |
-
"hasShapeFlexibility" : "0",
|
| 143 |
-
"isOptional" : "0",
|
| 144 |
-
"dataType" : "Float32",
|
| 145 |
-
"formattedType" : "MultiArray (Float32 1 × 40 × 512)",
|
| 146 |
-
"shortDescription" : "First-In-First-Out speech queue",
|
| 147 |
-
"shape" : "[1, 40, 512]",
|
| 148 |
-
"name" : "fifo",
|
| 149 |
-
"type" : "MultiArray"
|
| 150 |
-
},
|
| 151 |
-
{
|
| 152 |
-
"hasShapeFlexibility" : "0",
|
| 153 |
-
"isOptional" : "0",
|
| 154 |
-
"dataType" : "Int32",
|
| 155 |
-
"formattedType" : "MultiArray (Int32 1)",
|
| 156 |
-
"shortDescription" : "Length of the FIFO queue (in frames)",
|
| 157 |
-
"shape" : "[1]",
|
| 158 |
-
"name" : "fifo_lengths",
|
| 159 |
-
"type" : "MultiArray"
|
| 160 |
-
}
|
| 161 |
-
],
|
| 162 |
-
"userDefinedMetadata" : {
|
| 163 |
-
"frame_duration" : "0.08",
|
| 164 |
-
"spkcache_update_period" : "31",
|
| 165 |
-
"chunk_len" : "6",
|
| 166 |
-
"subsampling_factor" : "8",
|
| 167 |
-
"chunk_right_context" : "7",
|
| 168 |
-
"mel_feature_frames" : "48",
|
| 169 |
-
"fifo_len" : "40",
|
| 170 |
-
"chunk_left_context" : "1",
|
| 171 |
-
"spkcache_len" : "188"
|
| 172 |
-
},
|
| 173 |
-
"generatedClassName" : "Sortformer",
|
| 174 |
-
"method" : "predict"
|
| 175 |
-
}
|
| 176 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Sortformer.mlmodelc/model0/coremldata.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1221dd5d8951643b2226e5d977d6400316d61c752cfd4b7c673646c29be386e4
|
| 3 |
+
size 640
|
Sortformer.mlmodelc/model0/model.mil
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
program(1.0)
|
| 2 |
-
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "
|
| 3 |
{
|
| 4 |
func main<ios16>(tensor<fp32, [1, 112, 128]> chunk, tensor<int32, [1]> chunk_lengths, tensor<fp32, [1, 40, 512]> fifo, tensor<int32, [1]> fifo_lengths, tensor<fp32, [1, 188, 512]> spkcache, tensor<int32, [1]> spkcache_lengths) {
|
| 5 |
tensor<fp32, [256]> model_encoder_pre_encode_conv_0_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/0-weight.bin"), offset = tensor<uint64, []>(64)))];
|
|
@@ -154,16 +154,16 @@ program(1.0)
|
|
| 154 |
tensor<int32, [3]> var_229 = const()[name = tensor<string, []>("op_229"), val = tensor<int32, [3]>([1, 14, -1])];
|
| 155 |
tensor<fp32, [1, 14, 256, 16]> var_228 = transpose(perm = var_228_perm_0, x = x)[name = tensor<string, []>("transpose_0")];
|
| 156 |
tensor<fp32, [1, 14, 4096]> input = reshape(shape = var_229, x = var_228)[name = tensor<string, []>("input")];
|
| 157 |
-
tensor<fp32, [1, 14, 512]>
|
| 158 |
tensor<string, []> var_241_dtype_0 = const()[name = tensor<string, []>("op_241_dtype_0"), val = tensor<string, []>("int32")];
|
| 159 |
tensor<int32, [1]> size0 = const()[name = tensor<string, []>("size0"), val = tensor<int32, [1]>([188])];
|
| 160 |
tensor<int32, [1]> size1 = const()[name = tensor<string, []>("size1"), val = tensor<int32, [1]>([40])];
|
| 161 |
tensor<int32, []> var_264 = const()[name = tensor<string, []>("op_264"), val = tensor<int32, []>(1)];
|
| 162 |
tensor<bool, []> full_concat_interleave_0 = const()[name = tensor<string, []>("full_concat_interleave_0"), val = tensor<bool, []>(false)];
|
| 163 |
-
tensor<fp32, [1, 242, 512]> full_concat = concat(axis = var_264, interleave = full_concat_interleave_0, values = (spkcache, fifo,
|
| 164 |
tensor<int32, [1]> var_273 = add(x = spkcache_lengths, y = fifo_lengths)[name = tensor<string, []>("op_273")];
|
| 165 |
-
tensor<int32, [1]>
|
| 166 |
-
tensor<int32, [1]> pre_encoder_lengths = add(x = var_273, y =
|
| 167 |
tensor<int32, [242]> out_pos = const()[name = tensor<string, []>("out_pos"), val = tensor<int32, [242]>([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241])];
|
| 168 |
tensor<bool, [242]> var_284 = greater_equal(x = out_pos, y = spkcache_lengths)[name = tensor<string, []>("op_284")];
|
| 169 |
tensor<string, []> in_seg1_or_2_dtype_0 = const()[name = tensor<string, []>("in_seg1_or_2_dtype_0"), val = tensor<string, []>("int32")];
|
|
@@ -197,5 +197,5 @@ program(1.0)
|
|
| 197 |
tensor<int32, [1]> var_332_axes_0 = const()[name = tensor<string, []>("op_332_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 198 |
tensor<fp32, [1, 242, 1]> var_332 = expand_dims(axes = var_332_axes_0, x = var_330)[name = tensor<string, []>("op_332")];
|
| 199 |
tensor<fp32, [1, 242, 512]> pre_encoder_embs = mul(x = output, y = var_332)[name = tensor<string, []>("op_333")];
|
| 200 |
-
} -> (pre_encoder_embs, pre_encoder_lengths,
|
| 201 |
}
|
|
|
|
| 1 |
program(1.0)
|
| 2 |
+
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3405.2.1"}, {"coremlc-version", "3405.2.1"}, {"coremltools-component-torch", "2.9.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
|
| 3 |
{
|
| 4 |
func main<ios16>(tensor<fp32, [1, 112, 128]> chunk, tensor<int32, [1]> chunk_lengths, tensor<fp32, [1, 40, 512]> fifo, tensor<int32, [1]> fifo_lengths, tensor<fp32, [1, 188, 512]> spkcache, tensor<int32, [1]> spkcache_lengths) {
|
| 5 |
tensor<fp32, [256]> model_encoder_pre_encode_conv_0_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/0-weight.bin"), offset = tensor<uint64, []>(64)))];
|
|
|
|
| 154 |
tensor<int32, [3]> var_229 = const()[name = tensor<string, []>("op_229"), val = tensor<int32, [3]>([1, 14, -1])];
|
| 155 |
tensor<fp32, [1, 14, 256, 16]> var_228 = transpose(perm = var_228_perm_0, x = x)[name = tensor<string, []>("transpose_0")];
|
| 156 |
tensor<fp32, [1, 14, 4096]> input = reshape(shape = var_229, x = var_228)[name = tensor<string, []>("input")];
|
| 157 |
+
tensor<fp32, [1, 14, 512]> chunk_pre_encoder_embs_out = linear(bias = model_encoder_pre_encode_out_bias, weight = model_encoder_pre_encode_out_weight, x = input)[name = tensor<string, []>("linear_0")];
|
| 158 |
tensor<string, []> var_241_dtype_0 = const()[name = tensor<string, []>("op_241_dtype_0"), val = tensor<string, []>("int32")];
|
| 159 |
tensor<int32, [1]> size0 = const()[name = tensor<string, []>("size0"), val = tensor<int32, [1]>([188])];
|
| 160 |
tensor<int32, [1]> size1 = const()[name = tensor<string, []>("size1"), val = tensor<int32, [1]>([40])];
|
| 161 |
tensor<int32, []> var_264 = const()[name = tensor<string, []>("op_264"), val = tensor<int32, []>(1)];
|
| 162 |
tensor<bool, []> full_concat_interleave_0 = const()[name = tensor<string, []>("full_concat_interleave_0"), val = tensor<bool, []>(false)];
|
| 163 |
+
tensor<fp32, [1, 242, 512]> full_concat = concat(axis = var_264, interleave = full_concat_interleave_0, values = (spkcache, fifo, chunk_pre_encoder_embs_out))[name = tensor<string, []>("full_concat")];
|
| 164 |
tensor<int32, [1]> var_273 = add(x = spkcache_lengths, y = fifo_lengths)[name = tensor<string, []>("op_273")];
|
| 165 |
+
tensor<int32, [1]> chunk_pre_encoder_lengths_out = cast(dtype = var_241_dtype_0, x = current_lengths)[name = tensor<string, []>("cast_4")];
|
| 166 |
+
tensor<int32, [1]> pre_encoder_lengths = add(x = var_273, y = chunk_pre_encoder_lengths_out)[name = tensor<string, []>("total_length")];
|
| 167 |
tensor<int32, [242]> out_pos = const()[name = tensor<string, []>("out_pos"), val = tensor<int32, [242]>([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241])];
|
| 168 |
tensor<bool, [242]> var_284 = greater_equal(x = out_pos, y = spkcache_lengths)[name = tensor<string, []>("op_284")];
|
| 169 |
tensor<string, []> in_seg1_or_2_dtype_0 = const()[name = tensor<string, []>("in_seg1_or_2_dtype_0"), val = tensor<string, []>("int32")];
|
|
|
|
| 197 |
tensor<int32, [1]> var_332_axes_0 = const()[name = tensor<string, []>("op_332_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 198 |
tensor<fp32, [1, 242, 1]> var_332 = expand_dims(axes = var_332_axes_0, x = var_330)[name = tensor<string, []>("op_332")];
|
| 199 |
tensor<fp32, [1, 242, 512]> pre_encoder_embs = mul(x = output, y = var_332)[name = tensor<string, []>("op_333")];
|
| 200 |
+
} -> (pre_encoder_embs, pre_encoder_lengths, chunk_pre_encoder_embs_out, chunk_pre_encoder_lengths_out);
|
| 201 |
}
|
Sortformer.mlmodelc/model1/coremldata.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:efacb38511f55c820a24313fb191341db87f05466ec6dd855203d8f6c126aba3
|
| 3 |
+
size 605
|
Sortformer.mlmodelc/model1/model.mil
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
program(1.0)
|
| 2 |
-
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "
|
| 3 |
{
|
| 4 |
-
func main<ios16>(tensor<fp32, [1, 14, 512]>
|
| 5 |
tensor<int32, []> var_30 = const()[name = tensor<string, []>("op_30"), val = tensor<int32, []>(-1)];
|
| 6 |
tensor<string, []> pre_encoder_embs_to_fp16_dtype_0 = const()[name = tensor<string, []>("pre_encoder_embs_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 7 |
tensor<fp16, []> var_77_to_fp16 = const()[name = tensor<string, []>("op_77_to_fp16"), val = tensor<fp16, []>(0x1.6ap+4)];
|
|
@@ -3587,8 +3587,8 @@ program(1.0)
|
|
| 3587 |
tensor<fp16, [1, 242, 1]> var_4765_to_fp16 = cast(dtype = var_4765_promoted_to_fp16_dtype_0, x = var_4765)[name = tensor<string, []>("cast_302")];
|
| 3588 |
tensor<fp16, [1, 242, 4]> var_4766_cast_fp16 = mul(x = _preds_cast_fp16, y = var_4765_to_fp16)[name = tensor<string, []>("op_4766_cast_fp16")];
|
| 3589 |
tensor<string, []> var_4766_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_4766_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 3590 |
-
tensor<fp32, [1, 242, 4]>
|
| 3591 |
-
tensor<fp32, [1, 14, 512]> chunk_pre_encoder_embs_tmp = identity(x =
|
| 3592 |
-
tensor<int32, [1]> chunk_pre_encoder_lengths_tmp = identity(x =
|
| 3593 |
-
} -> (
|
| 3594 |
}
|
|
|
|
| 1 |
program(1.0)
|
| 2 |
+
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3405.2.1"}, {"coremlc-version", "3405.2.1"}, {"coremltools-component-torch", "2.9.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
|
| 3 |
{
|
| 4 |
+
func main<ios16>(tensor<fp32, [1, 14, 512]> chunk_pre_encoder_embs_out, tensor<int32, [1]> chunk_pre_encoder_lengths_out, tensor<fp32, [1, 242, 512]> pre_encoder_embs, tensor<int32, [1]> pre_encoder_lengths) {
|
| 5 |
tensor<int32, []> var_30 = const()[name = tensor<string, []>("op_30"), val = tensor<int32, []>(-1)];
|
| 6 |
tensor<string, []> pre_encoder_embs_to_fp16_dtype_0 = const()[name = tensor<string, []>("pre_encoder_embs_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 7 |
tensor<fp16, []> var_77_to_fp16 = const()[name = tensor<string, []>("op_77_to_fp16"), val = tensor<fp16, []>(0x1.6ap+4)];
|
|
|
|
| 3587 |
tensor<fp16, [1, 242, 1]> var_4765_to_fp16 = cast(dtype = var_4765_promoted_to_fp16_dtype_0, x = var_4765)[name = tensor<string, []>("cast_302")];
|
| 3588 |
tensor<fp16, [1, 242, 4]> var_4766_cast_fp16 = mul(x = _preds_cast_fp16, y = var_4765_to_fp16)[name = tensor<string, []>("op_4766_cast_fp16")];
|
| 3589 |
tensor<string, []> var_4766_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_4766_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 3590 |
+
tensor<fp32, [1, 242, 4]> speaker_preds_out = cast(dtype = var_4766_cast_fp16_to_fp32_dtype_0, x = var_4766_cast_fp16)[name = tensor<string, []>("cast_301")];
|
| 3591 |
+
tensor<fp32, [1, 14, 512]> chunk_pre_encoder_embs_tmp = identity(x = chunk_pre_encoder_embs_out)[name = tensor<string, []>("chunk_pre_encoder_embs_tmp")];
|
| 3592 |
+
tensor<int32, [1]> chunk_pre_encoder_lengths_tmp = identity(x = chunk_pre_encoder_lengths_out)[name = tensor<string, []>("chunk_pre_encoder_lengths_tmp")];
|
| 3593 |
+
} -> (speaker_preds_out, chunk_pre_encoder_embs_out, chunk_pre_encoder_lengths_out);
|
| 3594 |
}
|