alexwengg committed on
Commit
a2c20ae
·
verified ·
1 Parent(s): 5d80477

Upload 9 files

Browse files
README.md CHANGED
@@ -121,12 +121,13 @@ Slovak, Slovenian, Swedish, Turkish, Ukrainian, Vietnamese.
121
 
122
  ## Performance (Apple M2, macOS 26.5, FP16)
123
 
124
- | Module | Size | Predict | Compute placement |
125
- | ------------------ | ----- | ------- | ----------------- |
126
- | duration_predictor | 1.8 MB| 0.82 ms | CPU (tiny) |
127
- | text_encoder | 17 MB | 2.15 ms | 62 % ANE |
128
- | vocoder | 48 MB | 1.17 ms | 100 % ANE |
129
- | vector_estimator | 122 MB| 9.29 ms | CPU + GPU (see notes) |
 
130
 
131
  End‑to‑end on M2: ≈ 0.74 s to synthesize 6.32 s of audio for a single English
132
  sentence (RTFx ≈ 8.5×), 8 denoising steps. Output verified against
@@ -145,7 +146,8 @@ shipped — use `.mlmodelc` to skip the on‑device compile step on first load.
145
 
146
  - `TextEncoder.mlpackage` / `TextEncoder.mlmodelc` — fixed `T=128` text input.
147
  - `DurationPredictor.mlpackage` / `DurationPredictor.mlmodelc` — fixed `T=128` text input.
148
- - `VectorEstimator.mlpackage` / `VectorEstimator.mlmodelc` — `latent.L` and `text.T` as RangeDim(17..512).
 
149
  - `Vocoder.mlpackage` / `Vocoder.mlmodelc` — `latent.L_ttl` as RangeDim(4..512).
150
  - `tts.json` — token / text frontend configuration.
151
  - `unicode_indexer.json` — Unicode → token id mapping (multilingual frontend).
@@ -175,6 +177,9 @@ pip install -r requirements.txt
175
  python infer.py "Hello, world." --voice-style voice_styles/M1.json -o hello.wav
176
  python infer.py "Bonjour le monde." --lang fr --voice-style voice_styles/M1.json -o fr.wav
177
 
 
 
 
178
  # Optional: pick a compute unit explicitly.
179
  python infer.py "Test" --compute-units CPU_AND_NE -o ne.wav
180
  ```
 
121
 
122
  ## Performance (Apple M2, macOS 26.5, FP16)
123
 
124
+ | Module | Size | Predict | Compute placement |
125
+ | ----------------------- | ------ | ------- | ----------------- |
126
+ | duration_predictor | 1.8 MB | 0.82 ms | CPU (tiny) |
127
+ | text_encoder | 17 MB | 2.15 ms | 62 % ANE |
128
+ | vocoder | 48 MB | 1.17 ms | 100 % ANE |
129
+ | vector_estimator (fp16) | 122 MB | 9.29 ms | CPU + GPU (see notes) |
130
+ | vector_estimator (int8) | 62 MB | ≈ same as fp16 | int8 weight-only / FP16 activations; ~10 % lower peak RSS, RMSE ≈ 0.016 vs FP16 |
131
 
132
  End‑to‑end on M2: ≈ 0.74 s to synthesize 6.32 s of audio for a single English
133
  sentence (RTFx ≈ 8.5×), 8 denoising steps. Output verified against
 
146
 
147
  - `TextEncoder.mlpackage` / `TextEncoder.mlmodelc` — fixed `T=128` text input.
148
  - `DurationPredictor.mlpackage` / `DurationPredictor.mlmodelc` — fixed `T=128` text input.
149
+ - `VectorEstimator.mlpackage` / `VectorEstimator.mlmodelc` — `latent.L` and `text.T` as RangeDim(17..512), FP16 weights (122 MB).
150
+ - `VectorEstimator_int8.mlpackage` / `VectorEstimator_int8.mlmodelc` — same model, **int8 weight-only** (per-channel symmetric) + FP16 activations (62 MB; ~10 % lower peak RSS, RMSE ≈ 0.016 vs FP16).
151
  - `Vocoder.mlpackage` / `Vocoder.mlmodelc` — `latent.L_ttl` as RangeDim(4..512).
152
  - `tts.json` — token / text frontend configuration.
153
  - `unicode_indexer.json` — Unicode → token id mapping (multilingual frontend).
 
177
  python infer.py "Hello, world." --voice-style voice_styles/M1.json -o hello.wav
178
  python infer.py "Bonjour le monde." --lang fr --voice-style voice_styles/M1.json -o fr.wav
179
 
180
+ # Use the int8-quantized VectorEstimator (62 MB instead of 122 MB).
181
+ python infer.py "Hello, int8 build." --vector-estimator VectorEstimator_int8.mlpackage -o int8.wav
182
+
183
  # Optional: pick a compute unit explicitly.
184
  python infer.py "Test" --compute-units CPU_AND_NE -o ne.wav
185
  ```
VectorEstimator_int8.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:982da75910aca2f5f0e2f813bf7db9ec201fbf4c71668d0f0a99eb2f9da983d0
3
+ size 243
VectorEstimator_int8.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e93e9a59f6d4035aa07cf21ff06e160eb1bbcdad9b48d2ac413d11f3c2c26949
3
+ size 633
VectorEstimator_int8.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
VectorEstimator_int8.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e637c3a1725b5e1b3df09cc162d1822e8fe80fa92ba01c180dde2ca29c12523
3
+ size 64184608
VectorEstimator_int8.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15d1760d354e6849c72ae2fd81805bf7121aab92567f12b0f230bf70268cc853
3
+ size 345736
VectorEstimator_int8.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e637c3a1725b5e1b3df09cc162d1822e8fe80fa92ba01c180dde2ca29c12523
3
+ size 64184608
VectorEstimator_int8.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "4F2FAD81-6BF2-4BFC-96C0-D8020F7800F8": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "56E32995-F792-4C6D-BE97-7345BD2B46DB": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "4F2FAD81-6BF2-4BFC-96C0-D8020F7800F8"
18
+ }
manifest.json CHANGED
@@ -43,11 +43,12 @@
43
  "modules": [
44
  {
45
  "name": "TextEncoder",
 
46
  "package": "TextEncoder.mlpackage",
47
  "compiled": "TextEncoder.mlmodelc",
48
  "shape_policy": "fixed T=128",
49
  "package_size_bytes": 18166801,
50
- "compiled_size_bytes": 18173642,
51
  "package_files": [
52
  {
53
  "path": "TextEncoder.mlpackage/Data/com.apple.CoreML/model.mlmodel",
@@ -66,6 +67,11 @@
66
  }
67
  ],
68
  "compiled_files": [
 
 
 
 
 
69
  {
70
  "path": "TextEncoder.mlmodelc/analytics/coremldata.bin",
71
  "sha256": "b39c208b46f180dcbf4cfcd3b4437eeb93371fb77d6d636b99ca3ddff5327253",
@@ -90,11 +96,12 @@
90
  },
91
  {
92
  "name": "DurationPredictor",
 
93
  "package": "DurationPredictor.mlpackage",
94
  "compiled": "DurationPredictor.mlmodelc",
95
  "shape_policy": "fixed T=128",
96
  "package_size_bytes": 1892063,
97
- "compiled_size_bytes": 1896415,
98
  "package_files": [
99
  {
100
  "path": "DurationPredictor.mlpackage/Data/com.apple.CoreML/model.mlmodel",
@@ -113,6 +120,11 @@
113
  }
114
  ],
115
  "compiled_files": [
 
 
 
 
 
116
  {
117
  "path": "DurationPredictor.mlmodelc/analytics/coremldata.bin",
118
  "sha256": "b836b72b388c2bc009a13bcc495cd08278a1742b1a866cdaf7faab7d80786bcb",
@@ -137,11 +149,12 @@
137
  },
138
  {
139
  "name": "VectorEstimator",
 
140
  "package": "VectorEstimator.mlpackage",
141
  "compiled": "VectorEstimator.mlmodelc",
142
  "shape_policy": "RangeDim L,T 17..512",
143
  "package_size_bytes": 127977306,
144
- "compiled_size_bytes": 127993906,
145
  "package_files": [
146
  {
147
  "path": "VectorEstimator.mlpackage/Data/com.apple.CoreML/model.mlmodel",
@@ -160,6 +173,11 @@
160
  }
161
  ],
162
  "compiled_files": [
 
 
 
 
 
163
  {
164
  "path": "VectorEstimator.mlmodelc/analytics/coremldata.bin",
165
  "sha256": "1f43626d16b133a8b302f4f7a0e6d0f789422b22df135b7af11df25c385bebe0",
@@ -182,13 +200,62 @@
182
  }
183
  ]
184
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  {
186
  "name": "Vocoder",
 
187
  "package": "Vocoder.mlpackage",
188
  "compiled": "Vocoder.mlmodelc",
189
  "shape_policy": "RangeDim L_ttl 4..512",
190
  "package_size_bytes": 50743824,
191
- "compiled_size_bytes": 50748769,
192
  "package_files": [
193
  {
194
  "path": "Vocoder.mlpackage/Data/com.apple.CoreML/model.mlmodel",
@@ -207,6 +274,11 @@
207
  }
208
  ],
209
  "compiled_files": [
 
 
 
 
 
210
  {
211
  "path": "Vocoder.mlmodelc/analytics/coremldata.bin",
212
  "sha256": "4fe1b825137629a96dc58a1339bc4ece32041b755f99d638f21a153f2e7faed6",
 
43
  "modules": [
44
  {
45
  "name": "TextEncoder",
46
+ "precision": "fp16",
47
  "package": "TextEncoder.mlpackage",
48
  "compiled": "TextEncoder.mlmodelc",
49
  "shape_policy": "fixed T=128",
50
  "package_size_bytes": 18166801,
51
+ "compiled_size_bytes": 18179790,
52
  "package_files": [
53
  {
54
  "path": "TextEncoder.mlpackage/Data/com.apple.CoreML/model.mlmodel",
 
67
  }
68
  ],
69
  "compiled_files": [
70
+ {
71
+ "path": "TextEncoder.mlmodelc/.DS_Store",
72
+ "sha256": "cf4645f055711856ff0569b29315fb462026365385312c81e7db25c789e76e2b",
73
+ "size_bytes": 6148
74
+ },
75
  {
76
  "path": "TextEncoder.mlmodelc/analytics/coremldata.bin",
77
  "sha256": "b39c208b46f180dcbf4cfcd3b4437eeb93371fb77d6d636b99ca3ddff5327253",
 
96
  },
97
  {
98
  "name": "DurationPredictor",
99
+ "precision": "fp16",
100
  "package": "DurationPredictor.mlpackage",
101
  "compiled": "DurationPredictor.mlmodelc",
102
  "shape_policy": "fixed T=128",
103
  "package_size_bytes": 1892063,
104
+ "compiled_size_bytes": 1902563,
105
  "package_files": [
106
  {
107
  "path": "DurationPredictor.mlpackage/Data/com.apple.CoreML/model.mlmodel",
 
120
  }
121
  ],
122
  "compiled_files": [
123
+ {
124
+ "path": "DurationPredictor.mlmodelc/.DS_Store",
125
+ "sha256": "75c79ac936a1add17210b7cbec4962505cc557bc2f940e276b1236d35ff12dfa",
126
+ "size_bytes": 6148
127
+ },
128
  {
129
  "path": "DurationPredictor.mlmodelc/analytics/coremldata.bin",
130
  "sha256": "b836b72b388c2bc009a13bcc495cd08278a1742b1a866cdaf7faab7d80786bcb",
 
149
  },
150
  {
151
  "name": "VectorEstimator",
152
+ "precision": "fp16",
153
  "package": "VectorEstimator.mlpackage",
154
  "compiled": "VectorEstimator.mlmodelc",
155
  "shape_policy": "RangeDim L,T 17..512",
156
  "package_size_bytes": 127977306,
157
+ "compiled_size_bytes": 128000054,
158
  "package_files": [
159
  {
160
  "path": "VectorEstimator.mlpackage/Data/com.apple.CoreML/model.mlmodel",
 
173
  }
174
  ],
175
  "compiled_files": [
176
+ {
177
+ "path": "VectorEstimator.mlmodelc/.DS_Store",
178
+ "sha256": "d61692ef37d205b073915a28b0fcca8eb5360563286dc61d44acbd2af6b4a184",
179
+ "size_bytes": 6148
180
+ },
181
  {
182
  "path": "VectorEstimator.mlmodelc/analytics/coremldata.bin",
183
  "sha256": "1f43626d16b133a8b302f4f7a0e6d0f789422b22df135b7af11df25c385bebe0",
 
200
  }
201
  ]
202
  },
203
+ {
204
+ "name": "VectorEstimator_int8",
205
+ "precision": "int8 weight-only / fp16 activations (per-channel symmetric)",
206
+ "package": "VectorEstimator_int8.mlpackage",
207
+ "compiled": "VectorEstimator_int8.mlmodelc",
208
+ "shape_policy": "RangeDim L,T 17..512",
209
+ "package_size_bytes": 64530961,
210
+ "compiled_size_bytes": 64551709,
211
+ "package_files": [
212
+ {
213
+ "path": "VectorEstimator_int8.mlpackage/Data/com.apple.CoreML/model.mlmodel",
214
+ "sha256": "15d1760d354e6849c72ae2fd81805bf7121aab92567f12b0f230bf70268cc853",
215
+ "size_bytes": 345736
216
+ },
217
+ {
218
+ "path": "VectorEstimator_int8.mlpackage/Data/com.apple.CoreML/weights/weight.bin",
219
+ "sha256": "0e637c3a1725b5e1b3df09cc162d1822e8fe80fa92ba01c180dde2ca29c12523",
220
+ "size_bytes": 64184608
221
+ },
222
+ {
223
+ "path": "VectorEstimator_int8.mlpackage/Manifest.json",
224
+ "sha256": "32b9a6e8860f4fe2a5f873042621b59aac94af0d1a62e184514b40a16d258897",
225
+ "size_bytes": 617
226
+ }
227
+ ],
228
+ "compiled_files": [
229
+ {
230
+ "path": "VectorEstimator_int8.mlmodelc/analytics/coremldata.bin",
231
+ "sha256": "982da75910aca2f5f0e2f813bf7db9ec201fbf4c71668d0f0a99eb2f9da983d0",
232
+ "size_bytes": 243
233
+ },
234
+ {
235
+ "path": "VectorEstimator_int8.mlmodelc/coremldata.bin",
236
+ "sha256": "e93e9a59f6d4035aa07cf21ff06e160eb1bbcdad9b48d2ac413d11f3c2c26949",
237
+ "size_bytes": 633
238
+ },
239
+ {
240
+ "path": "VectorEstimator_int8.mlmodelc/model.mil",
241
+ "sha256": "4286e9f3ea253bdbca50e246d5568e9f89e130e05b7c8243c54d70f3611e5440",
242
+ "size_bytes": 366225
243
+ },
244
+ {
245
+ "path": "VectorEstimator_int8.mlmodelc/weights/weight.bin",
246
+ "sha256": "0e637c3a1725b5e1b3df09cc162d1822e8fe80fa92ba01c180dde2ca29c12523",
247
+ "size_bytes": 64184608
248
+ }
249
+ ]
250
+ },
251
  {
252
  "name": "Vocoder",
253
+ "precision": "fp16",
254
  "package": "Vocoder.mlpackage",
255
  "compiled": "Vocoder.mlmodelc",
256
  "shape_policy": "RangeDim L_ttl 4..512",
257
  "package_size_bytes": 50743824,
258
+ "compiled_size_bytes": 50754917,
259
  "package_files": [
260
  {
261
  "path": "Vocoder.mlpackage/Data/com.apple.CoreML/model.mlmodel",
 
274
  }
275
  ],
276
  "compiled_files": [
277
+ {
278
+ "path": "Vocoder.mlmodelc/.DS_Store",
279
+ "sha256": "aad4800457fdb1567500acd0f9eb203b27c71d29189891095b9d371f8c9c58b5",
280
+ "size_bytes": 6148
281
+ },
282
  {
283
  "path": "Vocoder.mlmodelc/analytics/coremldata.bin",
284
  "sha256": "4fe1b825137629a96dc58a1339bc4ece32041b755f99d638f21a153f2e7faed6",