Upload 9 files

Browse files

Files changed (9) hide show

README.md +12 -7
VectorEstimator_int8.mlmodelc/analytics/coremldata.bin +3 -0
VectorEstimator_int8.mlmodelc/coremldata.bin +3 -0
VectorEstimator_int8.mlmodelc/model.mil +0 -0
VectorEstimator_int8.mlmodelc/weights/weight.bin +3 -0
VectorEstimator_int8.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
VectorEstimator_int8.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
VectorEstimator_int8.mlpackage/Manifest.json +18 -0
manifest.json +76 -4

README.md CHANGED Viewed

@@ -121,12 +121,13 @@ Slovak, Slovenian, Swedish, Turkish, Ukrainian, Vietnamese.
 ## Performance (Apple M2, macOS 26.5, FP16)
-| Module             | Size  | Predict | Compute placement |
-| ------------------ | ----- | ------- | ----------------- |
-| duration_predictor | 1.8 MB| 0.82 ms | CPU (tiny)        |
-| text_encoder       | 17 MB | 2.15 ms | 62 % ANE          |
-| vocoder            | 48 MB | 1.17 ms | 100 % ANE         |
-| vector_estimator   | 122 MB| 9.29 ms | CPU + GPU (see notes) |
 End‑to‑end on M2: ≈ 0.74 s to synthesize 6.32 s of audio for a single English
 sentence (RTFx ≈ 8.5×), 8 denoising steps. Output verified against
@@ -145,7 +146,8 @@ shipped — use `.mlmodelc` to skip the on‑device compile step on first load.
 - `TextEncoder.mlpackage` / `TextEncoder.mlmodelc`               — fixed `T=128` text input.
 - `DurationPredictor.mlpackage` / `DurationPredictor.mlmodelc`   — fixed `T=128` text input.
-- `VectorEstimator.mlpackage` / `VectorEstimator.mlmodelc`       — `latent.L` and `text.T` as RangeDim(17..512).
 - `Vocoder.mlpackage` / `Vocoder.mlmodelc`                       — `latent.L_ttl` as RangeDim(4..512).
 - `tts.json`                     — token / text frontend configuration.
 - `unicode_indexer.json`         — Unicode → token id mapping (multilingual frontend).
@@ -175,6 +177,9 @@ pip install -r requirements.txt
 python infer.py "Hello, world." --voice-style voice_styles/M1.json -o hello.wav
 python infer.py "Bonjour le monde." --lang fr --voice-style voice_styles/M1.json -o fr.wav
 # Optional: pick a compute unit explicitly.
 python infer.py "Test" --compute-units CPU_AND_NE -o ne.wav
 ```

 ## Performance (Apple M2, macOS 26.5, FP16)
+| Module                  | Size   | Predict | Compute placement |
+| ----------------------- | ------ | ------- | ----------------- |
+| duration_predictor      | 1.8 MB | 0.82 ms | CPU (tiny)        |
+| text_encoder            | 17 MB  | 2.15 ms | 62 % ANE          |
+| vocoder                 | 48 MB  | 1.17 ms | 100 % ANE         |
+| vector_estimator (fp16) | 122 MB | 9.29 ms | CPU + GPU (see notes) |
+| vector_estimator (int8) | 62 MB  | ~same as fp16 | int8 weight-only / fp16 activations; ~10 % lower peak RSS, RMSE ≈ 0.016 vs FP16 |
 End‑to‑end on M2: ≈ 0.74 s to synthesize 6.32 s of audio for a single English
 sentence (RTFx ≈ 8.5×), 8 denoising steps. Output verified against
 - `TextEncoder.mlpackage` / `TextEncoder.mlmodelc`               — fixed `T=128` text input.
 - `DurationPredictor.mlpackage` / `DurationPredictor.mlmodelc`   — fixed `T=128` text input.
+- `VectorEstimator.mlpackage` / `VectorEstimator.mlmodelc`       — `latent.L` and `text.T` as RangeDim(17..512), FP16 weights (122 MB).
+- `VectorEstimator_int8.mlpackage` / `VectorEstimator_int8.mlmodelc` — same model, **int8 weight-only** (per-channel symmetric) + FP16 activations (62 MB; ~10 % lower peak RSS, RMSE ≈ 0.016 vs FP16).
 - `Vocoder.mlpackage` / `Vocoder.mlmodelc`                       — `latent.L_ttl` as RangeDim(4..512).
 - `tts.json`                     — token / text frontend configuration.
 - `unicode_indexer.json`         — Unicode → token id mapping (multilingual frontend).
 python infer.py "Hello, world." --voice-style voice_styles/M1.json -o hello.wav
 python infer.py "Bonjour le monde." --lang fr --voice-style voice_styles/M1.json -o fr.wav
+# Use the int8-quantized VectorEstimator (62 MB instead of 122 MB).
+python infer.py "Hello, int8 build." --vector-estimator VectorEstimator_int8.mlpackage -o int8.wav
 # Optional: pick a compute unit explicitly.
 python infer.py "Test" --compute-units CPU_AND_NE -o ne.wav
 ```

VectorEstimator_int8.mlmodelc/analytics/coremldata.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:982da75910aca2f5f0e2f813bf7db9ec201fbf4c71668d0f0a99eb2f9da983d0
+size 243

VectorEstimator_int8.mlmodelc/coremldata.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e93e9a59f6d4035aa07cf21ff06e160eb1bbcdad9b48d2ac413d11f3c2c26949
+size 633

VectorEstimator_int8.mlmodelc/model.mil ADDED Viewed

The diff for this file is too large to render. See raw diff

VectorEstimator_int8.mlmodelc/weights/weight.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e637c3a1725b5e1b3df09cc162d1822e8fe80fa92ba01c180dde2ca29c12523
+size 64184608

VectorEstimator_int8.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15d1760d354e6849c72ae2fd81805bf7121aab92567f12b0f230bf70268cc853
+size 345736

VectorEstimator_int8.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e637c3a1725b5e1b3df09cc162d1822e8fe80fa92ba01c180dde2ca29c12523
+size 64184608

VectorEstimator_int8.mlpackage/Manifest.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+    "fileFormatVersion": "1.0.0",
+    "itemInfoEntries": {
+        "4F2FAD81-6BF2-4BFC-96C0-D8020F7800F8": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Specification",
+            "name": "model.mlmodel",
+            "path": "com.apple.CoreML/model.mlmodel"
+        },
+        "56E32995-F792-4C6D-BE97-7345BD2B46DB": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Weights",
+            "name": "weights",
+            "path": "com.apple.CoreML/weights"
+        }
+    },
+    "rootModelIdentifier": "4F2FAD81-6BF2-4BFC-96C0-D8020F7800F8"
+}

manifest.json CHANGED Viewed

@@ -43,11 +43,12 @@
   "modules": [
     {
       "name": "TextEncoder",
       "package": "TextEncoder.mlpackage",
       "compiled": "TextEncoder.mlmodelc",
       "shape_policy": "fixed T=128",
       "package_size_bytes": 18166801,
-      "compiled_size_bytes": 18173642,
       "package_files": [
         {
           "path": "TextEncoder.mlpackage/Data/com.apple.CoreML/model.mlmodel",
@@ -66,6 +67,11 @@
         }
       ],
       "compiled_files": [
         {
           "path": "TextEncoder.mlmodelc/analytics/coremldata.bin",
           "sha256": "b39c208b46f180dcbf4cfcd3b4437eeb93371fb77d6d636b99ca3ddff5327253",
@@ -90,11 +96,12 @@
     },
     {
       "name": "DurationPredictor",
       "package": "DurationPredictor.mlpackage",
       "compiled": "DurationPredictor.mlmodelc",
       "shape_policy": "fixed T=128",
       "package_size_bytes": 1892063,
-      "compiled_size_bytes": 1896415,
       "package_files": [
         {
           "path": "DurationPredictor.mlpackage/Data/com.apple.CoreML/model.mlmodel",
@@ -113,6 +120,11 @@
         }
       ],
       "compiled_files": [
         {
           "path": "DurationPredictor.mlmodelc/analytics/coremldata.bin",
           "sha256": "b836b72b388c2bc009a13bcc495cd08278a1742b1a866cdaf7faab7d80786bcb",
@@ -137,11 +149,12 @@
     },
     {
       "name": "VectorEstimator",
       "package": "VectorEstimator.mlpackage",
       "compiled": "VectorEstimator.mlmodelc",
       "shape_policy": "RangeDim L,T 17..512",
       "package_size_bytes": 127977306,
-      "compiled_size_bytes": 127993906,
       "package_files": [
         {
           "path": "VectorEstimator.mlpackage/Data/com.apple.CoreML/model.mlmodel",
@@ -160,6 +173,11 @@
         }
       ],
       "compiled_files": [
         {
           "path": "VectorEstimator.mlmodelc/analytics/coremldata.bin",
           "sha256": "1f43626d16b133a8b302f4f7a0e6d0f789422b22df135b7af11df25c385bebe0",
@@ -182,13 +200,62 @@
         }
       ]
     },
     {
       "name": "Vocoder",
       "package": "Vocoder.mlpackage",
       "compiled": "Vocoder.mlmodelc",
       "shape_policy": "RangeDim L_ttl 4..512",
       "package_size_bytes": 50743824,
-      "compiled_size_bytes": 50748769,
       "package_files": [
         {
           "path": "Vocoder.mlpackage/Data/com.apple.CoreML/model.mlmodel",
@@ -207,6 +274,11 @@
         }
       ],
       "compiled_files": [
         {
           "path": "Vocoder.mlmodelc/analytics/coremldata.bin",
           "sha256": "4fe1b825137629a96dc58a1339bc4ece32041b755f99d638f21a153f2e7faed6",

   "modules": [
     {
       "name": "TextEncoder",
+      "precision": "fp16",
       "package": "TextEncoder.mlpackage",
       "compiled": "TextEncoder.mlmodelc",
       "shape_policy": "fixed T=128",
       "package_size_bytes": 18166801,
+      "compiled_size_bytes": 18179790,
       "package_files": [
         {
           "path": "TextEncoder.mlpackage/Data/com.apple.CoreML/model.mlmodel",
         }
       ],
       "compiled_files": [
+        {
+          "path": "TextEncoder.mlmodelc/.DS_Store",
+          "sha256": "cf4645f055711856ff0569b29315fb462026365385312c81e7db25c789e76e2b",
+          "size_bytes": 6148
+        },
         {
           "path": "TextEncoder.mlmodelc/analytics/coremldata.bin",
           "sha256": "b39c208b46f180dcbf4cfcd3b4437eeb93371fb77d6d636b99ca3ddff5327253",
     },
     {
       "name": "DurationPredictor",
+      "precision": "fp16",
       "package": "DurationPredictor.mlpackage",
       "compiled": "DurationPredictor.mlmodelc",
       "shape_policy": "fixed T=128",
       "package_size_bytes": 1892063,
+      "compiled_size_bytes": 1902563,
       "package_files": [
         {
           "path": "DurationPredictor.mlpackage/Data/com.apple.CoreML/model.mlmodel",
         }
       ],
       "compiled_files": [
+        {
+          "path": "DurationPredictor.mlmodelc/.DS_Store",
+          "sha256": "75c79ac936a1add17210b7cbec4962505cc557bc2f940e276b1236d35ff12dfa",
+          "size_bytes": 6148
+        },
         {
           "path": "DurationPredictor.mlmodelc/analytics/coremldata.bin",
           "sha256": "b836b72b388c2bc009a13bcc495cd08278a1742b1a866cdaf7faab7d80786bcb",
     },
     {
       "name": "VectorEstimator",
+      "precision": "fp16",
       "package": "VectorEstimator.mlpackage",
       "compiled": "VectorEstimator.mlmodelc",
       "shape_policy": "RangeDim L,T 17..512",
       "package_size_bytes": 127977306,
+      "compiled_size_bytes": 128000054,
       "package_files": [
         {
           "path": "VectorEstimator.mlpackage/Data/com.apple.CoreML/model.mlmodel",
         }
       ],
       "compiled_files": [
+        {
+          "path": "VectorEstimator.mlmodelc/.DS_Store",
+          "sha256": "d61692ef37d205b073915a28b0fcca8eb5360563286dc61d44acbd2af6b4a184",
+          "size_bytes": 6148
+        },
         {
           "path": "VectorEstimator.mlmodelc/analytics/coremldata.bin",
           "sha256": "1f43626d16b133a8b302f4f7a0e6d0f789422b22df135b7af11df25c385bebe0",
         }
       ]
     },
+    {
+      "name": "VectorEstimator_int8",
+      "precision": "int8 weight-only / fp16 activations (per-channel symmetric)",
+      "package": "VectorEstimator_int8.mlpackage",
+      "compiled": "VectorEstimator_int8.mlmodelc",
+      "shape_policy": "RangeDim L,T 17..512",
+      "package_size_bytes": 64530961,
+      "compiled_size_bytes": 64551709,
+      "package_files": [
+        {
+          "path": "VectorEstimator_int8.mlpackage/Data/com.apple.CoreML/model.mlmodel",
+          "sha256": "15d1760d354e6849c72ae2fd81805bf7121aab92567f12b0f230bf70268cc853",
+          "size_bytes": 345736
+        },
+        {
+          "path": "VectorEstimator_int8.mlpackage/Data/com.apple.CoreML/weights/weight.bin",
+          "sha256": "0e637c3a1725b5e1b3df09cc162d1822e8fe80fa92ba01c180dde2ca29c12523",
+          "size_bytes": 64184608
+        },
+        {
+          "path": "VectorEstimator_int8.mlpackage/Manifest.json",
+          "sha256": "32b9a6e8860f4fe2a5f873042621b59aac94af0d1a62e184514b40a16d258897",
+          "size_bytes": 617
+        }
+      ],
+      "compiled_files": [
+        {
+          "path": "VectorEstimator_int8.mlmodelc/analytics/coremldata.bin",
+          "sha256": "982da75910aca2f5f0e2f813bf7db9ec201fbf4c71668d0f0a99eb2f9da983d0",
+          "size_bytes": 243
+        },
+        {
+          "path": "VectorEstimator_int8.mlmodelc/coremldata.bin",
+          "sha256": "e93e9a59f6d4035aa07cf21ff06e160eb1bbcdad9b48d2ac413d11f3c2c26949",
+          "size_bytes": 633
+        },
+        {
+          "path": "VectorEstimator_int8.mlmodelc/model.mil",
+          "sha256": "4286e9f3ea253bdbca50e246d5568e9f89e130e05b7c8243c54d70f3611e5440",
+          "size_bytes": 366225
+        },
+        {
+          "path": "VectorEstimator_int8.mlmodelc/weights/weight.bin",
+          "sha256": "0e637c3a1725b5e1b3df09cc162d1822e8fe80fa92ba01c180dde2ca29c12523",
+          "size_bytes": 64184608
+        }
+      ]
+    },
     {
       "name": "Vocoder",
+      "precision": "fp16",
       "package": "Vocoder.mlpackage",
       "compiled": "Vocoder.mlmodelc",
       "shape_policy": "RangeDim L_ttl 4..512",
       "package_size_bytes": 50743824,
+      "compiled_size_bytes": 50754917,
       "package_files": [
         {
           "path": "Vocoder.mlpackage/Data/com.apple.CoreML/model.mlmodel",
         }
       ],
       "compiled_files": [
+        {
+          "path": "Vocoder.mlmodelc/.DS_Store",
+          "sha256": "aad4800457fdb1567500acd0f9eb203b27c71d29189891095b9d371f8c9c58b5",
+          "size_bytes": 6148
+        },
         {
           "path": "Vocoder.mlmodelc/analytics/coremldata.bin",
           "sha256": "4fe1b825137629a96dc58a1339bc4ece32041b755f99d638f21a153f2e7faed6",