alexwengg committed on
Commit
5d80477
·
verified ·
1 Parent(s): f962799

Upload 4 files

Browse files
Files changed (3) hide show
  1. README.md +34 -3
  2. infer.py +213 -0
  3. requirements.txt +3 -0
README.md CHANGED
@@ -151,13 +151,44 @@ shipped — use `.mlmodelc` to skip the on‑device compile step on first load.
151
  - `unicode_indexer.json` — Unicode → token id mapping (multilingual frontend).
152
  - `voice_styles/M1.json` — example voice style embedding (single male reference).
153
  - `manifest.json` — file inventory (sha256 + sizes) for both `.mlpackage` and `.mlmodelc`.
 
 
154
 
155
  ## Usage
156
 
157
- For quickest integration, use the FluidAudio Swift framework which handles
158
- model loading, text frontend, and the diffusion / vocoder loop.
159
 
160
- ### Swift (FluidAudio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
  ```swift
163
  import AVFoundation
 
151
  - `unicode_indexer.json` — Unicode → token id mapping (multilingual frontend).
152
  - `voice_styles/M1.json` — example voice style embedding (single male reference).
153
  - `manifest.json` — file inventory (sha256 + sizes) for both `.mlpackage` and `.mlmodelc`.
154
+ - `infer.py` — minimal self-contained Python demo (loads the `.mlpackage` modules directly via `coremltools`).
155
+ - `requirements.txt` — Python deps for `infer.py` (`coremltools`, `numpy`, `soundfile`).
156
 
157
  ## Usage
158
 
159
+ ### Quick test (Python)
 
160
 
161
+ For the curious / for sanity checking, this repo ships a small self‑contained
162
+ script `infer.py` that loads all four modules directly via `coremltools` and
163
+ writes a 44.1 kHz WAV. No external repo clone required.
164
+
165
+ ```bash
166
+ # 1. Download the repo (e.g. via huggingface_hub or `git lfs clone`).
167
+ git lfs clone https://huggingface.co/FluidInference/supertonic-3-coreml
168
+ cd supertonic-3-coreml
169
+
170
+ # 2. Install the 3 deps (macOS, Python 3.11+ recommended).
171
+ python -m venv .venv && source .venv/bin/activate
172
+ pip install -r requirements.txt
173
+
174
+ # 3. Synthesize.
175
+ python infer.py "Hello, world." --voice-style voice_styles/M1.json -o hello.wav
176
+ python infer.py "Bonjour le monde." --lang fr --voice-style voice_styles/M1.json -o fr.wav
177
+
178
+ # Optional: pick a compute unit explicitly.
179
+ python infer.py "Test" --compute-units CPU_AND_NE -o ne.wav
180
+ ```
181
+
182
+ The Python script loads `.mlpackage` (which is what `coremltools` accepts);
183
+ the `.mlmodelc` bundles are for direct Swift / Objective‑C use
184
+ (`MLModel(contentsOf:)`) where they skip the on‑device compile step.
185
+
186
+ ### Production (Swift / FluidAudio)
187
+
188
+ For production use, the FluidAudio Swift framework handles model loading,
189
+ text frontend, batching, chunking, and the diffusion / vocoder loop.
190
+
191
+ #### Swift (FluidAudio)
192
 
193
  ```swift
194
  import AVFoundation
infer.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Minimal self-contained Supertonic-3 CoreML inference script.
2
+
3
+ Loads the four .mlpackage modules from this directory, tokenizes text via
4
+ unicode_indexer.json, runs the 8-step flow-matching loop, and writes a 44.1 kHz
5
+ WAV. No external dependencies beyond `coremltools`, `numpy`, and `soundfile`.
6
+
7
+ Example
8
+ -------
9
+ python infer.py "Hello, world." --voice-style voice_styles/M1.json -o hello.wav
10
+ python infer.py "Bonjour le monde." --lang fr --voice-style voice_styles/M1.json -o fr.wav
11
+
12
+ For the full driver (text chunking, batch synthesis, multi-utt) see the
13
+ mobius conversion repo: github.com/FluidInference/mobius
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import argparse
19
+ import json
20
+ import re
21
+ import time
22
+ from pathlib import Path
23
+ from typing import Tuple
24
+ from unicodedata import normalize
25
+
26
+ import coremltools as ct
27
+ import numpy as np
28
+
29
+
30
+ # Languages supported by Supertonic-3 v1.7.3.
31
+ AVAILABLE_LANGS = [
32
+ "en", "ko", "ja", "ar", "bg", "cs", "da", "de", "el", "es",
33
+ "et", "fi", "fr", "hi", "hr", "hu", "id", "it", "lt", "lv",
34
+ "nl", "pl", "pt", "ro", "ru", "sk", "sl", "sv", "tr", "uk",
35
+ "vi", "na",
36
+ ]
37
+
38
+ # CoreML shape pins (must match conversion settings; see mobius trials.md).
39
+ TEXT_T_FIXED = 128 # text_encoder / duration_predictor pinned T
40
+ VEC_EST_L_MIN = 17 # vector_estimator latent/text RangeDim lower bound
41
+
42
+
43
+ _EMOJI_RE = re.compile(
44
+ "[\U0001f600-\U0001f64f\U0001f300-\U0001f5ff\U0001f680-\U0001f6ff"
45
+ "\U0001f700-\U0001f77f\U0001f780-\U0001f7ff\U0001f800-\U0001f8ff"
46
+ "\U0001f900-\U0001f9ff\U0001fa00-\U0001fa6f\U0001fa70-\U0001faff"
47
+ "\u2600-\u26ff\u2700-\u27bf\U0001f1e6-\U0001f1ff]+",
48
+ flags=re.UNICODE,
49
+ )
50
+ _CHAR_REPL = {
51
+ "–": "-", "‑": "-", "β€”": "-", "_": " ",
52
+ "\u201c": '"', "\u201d": '"', "\u2018": "'", "\u2019": "'",
53
+ "Β΄": "'", "`": "'",
54
+ "[": " ", "]": " ", "|": " ", "/": " ", "#": " ", "β†’": " ", "←": " ",
55
+ }
56
+
57
+
58
def preprocess_text(text: str, lang: str) -> str:
    """Clean ``text`` and wrap it in ``<lang>`` / ``</lang>`` tags.

    Steps, in order: NFKD normalization, emoji removal, per-character
    replacements from ``_CHAR_REPL``, whitespace collapsing, and appending a
    trailing period when the text does not already end in punctuation or a
    closing quote/bracket.

    Raises:
        ValueError: if ``lang`` is not listed in ``AVAILABLE_LANGS``.
    """
    cleaned = _EMOJI_RE.sub("", normalize("NFKD", text))
    for needle, substitute in _CHAR_REPL.items():
        cleaned = cleaned.replace(needle, substitute)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    # Make sure the utterance ends with some terminator.
    ends_with_terminator = re.search(r"[.!?;:,'\"')\]}…。」』】〉》›»]$", cleaned)
    if ends_with_terminator is None:
        cleaned = cleaned + "."

    if lang not in AVAILABLE_LANGS:
        raise ValueError(f"Unsupported lang '{lang}'. Available: {AVAILABLE_LANGS}")
    return f"<{lang}>{cleaned}</{lang}>"
69
+
70
+
71
def tokenize(text: str, lang: str, indexer: list) -> Tuple[np.ndarray, np.ndarray]:
    """Convert text to (text_ids[1, T], text_mask[1, 1, T]) padded to TEXT_T_FIXED.

    The text is first run through ``preprocess_text``; each character's code
    point is then looked up in ``indexer``.  Characters beyond ``TEXT_T_FIXED``
    are dropped, and unused trailing positions stay zero in both arrays.
    """
    tagged = preprocess_text(text, lang)
    kept = tagged[:TEXT_T_FIXED]  # hard cut at the pinned sequence length
    n_tokens = len(kept)

    ids = np.zeros((1, TEXT_T_FIXED), dtype=np.int32)
    ids[0, :n_tokens] = [indexer[ord(ch)] for ch in kept]

    mask = np.zeros((1, 1, TEXT_T_FIXED), dtype=np.float32)
    mask[0, 0, :n_tokens] = 1.0
    return ids, mask
81
+
82
+
83
def load_voice_style(path: Path) -> Tuple[np.ndarray, np.ndarray]:
    """Read a voice-style JSON file and return ``(style_ttl, style_dp)``.

    Each entry in the file carries a ``dims`` list and a ``data`` payload; the
    payload is converted to float32 and reshaped to ``(1, dims[1], dims[2])``.
    """
    with open(path) as fh:
        style = json.load(fh)

    ttl_entry = style["style_ttl"]
    dp_entry = style["style_dp"]
    ttl_dims = ttl_entry["dims"]
    dp_dims = dp_entry["dims"]

    ttl = np.array(ttl_entry["data"], dtype=np.float32)
    ttl = ttl.reshape(1, ttl_dims[1], ttl_dims[2])
    dp = np.array(dp_entry["data"], dtype=np.float32)
    dp = dp.reshape(1, dp_dims[1], dp_dims[2])
    return ttl, dp
91
+
92
+
93
def sample_noisy_latent(
    duration_sec: float, sample_rate: int, base_chunk_size: int,
    chunk_compress_factor: int, latent_dim: int, rng: np.random.Generator,
) -> Tuple[np.ndarray, np.ndarray]:
    """Draw the initial Gaussian latent for an utterance of ``duration_sec``.

    The latent length is the number of samples divided by
    ``base_chunk_size * chunk_compress_factor``, rounded up.

    Returns:
        ``(noisy, latent_mask)`` as float32 arrays of shape
        ``(1, latent_dim * chunk_compress_factor, L)`` and ``(1, 1, L)``; the
        mask is all ones.
    """
    samples_per_frame = base_chunk_size * chunk_compress_factor
    n_samples = int(duration_sec * sample_rate)
    n_frames = (n_samples + samples_per_frame - 1) // samples_per_frame  # ceil division

    channels = latent_dim * chunk_compress_factor
    noise = rng.standard_normal((1, channels, n_frames)).astype(np.float32)
    mask = np.ones((1, 1, n_frames), dtype=np.float32)
    return noise * mask, mask
104
+
105
+
106
def pad_last(arr: np.ndarray, target: int) -> np.ndarray:
    """Zero-pad the last axis of ``arr`` up to ``target`` entries.

    Arrays whose last axis is already at least ``target`` long are returned
    unchanged (the same object, not a copy).
    """
    shortfall = target - arr.shape[-1]
    if shortfall <= 0:
        return arr
    # No padding on the leading axes, `shortfall` zeros appended to the last.
    widths = [(0, 0)] * (arr.ndim - 1) + [(0, shortfall)]
    return np.pad(arr, widths, constant_values=0.0)
112
+
113
+
114
class Supertonic3TTS:
    """Run the four Supertonic-3 CoreML modules end to end.

    On construction this reads ``tts.json`` and ``unicode_indexer.json`` from
    ``model_dir`` and loads the ``DurationPredictor``, ``TextEncoder``,
    ``VectorEstimator`` and ``Vocoder`` ``.mlpackage`` bundles found there.
    """

    def __init__(self, model_dir: Path,
                 compute_units: ct.ComputeUnit = ct.ComputeUnit.CPU_AND_NE,
                 seed: int | None = None):
        """Load configuration and models.

        Args:
            model_dir: Directory holding the JSON configs and ``.mlpackage`` bundles.
            compute_units: CoreML compute-unit selection passed to every model.
            seed: Optional seed for the noise generator; ``None`` (the default)
                keeps the original non-deterministic behavior.

        Raises:
            FileNotFoundError: if a config file or ``.mlpackage`` is missing.
        """
        model_dir = Path(model_dir)  # tolerate plain string paths

        with open(model_dir / "tts.json", encoding="utf-8") as f:
            cfg = json.load(f)
        self.sample_rate = int(cfg["ae"]["sample_rate"])
        self.base_chunk_size = int(cfg["ae"]["base_chunk_size"])
        self.ccf = int(cfg["ttl"]["chunk_compress_factor"])
        self.ldim = int(cfg["ttl"]["latent_dim"])

        with open(model_dir / "unicode_indexer.json", encoding="utf-8") as f:
            self.indexer = json.load(f)

        def _load(name: str) -> ct.models.MLModel:
            # coremltools loads .mlpackage; .mlmodelc is for direct Swift/Obj-C use.
            package = model_dir / f"{name}.mlpackage"
            if not package.exists():
                raise FileNotFoundError(
                    f"Missing model package: {package} "
                    "(was the repository fully downloaded, including LFS files?)"
                )
            return ct.models.MLModel(str(package), compute_units=compute_units)

        print(f"Loading models from {model_dir} (compute_units={compute_units.name})")
        self.dp = _load("DurationPredictor")
        self.te = _load("TextEncoder")
        self.ve = _load("VectorEstimator")
        self.vc = _load("Vocoder")
        self.rng = np.random.default_rng(seed)

    def synthesize(self, text: str, voice_style_path: Path, lang: str = "en",
                   total_step: int = 8, speed: float = 1.05) -> Tuple[np.ndarray, float]:
        """Synthesize ``text`` and return ``(wav, duration_sec)``.

        Args:
            text: Raw input text (preprocessed and tokenized internally).
            voice_style_path: Path to a voice-style JSON file.
            lang: Language code; must be one of ``AVAILABLE_LANGS``.
            total_step: Number of VectorEstimator iterations (must be >= 1).
            speed: Speaking-rate factor; the predicted duration is divided by it
                (must be > 0).

        Returns:
            A 1-D float32 waveform at ``self.sample_rate`` and the (speed-adjusted)
            duration in seconds.

        Raises:
            ValueError: for an unsupported ``lang``, ``total_step < 1`` or
                ``speed <= 0``.
        """
        # Without at least one step the vocoder would be fed raw noise, and a
        # non-positive speed yields a zero-division or a negative duration.
        if total_step < 1:
            raise ValueError(f"total_step must be >= 1, got {total_step}")
        if speed <= 0:
            raise ValueError(f"speed must be > 0, got {speed}")

        ttl, dp_style = load_voice_style(voice_style_path)
        text_ids, text_mask = tokenize(text, lang, self.indexer)

        # 1. Duration.
        dp_out = self.dp.predict({
            "text_ids": text_ids, "style_dp": dp_style, "text_mask": text_mask,
        })
        # Flatten first so this works whether the output is (1,) or (1, 1).
        predicted = np.asarray(dp_out["duration"], dtype=np.float32).reshape(-1)
        duration = float(predicted[0]) / speed

        # 2. Text embedding.
        te_out = self.te.predict({
            "text_ids": text_ids, "style_ttl": ttl, "text_mask": text_mask,
        })
        text_emb = np.asarray(te_out["text_emb"], dtype=np.float32)

        # 3. Noisy latent, padded up to the estimator's minimum length.
        noisy, latent_mask = sample_noisy_latent(
            duration, self.sample_rate, self.base_chunk_size, self.ccf, self.ldim, self.rng,
        )
        L_true = noisy.shape[-1]
        L_use = max(L_true, VEC_EST_L_MIN)
        noisy = pad_last(noisy, L_use)
        latent_mask = pad_last(latent_mask, L_use)

        # 4. Flow-matching loop: each step's output is fed back as the next input.
        xt = noisy
        total_t = np.array([float(total_step)], dtype=np.float32)
        for step in range(total_step):
            cur_t = np.array([float(step)], dtype=np.float32)
            ve_out = self.ve.predict({
                "noisy_latent": xt, "text_emb": text_emb, "style_ttl": ttl,
                "latent_mask": latent_mask, "text_mask": text_mask,
                "current_step": cur_t, "total_step": total_t,
            })
            xt = np.asarray(ve_out["denoised_latent"], dtype=np.float32)

        # 5. Vocoder -> waveform at self.sample_rate.
        vc_out = self.vc.predict({"latent": xt})
        wav = np.asarray(vc_out["wav"], dtype=np.float32)
        wav = wav[:, : (self.base_chunk_size * self.ccf) * L_true]  # drop samples from padded frames
        wav = wav[0, : int(self.sample_rate * duration)]            # trim to the predicted duration
        return wav, duration
184
+
185
+
186
def main() -> None:
    """Command-line entry point: parse arguments, synthesize, write a WAV file.

    Prints the output path, the audio duration, the wall-clock synthesis time
    and the resulting real-time factor.
    """
    ap = argparse.ArgumentParser(description="Supertonic-3 CoreML TTS \u2014 minimal demo")
    ap.add_argument("text", type=str, help="Text to synthesize")
    ap.add_argument("--voice-style", type=Path, default=Path("voice_styles/M1.json"))
    # Reject unknown languages at parse time, before the models are loaded.
    ap.add_argument("--lang", type=str, default="en", choices=AVAILABLE_LANGS,
                    metavar="LANG")
    ap.add_argument("--model-dir", type=Path, default=Path("."))
    ap.add_argument("-o", "--output", type=Path, default=Path("output.wav"))
    ap.add_argument("--total-step", type=int, default=8)
    ap.add_argument("--speed", type=float, default=1.05)
    ap.add_argument("--compute-units", type=str, default="CPU_AND_NE",
                    choices=["CPU_ONLY", "CPU_AND_GPU", "CPU_AND_NE", "ALL"])
    args = ap.parse_args()

    # Imported lazily so a missing package produces a short, actionable message.
    try:
        import soundfile as sf
    except ImportError as e:
        raise SystemExit("install soundfile: pip install soundfile") from e

    tts = Supertonic3TTS(args.model_dir, getattr(ct.ComputeUnit, args.compute_units))
    # perf_counter is monotonic, so the interval is immune to wall-clock adjustments.
    t0 = time.perf_counter()
    wav, dur = tts.synthesize(args.text, args.voice_style, args.lang, args.total_step, args.speed)
    elapsed = time.perf_counter() - t0
    sf.write(args.output, wav, tts.sample_rate)
    print(f"wrote {args.output} ({dur:.2f}s audio in {elapsed:.2f}s, RTFx {dur / elapsed:.1f}x)")


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ coremltools>=8.0
2
+ numpy>=1.24
3
+ soundfile>=0.12