FluidInference
/

supertonic-3-coreml

@@ -151,13 +151,44 @@ shipped — use `.mlmodelc` to skip the on‑device compile step on first load.
 - `unicode_indexer.json`         — Unicode → token id mapping (multilingual frontend).
 - `voice_styles/M1.json`         — example voice style embedding (single male reference).
 - `manifest.json`                — file inventory (sha256 + sizes) for both `.mlpackage` and `.mlmodelc`.
 ## Usage
-For quickest integration, use the FluidAudio Swift framework which handles
-model loading, text frontend, and the diffusion / vocoder loop.
-### Swift (FluidAudio)
 ```swift
 import AVFoundation

 - `unicode_indexer.json`         — Unicode → token id mapping (multilingual frontend).
 - `voice_styles/M1.json`         — example voice style embedding (single male reference).
 - `manifest.json`                — file inventory (sha256 + sizes) for both `.mlpackage` and `.mlmodelc`.
+- `infer.py`                     — minimal self-contained Python demo (loads the `.mlpackage` modules directly via `coremltools`).
+- `requirements.txt`             — Python deps for `infer.py` (`coremltools`, `numpy`, `soundfile`).
 ## Usage
+### Quick test (Python)
+For a quick sanity check, this repo ships a small self‑contained script,
+`infer.py`, that loads all four modules directly via `coremltools` and
+writes a 44.1 kHz WAV. No external repo clone is required.
+```bash
+# 1. Download the repo (e.g. via huggingface_hub, or `git clone` with Git LFS installed).
+git lfs install
+git clone https://huggingface.co/FluidInference/supertonic-3-coreml
+cd supertonic-3-coreml
+# 2. Install the 3 deps (macOS, Python 3.11+ recommended).
+python -m venv .venv && source .venv/bin/activate
+pip install -r requirements.txt
+# 3. Synthesize.
+python infer.py "Hello, world." --voice-style voice_styles/M1.json -o hello.wav
+python infer.py "Bonjour le monde." --lang fr --voice-style voice_styles/M1.json -o fr.wav
+# Optional: pick a compute unit explicitly.
+python infer.py "Test" --compute-units CPU_AND_NE -o ne.wav
+```
+The Python script loads `.mlpackage` (which is what `coremltools` accepts);
+the `.mlmodelc` bundles are for direct Swift / Objective‑C use
+(`MLModel(contentsOf:)`) where they skip the on‑device compile step.
+### Production (Swift / FluidAudio)
+For production use, the FluidAudio Swift framework handles model loading,
+text frontend, batching, chunking, and the diffusion / vocoder loop.
+#### Swift (FluidAudio)
 ```swift
 import AVFoundation

infer.py ADDED Viewed

	@@ -0,0 +1,213 @@

+"""Minimal self-contained Supertonic-3 CoreML inference script.
+Loads the four .mlpackage modules from this directory, tokenizes text via
+unicode_indexer.json, runs the 8-step flow-matching loop, and writes a 44.1 kHz
+WAV. No external dependencies beyond `coremltools`, `numpy`, and `soundfile`.
+Example
+-------
+    python infer.py "Hello, world." --voice-style voice_styles/M1.json -o hello.wav
+    python infer.py "Bonjour le monde." --lang fr --voice-style voice_styles/M1.json -o fr.wav
+For the full driver (text chunking, batch synthesis, multi-utterance input) see
+the mobius conversion repo: github.com/FluidInference/mobius
+"""
+from __future__ import annotations
+import argparse
+import json
+import re
+import time
+from pathlib import Path
+from typing import Tuple
+from unicodedata import normalize
+import coremltools as ct
+import numpy as np
# Languages supported by Supertonic-3 v1.7.3.
# Kept as a list (in this order) because it is interpolated into the
# "Unsupported lang" error message.
AVAILABLE_LANGS = (
    "en ko ja ar bg cs da de el es "
    "et fi fr hi hr hu id it lt lv "
    "nl pl pt ro ru sk sl sv tr uk "
    "vi na"
).split()

# CoreML shape pins (must match conversion settings; see mobius trials.md).
TEXT_T_FIXED = 128   # text_encoder / duration_predictor pinned T
VEC_EST_L_MIN = 17   # vector_estimator latent/text RangeDim lower bound

# Inclusive code-point ranges that are stripped from the input text
# (emoji, pictographs, dingbats, regional-indicator symbols).
_EMOJI_RANGES = (
    (0x1F600, 0x1F64F),
    (0x1F300, 0x1F5FF),
    (0x1F680, 0x1F6FF),
    (0x1F700, 0x1F77F),
    (0x1F780, 0x1F7FF),
    (0x1F800, 0x1F8FF),
    (0x1F900, 0x1F9FF),
    (0x1FA00, 0x1FA6F),
    (0x1FA70, 0x1FAFF),
    (0x2600, 0x26FF),
    (0x2700, 0x27BF),
    (0x1F1E6, 0x1F1FF),
)

# One character class covering every range above; matches runs of 1+ symbols.
_EMOJI_RE = re.compile(
    "[" + "".join(f"{chr(lo)}-{chr(hi)}" for lo, hi in _EMOJI_RANGES) + "]+",
    flags=re.UNICODE,
)

# Single-character substitutions applied to the text before tokenization.
_CHAR_REPL = {
    # Dash-like characters -> ASCII hyphen.
    "–": "-",
    "‑": "-",
    "—": "-",
    # Underscore -> space.
    "_": " ",
    # Curly quotes -> straight quotes.
    "\u201c": '"',
    "\u201d": '"',
    "\u2018": "'",
    "\u2019": "'",
    # Accent / backtick used as an apostrophe.
    "´": "'",
    "`": "'",
    # Brackets, separators and arrows -> space.
    "[": " ",
    "]": " ",
    "|": " ",
    "/": " ",
    "#": " ",
    "→": " ",
    "←": " ",
}
# Compiled once at import time: matches text that already ends in terminal
# punctuation or a closing quote / bracket, in which case no "." is appended.
_TERMINAL_PUNCT_RE = re.compile(r"[.!?;:,'\"')\]}…。」』】〉》›»]$")


def preprocess_text(text: str, lang: str) -> str:
    """Normalize ``text`` and wrap it in ``<lang>…</lang>`` tags.

    Steps, in order: NFKD normalization, removal of characters matched by
    ``_EMOJI_RE``, the single-character substitutions in ``_CHAR_REPL``,
    whitespace collapsing/stripping, and appending ``"."`` when the text
    does not already end in terminal punctuation.

    Example: ``preprocess_text("Hello", "en")`` returns ``"<en>Hello.</en>"``.

    Args:
        text: Raw input text.
        lang: Language tag; must be one of ``AVAILABLE_LANGS``.

    Returns:
        The cleaned text surrounded by opening and closing language tags.

    Raises:
        ValueError: If ``lang`` is not in ``AVAILABLE_LANGS``.
    """
    # Validate first so a bad language tag fails before any work is done.
    if lang not in AVAILABLE_LANGS:
        raise ValueError(f"Unsupported lang '{lang}'. Available: {AVAILABLE_LANGS}")

    text = normalize("NFKD", text)
    text = _EMOJI_RE.sub("", text)
    # Every key in _CHAR_REPL is a single character and no replacement value
    # is itself a key, so one translate() pass is equivalent to applying the
    # replacements one after another.
    text = text.translate(str.maketrans(_CHAR_REPL))
    text = re.sub(r"\s+", " ", text).strip()
    if not _TERMINAL_PUNCT_RE.search(text):
        text += "."
    return f"<{lang}>" + text + f"</{lang}>"
def tokenize(text: str, lang: str, indexer: list) -> Tuple[np.ndarray, np.ndarray]:
    """Convert text to (text_ids[1, T], text_mask[1, 1, T]) padded to TEXT_T_FIXED.

    The text is first run through ``preprocess_text``; each character of the
    result is then mapped to a token id via ``indexer[ord(char)]``.

    Args:
        text: Raw input text.
        lang: Language tag forwarded to ``preprocess_text``.
        indexer: Lookup table indexed by Unicode code point (loaded from
            ``unicode_indexer.json`` by the caller).

    Returns:
        ``(ids, mask)`` where ``ids`` is int32 of shape ``(1, TEXT_T_FIXED)``
        (zero-padded) and ``mask`` is float32 of shape
        ``(1, 1, TEXT_T_FIXED)`` with 1.0 on the positions that hold a token.

    Raises:
        ValueError: Propagated from ``preprocess_text`` for an unsupported
            ``lang``.
        IndexError: If ``indexer`` is a list and a code point lies beyond
            its end.
    """
    import warnings  # local import keeps the file's import block untouched

    s = preprocess_text(text, lang)
    codepoints = [ord(c) for c in s]
    if len(codepoints) > TEXT_T_FIXED:
        # Truncation cuts from the end, so the closing </lang> tag appended by
        # preprocess_text is lost. Make that visible instead of silent.
        warnings.warn(
            f"Input is {len(codepoints)} code points after preprocessing; "
            f"truncating to {TEXT_T_FIXED} (the closing language tag is dropped). "
            "Split long text into shorter chunks.",
            RuntimeWarning,
            stacklevel=2,
        )
        codepoints = codepoints[:TEXT_T_FIXED]

    n_tokens = len(codepoints)
    ids = np.zeros((1, TEXT_T_FIXED), dtype=np.int32)
    mask = np.zeros((1, 1, TEXT_T_FIXED), dtype=np.float32)
    ids[0, :n_tokens] = [indexer[cp] for cp in codepoints]
    mask[0, 0, :n_tokens] = 1.0
    return ids, mask
def load_voice_style(path: Path) -> Tuple[np.ndarray, np.ndarray]:
    """Load the two style embeddings from a voice-style JSON file.

    The file must contain ``"style_ttl"`` and ``"style_dp"`` entries, each
    with a ``"dims"`` list and a ``"data"`` payload. Each payload is
    converted to float32 and reshaped to ``(1, dims[1], dims[2])``.

    Args:
        path: Path to the voice-style JSON file.

    Returns:
        ``(style_ttl, style_dp)`` as float32 arrays.

    Raises:
        KeyError: If an expected key is missing from the JSON.
        ValueError: If the data cannot be reshaped to ``(1, dims[1], dims[2])``.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(path, encoding="utf-8") as f:
        cfg = json.load(f)

    def _embedding(key: str) -> np.ndarray:
        # Both entries share the same {"dims": [...], "data": [...]} layout.
        entry = cfg[key]
        dims = entry["dims"]
        return np.array(entry["data"], dtype=np.float32).reshape(1, dims[1], dims[2])

    return _embedding("style_ttl"), _embedding("style_dp")
def sample_noisy_latent(
    duration_sec: float, sample_rate: int, base_chunk_size: int,
    chunk_compress_factor: int, latent_dim: int, rng: np.random.Generator,
) -> Tuple[np.ndarray, np.ndarray]:
    """Draw the initial Gaussian latent for an utterance of ``duration_sec``.

    The number of latent frames is the waveform length
    (``int(duration_sec * sample_rate)``) divided by
    ``base_chunk_size * chunk_compress_factor``, rounded up.

    Returns:
        ``(latent, mask)``: a float32 array of shape
        ``(1, latent_dim * chunk_compress_factor, frames)`` sampled from
        ``rng.standard_normal``, and an all-ones float32 mask of shape
        ``(1, 1, frames)``.
    """
    n_samples = int(duration_sec * sample_rate)
    samples_per_frame = base_chunk_size * chunk_compress_factor
    # Ceiling division: a partially filled last frame still counts.
    n_frames = (n_samples + samples_per_frame - 1) // samples_per_frame
    n_channels = latent_dim * chunk_compress_factor

    latent = rng.standard_normal((1, n_channels, n_frames)).astype(np.float32)
    mask = np.ones((1, 1, n_frames), dtype=np.float32)
    return latent * mask, mask
def pad_last(arr: np.ndarray, target: int) -> np.ndarray:
    """Zero-pad the last axis of ``arr`` on the right up to length ``target``.

    If the last axis is already at least ``target`` long, ``arr`` itself is
    returned unchanged (it is never truncated).
    """
    shortfall = target - arr.shape[-1]
    if shortfall <= 0:
        return arr
    # No padding on the leading axes; `shortfall` zeros appended to the last.
    widths = [(0, 0) for _ in range(arr.ndim - 1)] + [(0, shortfall)]
    return np.pad(arr, widths, constant_values=0.0)
class Supertonic3TTS:
    """Minimal driver around the four Supertonic-3 CoreML modules.

    Loads ``DurationPredictor``, ``TextEncoder``, ``VectorEstimator`` and
    ``Vocoder`` from ``<model_dir>/<name>.mlpackage`` and reads the audio /
    latent configuration from ``<model_dir>/tts.json`` and the tokenizer
    table from ``<model_dir>/unicode_indexer.json``.
    """

    def __init__(self, model_dir: Path,
                 compute_units: ct.ComputeUnit = ct.ComputeUnit.CPU_AND_NE,
                 seed: int | None = None):
        """Load configuration, tokenizer table and all four models.

        Args:
            model_dir: Directory holding ``tts.json``,
                ``unicode_indexer.json`` and the ``.mlpackage`` bundles.
            compute_units: CoreML compute-unit selection passed to every model.
            seed: Optional seed for the noise generator. ``None`` (default)
                keeps the original behavior of non-reproducible noise.
        """
        model_dir = Path(model_dir)
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(model_dir / "tts.json", encoding="utf-8") as f:
            cfg = json.load(f)
        self.sample_rate = int(cfg["ae"]["sample_rate"])
        self.base_chunk_size = int(cfg["ae"]["base_chunk_size"])
        self.ccf = int(cfg["ttl"]["chunk_compress_factor"])
        self.ldim = int(cfg["ttl"]["latent_dim"])
        with open(model_dir / "unicode_indexer.json", encoding="utf-8") as f:
            self.indexer = json.load(f)

        def _load(name: str) -> ct.models.MLModel:
            # coremltools loads .mlpackage; .mlmodelc is for direct Swift/Obj-C use.
            return ct.models.MLModel(
                str(model_dir / f"{name}.mlpackage"),
                compute_units=compute_units,
            )

        print(f"Loading models from {model_dir} (compute_units={compute_units.name})")
        self.dp = _load("DurationPredictor")
        self.te = _load("TextEncoder")
        self.ve = _load("VectorEstimator")
        self.vc = _load("Vocoder")
        self.rng = np.random.default_rng(seed)

    def synthesize(self, text: str, voice_style_path: Path, lang: str = "en",
                   total_step: int = 8, speed: float = 1.05) -> Tuple[np.ndarray, float]:
        """Synthesize ``text`` and return ``(wav, duration_seconds)``.

        Args:
            text: Text to speak.
            voice_style_path: Voice-style JSON read by ``load_voice_style``.
            lang: Language tag forwarded to ``tokenize``.
            total_step: Number of vector-estimator iterations (>= 1).
            speed: Divisor applied to the predicted duration (> 0); larger
                values give shorter audio.

        Returns:
            ``wav``: the first batch row of the vocoder output as float32,
            trimmed to ``int(sample_rate * duration)`` samples;
            ``duration``: predicted duration in seconds after dividing by
            ``speed``.

        Raises:
            ValueError: If ``total_step < 1`` or ``speed`` is not positive,
                or (from ``tokenize``) if ``lang`` is unsupported.
        """
        # Reject values that would otherwise divide by zero or vocode pure noise.
        if total_step < 1:
            raise ValueError(f"total_step must be >= 1, got {total_step}")
        if not speed > 0:
            raise ValueError(f"speed must be > 0, got {speed}")

        ttl, dp_style = load_voice_style(voice_style_path)
        text_ids, text_mask = tokenize(text, lang, self.indexer)

        # 1. Duration (assumes the model emits a single value for batch size 1).
        dp_out = self.dp.predict({
            "text_ids": text_ids, "style_dp": dp_style, "text_mask": text_mask,
        })
        raw_duration = np.asarray(dp_out["duration"], dtype=np.float32).reshape(-1)[0]
        duration = float(raw_duration) / speed

        # 2. Text embedding.
        te_out = self.te.predict({
            "text_ids": text_ids, "style_ttl": ttl, "text_mask": text_mask,
        })
        text_emb = np.asarray(te_out["text_emb"], dtype=np.float32)

        # 3. Noisy latent, zero-padded up to the estimator's minimum length.
        noisy, latent_mask = sample_noisy_latent(
            duration, self.sample_rate, self.base_chunk_size, self.ccf, self.ldim, self.rng,
        )
        n_frames = noisy.shape[-1]          # true (unpadded) latent length
        padded_len = max(n_frames, VEC_EST_L_MIN)
        noisy = pad_last(noisy, padded_len)
        latent_mask = pad_last(latent_mask, padded_len)

        # 4. Iterative flow-matching refinement (`total_step` iterations).
        xt = noisy
        total_t = np.array([float(total_step)], dtype=np.float32)
        for step in range(total_step):
            cur_t = np.array([float(step)], dtype=np.float32)
            ve_out = self.ve.predict({
                "noisy_latent": xt, "text_emb": text_emb, "style_ttl": ttl,
                "latent_mask": latent_mask, "text_mask": text_mask,
                "current_step": cur_t, "total_step": total_t,
            })
            xt = np.asarray(ve_out["denoised_latent"], dtype=np.float32)

        # 5. Vocoder -> waveform at `self.sample_rate`.
        vc_out = self.vc.predict({"latent": xt})
        wav = np.asarray(vc_out["wav"], dtype=np.float32)
        samples_per_frame = self.base_chunk_size * self.ccf
        wav = wav[:, : samples_per_frame * n_frames]        # drop samples from padded frames
        wav = wav[0, : int(self.sample_rate * duration)]    # trim to predicted duration
        return wav, duration
def main() -> None:
    """Command-line entry point: synthesize one utterance and write a WAV file.

    All arguments are validated before the models are loaded, so an
    unsupported language, a non-positive speed / step count, or a missing
    voice-style file is reported immediately instead of after model loading.
    """
    ap = argparse.ArgumentParser(description="Supertonic-3 CoreML TTS — minimal demo")
    ap.add_argument("text", type=str, help="Text to synthesize")
    ap.add_argument("--voice-style", type=Path, default=Path("voice_styles/M1.json"))
    ap.add_argument("--lang", type=str, default="en", choices=AVAILABLE_LANGS,
                    metavar="LANG",
                    help="Language tag, one of: " + ", ".join(AVAILABLE_LANGS))
    ap.add_argument("--model-dir", type=Path, default=Path("."))
    ap.add_argument("-o", "--output", type=Path, default=Path("output.wav"))
    ap.add_argument("--total-step", type=int, default=8)
    ap.add_argument("--speed", type=float, default=1.05)
    ap.add_argument("--compute-units", type=str, default="CPU_AND_NE",
                    choices=["CPU_ONLY", "CPU_AND_GPU", "CPU_AND_NE", "ALL"])
    args = ap.parse_args()

    # Cheap sanity checks first; loading four CoreML models is the slow part.
    if args.total_step < 1:
        ap.error("--total-step must be >= 1")
    if not args.speed > 0:
        ap.error("--speed must be > 0")
    if not args.voice_style.is_file():
        ap.error(f"voice style file not found: {args.voice_style}")

    try:
        import soundfile as sf
    except ImportError as e:
        raise SystemExit("install soundfile: pip install soundfile") from e

    # Enum lookup by member name; `choices` above guarantees the name exists.
    tts = Supertonic3TTS(args.model_dir, ct.ComputeUnit[args.compute_units])

    # perf_counter is monotonic, unlike time.time().
    t0 = time.perf_counter()
    wav, dur = tts.synthesize(args.text, args.voice_style, args.lang, args.total_step, args.speed)
    elapsed = time.perf_counter() - t0

    sf.write(str(args.output), wav, tts.sample_rate)
    rtfx = dur / elapsed if elapsed > 0 else float("inf")
    print(f"wrote {args.output}  ({dur:.2f}s audio in {elapsed:.2f}s, RTFx {rtfx:.1f}x)")


if __name__ == "__main__":
    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+coremltools>=8.0
+numpy>=1.24
+soundfile>=0.12