OmniVoice / app.py
ailuntz's picture
Add OmniVoice MLX Space
0cf2f37 verified
#!/usr/bin/env python3
"""Hugging Face Space entry point for the OmniVoice MLX demo."""
import logging
import os
from typing import Any, Dict
import numpy as np
from omnivoice.cli.demo import build_demo
from omnivoice.mlx import OmniVoiceMLX
from omnivoice.models.omnivoice import OmniVoiceGenerationConfig
# Configure root logging once for the whole process, and make sure the
# "omnivoice" logger emits at INFO as well.
logging.basicConfig(
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
    level=logging.INFO,
)
logging.getLogger("omnivoice").setLevel(logging.INFO)

# Default the HF_HUB_ENABLE_HF_TRANSFER flag to "1" while leaving any value
# already present in the environment untouched.
# NOTE(review): this runs after the omnivoice imports at the top of the file;
# if the hub client reads this flag at import time the default may have no
# effect -- confirm, and move it above the imports if so.
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")

# Deployment settings, each overridable through an environment variable.
CHECKPOINT = os.getenv("OMNIVOICE_MODEL", "mlx-community/OmniVoice-4bit")
DTYPE = os.getenv("OMNIVOICE_DTYPE", "float16")
AUDIO_TOKENIZER_DEVICE = os.getenv("OMNIVOICE_AUDIO_TOKENIZER_DEVICE", "cpu")

# The model is loaded at import time so it is ready before the UI is built;
# progress is printed with flush=True so it appears immediately in the logs.
print(f"Loading OmniVoice MLX model from {CHECKPOINT} ...", flush=True)
model = OmniVoiceMLX.from_pretrained(
    CHECKPOINT,
    audio_tokenizer_device=AUDIO_TOKENIZER_DEVICE,
    dtype=DTYPE,
)
# Cached at module level because _gen_core returns it with every waveform.
sampling_rate = model.sampling_rate
print("OmniVoice MLX model loaded.", flush=True)
def _gen_core(
text,
language,
ref_audio,
instruct,
num_step,
guidance_scale,
denoise,
speed,
duration,
preprocess_prompt,
postprocess_output,
mode,
ref_text=None,
):
if not text or not text.strip():
return None, "Please enter the text to synthesize."
gen_config = OmniVoiceGenerationConfig(
num_step=int(num_step or 32),
guidance_scale=float(guidance_scale) if guidance_scale is not None else 2.0,
denoise=bool(denoise) if denoise is not None else True,
preprocess_prompt=bool(preprocess_prompt),
postprocess_output=bool(postprocess_output),
)
lang = language if (language and language != "Auto") else None
kw: Dict[str, Any] = dict(
text=text.strip(),
language=lang,
generation_config=gen_config,
)
if speed is not None and float(speed) != 1.0:
kw["speed"] = float(speed)
if duration is not None and float(duration) > 0:
kw["duration"] = float(duration)
if mode == "clone":
if not ref_audio:
return None, "Please upload a reference audio."
kw["ref_audio"] = ref_audio
if ref_text and ref_text.strip():
kw["ref_text"] = ref_text.strip()
if instruct and instruct.strip():
kw["instruct"] = instruct.strip()
try:
audio = model.generate(**kw)
except Exception as exc:
return None, f"Error: {type(exc).__name__}: {exc}"
waveform = np.clip(audio[0], -1.0, 1.0)
waveform = (waveform * 32767).astype(np.int16)
return (sampling_rate, waveform), "Done."
# Construct the demo app via build_demo, handing it the loaded model, the
# checkpoint id, and _gen_core as its generation callback.
demo = build_demo(model, CHECKPOINT, generate_fn=_gen_core)

if __name__ == "__main__":
    # queue(max_size=8) presumably bounds the number of pending requests --
    # confirm against the UI framework. launch() is called on whatever
    # queue() returns, exactly as in a chained call.
    queued_demo = demo.queue(max_size=8)
    queued_demo.launch()