Spaces:
Running on Zero
Running on Zero
update to 0.1.3 version
Browse files
- omnivoice/cli/demo.py +16 -7
- omnivoice/eval/mos/utmos.py +0 -0
- omnivoice/eval/speaker_similarity/sim.py +0 -0
- omnivoice/eval/wer/fleurs.py +0 -0
- omnivoice/eval/wer/hubert.py +0 -0
- omnivoice/eval/wer/minimax.py +0 -0
- omnivoice/eval/wer/seedtts.py +0 -0
- omnivoice/models/omnivoice.py +66 -25
- omnivoice/utils/audio.py +3 -1
omnivoice/cli/demo.py
CHANGED
|
@@ -136,6 +136,13 @@ def build_parser() -> argparse.ArgumentParser:
|
|
| 136 |
parser.add_argument(
|
| 137 |
"--share", action="store_true", default=False, help="Create public link."
|
| 138 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
return parser
|
| 140 |
|
| 141 |
|
|
@@ -198,9 +205,8 @@ def build_demo(
|
|
| 198 |
ref_text=ref_text,
|
| 199 |
)
|
| 200 |
|
| 201 |
-
if
|
| 202 |
-
|
| 203 |
-
kw["instruct"] = instruct.strip()
|
| 204 |
|
| 205 |
try:
|
| 206 |
audio = model.generate(**kw)
|
|
@@ -303,7 +309,7 @@ State-of-the-art text-to-speech model for **600+ languages**, supporting:
|
|
| 303 |
- **Voice Design** — Create custom voices with speaker attributes
|
| 304 |
|
| 305 |
Built with [OmniVoice](https://github.com/k2-fsa/OmniVoice)
|
| 306 |
-
by Xiaomi Next-gen Kaldi team.
|
| 307 |
"""
|
| 308 |
)
|
| 309 |
|
|
@@ -336,6 +342,8 @@ by Xiaomi Next-gen Kaldi team.
|
|
| 336 |
" to auto-transcribe via ASR models.",
|
| 337 |
)
|
| 338 |
vc_lang = _lang_dropdown("Language (optional) / 语种 (可选)")
|
|
|
|
|
|
|
| 339 |
(
|
| 340 |
vc_ns,
|
| 341 |
vc_gs,
|
|
@@ -354,13 +362,13 @@ by Xiaomi Next-gen Kaldi team.
|
|
| 354 |
vc_status = gr.Textbox(label="Status / 状态", lines=2)
|
| 355 |
|
| 356 |
def _clone_fn(
|
| 357 |
-
text, lang, ref_aud, ref_text, ns, gs, dn, sp, du, pp, po
|
| 358 |
):
|
| 359 |
return _gen(
|
| 360 |
text,
|
| 361 |
lang,
|
| 362 |
ref_aud,
|
| 363 |
-
|
| 364 |
ns,
|
| 365 |
gs,
|
| 366 |
dn,
|
|
@@ -379,6 +387,7 @@ by Xiaomi Next-gen Kaldi team.
|
|
| 379 |
vc_lang,
|
| 380 |
vc_ref_audio,
|
| 381 |
vc_ref_text,
|
|
|
|
| 382 |
vc_ns,
|
| 383 |
vc_gs,
|
| 384 |
vc_dn,
|
|
@@ -514,7 +523,7 @@ def main(argv=None) -> int:
|
|
| 514 |
checkpoint,
|
| 515 |
device_map=device,
|
| 516 |
dtype=torch.float16,
|
| 517 |
-
load_asr=
|
| 518 |
)
|
| 519 |
print("Model loaded.")
|
| 520 |
|
|
|
|
| 136 |
parser.add_argument(
|
| 137 |
"--share", action="store_true", default=False, help="Create public link."
|
| 138 |
)
|
| 139 |
+
parser.add_argument(
|
| 140 |
+
"--no-asr",
|
| 141 |
+
action="store_true",
|
| 142 |
+
default=False,
|
| 143 |
+
help="Skip loading Whisper ASR model. Reference text auto-transcription"
|
| 144 |
+
" will be unavailable.",
|
| 145 |
+
)
|
| 146 |
return parser
|
| 147 |
|
| 148 |
|
|
|
|
| 205 |
ref_text=ref_text,
|
| 206 |
)
|
| 207 |
|
| 208 |
+
if instruct and instruct.strip():
|
| 209 |
+
kw["instruct"] = instruct.strip()
|
|
|
|
| 210 |
|
| 211 |
try:
|
| 212 |
audio = model.generate(**kw)
|
|
|
|
| 309 |
- **Voice Design** — Create custom voices with speaker attributes
|
| 310 |
|
| 311 |
Built with [OmniVoice](https://github.com/k2-fsa/OmniVoice)
|
| 312 |
+
by Xiaomi AI Lab Next-gen Kaldi team.
|
| 313 |
"""
|
| 314 |
)
|
| 315 |
|
|
|
|
| 342 |
" to auto-transcribe via ASR models.",
|
| 343 |
)
|
| 344 |
vc_lang = _lang_dropdown("Language (optional) / 语种 (可选)")
|
| 345 |
+
with gr.Accordion("Instruct (optional)", open=False):
|
| 346 |
+
vc_instruct = gr.Textbox(label="Instruct", lines=2)
|
| 347 |
(
|
| 348 |
vc_ns,
|
| 349 |
vc_gs,
|
|
|
|
| 362 |
vc_status = gr.Textbox(label="Status / 状态", lines=2)
|
| 363 |
|
| 364 |
def _clone_fn(
|
| 365 |
+
text, lang, ref_aud, ref_text, instruct, ns, gs, dn, sp, du, pp, po
|
| 366 |
):
|
| 367 |
return _gen(
|
| 368 |
text,
|
| 369 |
lang,
|
| 370 |
ref_aud,
|
| 371 |
+
instruct,
|
| 372 |
ns,
|
| 373 |
gs,
|
| 374 |
dn,
|
|
|
|
| 387 |
vc_lang,
|
| 388 |
vc_ref_audio,
|
| 389 |
vc_ref_text,
|
| 390 |
+
vc_instruct,
|
| 391 |
vc_ns,
|
| 392 |
vc_gs,
|
| 393 |
vc_dn,
|
|
|
|
| 523 |
checkpoint,
|
| 524 |
device_map=device,
|
| 525 |
dtype=torch.float16,
|
| 526 |
+
load_asr=not args.no_asr,
|
| 527 |
)
|
| 528 |
print("Model loaded.")
|
| 529 |
|
omnivoice/eval/mos/utmos.py
CHANGED
|
File without changes
|
omnivoice/eval/speaker_similarity/sim.py
CHANGED
|
File without changes
|
omnivoice/eval/wer/fleurs.py
CHANGED
|
File without changes
|
omnivoice/eval/wer/hubert.py
CHANGED
|
File without changes
|
omnivoice/eval/wer/minimax.py
CHANGED
|
File without changes
|
omnivoice/eval/wer/seedtts.py
CHANGED
|
File without changes
|
omnivoice/models/omnivoice.py
CHANGED
|
@@ -630,15 +630,9 @@ class OmniVoice(PreTrainedModel):
|
|
| 630 |
# Skip trimming when ref_text is user-provided, otherwise the
|
| 631 |
# trimmed audio will no longer match the full transcript.
|
| 632 |
if ref_text is None:
|
| 633 |
-
ref_wav = trim_long_audio(
|
| 634 |
-
|
| 635 |
-
logger.warning(
|
| 636 |
-
"Reference audio is %.1fs long (>20s) and ref_text was "
|
| 637 |
-
"provided, so automatic trimming is skipped. A long reference "
|
| 638 |
-
"may cause slower generation and degraded quality.",
|
| 639 |
-
ref_wav.size(-1) / self.sampling_rate,
|
| 640 |
)
|
| 641 |
-
|
| 642 |
ref_wav = remove_silence(
|
| 643 |
ref_wav,
|
| 644 |
self.sampling_rate,
|
|
@@ -652,6 +646,15 @@ class OmniVoice(PreTrainedModel):
|
|
| 652 |
"Try setting preprocess_prompt=False."
|
| 653 |
)
|
| 654 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 655 |
# Auto-transcribe if ref_text not provided
|
| 656 |
if ref_text is None:
|
| 657 |
if self._asr_pipe is None:
|
|
@@ -1073,12 +1076,10 @@ class OmniVoice(PreTrainedModel):
|
|
| 1073 |
|
| 1074 |
# Build text tokens
|
| 1075 |
full_text = _combine_text(ref_text=ref_text, text=text)
|
|
|
|
| 1076 |
text_tokens = (
|
| 1077 |
-
self.text_tokenizer
|
| 1078 |
-
|
| 1079 |
-
return_tensors="pt",
|
| 1080 |
-
)
|
| 1081 |
-
.input_ids.repeat(self.config.num_audio_codebook, 1)
|
| 1082 |
.unsqueeze(0)
|
| 1083 |
).to(
|
| 1084 |
self.device
|
|
@@ -1490,6 +1491,53 @@ def _get_time_steps(
|
|
| 1490 |
return timesteps
|
| 1491 |
|
| 1492 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1493 |
def _combine_text(text, ref_text: Optional[str] = None) -> str:
|
| 1494 |
|
| 1495 |
# combine with reference text if not None
|
|
@@ -1498,24 +1546,17 @@ def _combine_text(text, ref_text: Optional[str] = None) -> str:
|
|
| 1498 |
else:
|
| 1499 |
full_text = text.strip()
|
| 1500 |
|
| 1501 |
-
#
|
| 1502 |
-
full_text = re.sub(r"[
|
|
|
|
|
|
|
|
|
|
| 1503 |
|
| 1504 |
# remove spaces around chinese characters
|
| 1505 |
chinese_range = r"[\u4e00-\u9fff]"
|
| 1506 |
pattern = rf"(?<={chinese_range})\s+|\s+(?={chinese_range})"
|
| 1507 |
full_text = re.sub(pattern, "", full_text)
|
| 1508 |
|
| 1509 |
-
# Remove whitespace immediately before special emotion tags (except
|
| 1510 |
-
# [laughter]). During training these tags have no preceding space, so
|
| 1511 |
-
# the text tokenizer would mis-tokenise them if spaces were present.
|
| 1512 |
-
_EMOTION_TAGS = (
|
| 1513 |
-
r"sigh|confirmation-en|question-en|question-ah|question-oh|"
|
| 1514 |
-
r"question-ei|question-yi|surprise-ah|surprise-oh|surprise-wa|"
|
| 1515 |
-
r"surprise-yo|dissatisfaction-hnn"
|
| 1516 |
-
)
|
| 1517 |
-
full_text = re.sub(rf"\s+(\[({_EMOTION_TAGS})\])", r"\1", full_text)
|
| 1518 |
-
|
| 1519 |
return full_text
|
| 1520 |
|
| 1521 |
|
|
|
|
| 630 |
# Skip trimming when ref_text is user-provided, otherwise the
|
| 631 |
# trimmed audio will no longer match the full transcript.
|
| 632 |
if ref_text is None:
|
| 633 |
+
ref_wav = trim_long_audio(
|
| 634 |
+
ref_wav, self.sampling_rate, trim_threshold=20.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 635 |
)
|
|
|
|
| 636 |
ref_wav = remove_silence(
|
| 637 |
ref_wav,
|
| 638 |
self.sampling_rate,
|
|
|
|
| 646 |
"Try setting preprocess_prompt=False."
|
| 647 |
)
|
| 648 |
|
| 649 |
+
ref_duration = ref_wav.size(-1) / self.sampling_rate
|
| 650 |
+
if ref_duration > 20.0:
|
| 651 |
+
logger.warning(
|
| 652 |
+
"Reference audio is %.1fs long (>20s). This may cause slower "
|
| 653 |
+
"generation, higher memory usage, and degraded voice cloning "
|
| 654 |
+
"quality. We recommend trimming it to 3-10s.",
|
| 655 |
+
ref_duration,
|
| 656 |
+
)
|
| 657 |
+
|
| 658 |
# Auto-transcribe if ref_text not provided
|
| 659 |
if ref_text is None:
|
| 660 |
if self._asr_pipe is None:
|
|
|
|
| 1076 |
|
| 1077 |
# Build text tokens
|
| 1078 |
full_text = _combine_text(ref_text=ref_text, text=text)
|
| 1079 |
+
wrapped_text = f"<|text_start|>{full_text}<|text_end|>"
|
| 1080 |
text_tokens = (
|
| 1081 |
+
_tokenize_with_nonverbal_tags(wrapped_text, self.text_tokenizer)
|
| 1082 |
+
.repeat(self.config.num_audio_codebook, 1)
|
|
|
|
|
|
|
|
|
|
| 1083 |
.unsqueeze(0)
|
| 1084 |
).to(
|
| 1085 |
self.device
|
|
|
|
| 1491 |
return timesteps
|
| 1492 |
|
| 1493 |
|
| 1494 |
+
_NONVERBAL_PATTERN = re.compile(
|
| 1495 |
+
r"\[(laughter|sigh|confirmation-en|question-en|question-ah|question-oh|"
|
| 1496 |
+
r"question-ei|question-yi|surprise-ah|surprise-oh|surprise-wa|"
|
| 1497 |
+
r"surprise-yo|dissatisfaction-hnn)\]"
|
| 1498 |
+
)
|
| 1499 |
+
|
| 1500 |
+
|
| 1501 |
+
def _tokenize_with_nonverbal_tags(text: str, tokenizer) -> torch.Tensor:
|
| 1502 |
+
"""Tokenize text containing non-verbal tags, handling each tag independently.
|
| 1503 |
+
|
| 1504 |
+
Non-verbal tags are tokenized standalone to guarantee consistent token
|
| 1505 |
+
IDs regardless of surrounding language context (Chinese, English, etc.).
|
| 1506 |
+
|
| 1507 |
+
Args:
|
| 1508 |
+
text: Full text string potentially containing non-verbal tags.
|
| 1509 |
+
tokenizer: HuggingFace text tokenizer instance.
|
| 1510 |
+
Returns:
|
| 1511 |
+
Token IDs tensor of shape (1, seq_len).
|
| 1512 |
+
"""
|
| 1513 |
+
parts = []
|
| 1514 |
+
last_end = 0
|
| 1515 |
+
for m in _NONVERBAL_PATTERN.finditer(text):
|
| 1516 |
+
if m.start() > last_end:
|
| 1517 |
+
segment = text[last_end : m.start()]
|
| 1518 |
+
ids = tokenizer(segment, add_special_tokens=False).input_ids
|
| 1519 |
+
if ids:
|
| 1520 |
+
parts.append(ids)
|
| 1521 |
+
tag_ids = tokenizer(m.group(), add_special_tokens=False).input_ids
|
| 1522 |
+
if tag_ids:
|
| 1523 |
+
parts.append(tag_ids)
|
| 1524 |
+
last_end = m.end()
|
| 1525 |
+
if last_end < len(text):
|
| 1526 |
+
segment = text[last_end:]
|
| 1527 |
+
ids = tokenizer(segment, add_special_tokens=False).input_ids
|
| 1528 |
+
if ids:
|
| 1529 |
+
parts.append(ids)
|
| 1530 |
+
|
| 1531 |
+
if not parts:
|
| 1532 |
+
result = tokenizer(text, return_tensors="pt").input_ids
|
| 1533 |
+
else:
|
| 1534 |
+
combined = []
|
| 1535 |
+
for p in parts:
|
| 1536 |
+
combined.extend(p)
|
| 1537 |
+
result = torch.tensor([combined], dtype=torch.long)
|
| 1538 |
+
return result
|
| 1539 |
+
|
| 1540 |
+
|
| 1541 |
def _combine_text(text, ref_text: Optional[str] = None) -> str:
|
| 1542 |
|
| 1543 |
# combine with reference text if not None
|
|
|
|
| 1546 |
else:
|
| 1547 |
full_text = text.strip()
|
| 1548 |
|
| 1549 |
+
# filter out newline / carriage-return characters
|
| 1550 |
+
full_text = re.sub(r"[\r\n]+", "", full_text)
|
| 1551 |
+
|
| 1552 |
+
# collapse consecutive spaces / tabs into a single space
|
| 1553 |
+
full_text = re.sub(r"[ \t]+", " ", full_text)
|
| 1554 |
|
| 1555 |
# remove spaces around chinese characters
|
| 1556 |
chinese_range = r"[\u4e00-\u9fff]"
|
| 1557 |
pattern = rf"(?<={chinese_range})\s+|\s+(?={chinese_range})"
|
| 1558 |
full_text = re.sub(pattern, "", full_text)
|
| 1559 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1560 |
return full_text
|
| 1561 |
|
| 1562 |
|
omnivoice/utils/audio.py
CHANGED
|
@@ -42,7 +42,9 @@ def load_audio(audio_path: str, sampling_rate: int):
|
|
| 42 |
PyTorch tensor of shape (1, T)
|
| 43 |
"""
|
| 44 |
try:
|
| 45 |
-
waveform, prompt_sampling_rate = torchaudio.load(audio_path)
|
|
|
|
|
|
|
| 46 |
except (RuntimeError, OSError):
|
| 47 |
# Fallback via pydub+ffmpeg for formats torchaudio can't handle
|
| 48 |
aseg = AudioSegment.from_file(audio_path)
|
|
|
|
| 42 |
PyTorch tensor of shape (1, T)
|
| 43 |
"""
|
| 44 |
try:
|
| 45 |
+
waveform, prompt_sampling_rate = torchaudio.load(
|
| 46 |
+
audio_path, backend="soundfile"
|
| 47 |
+
)
|
| 48 |
except (RuntimeError, OSError):
|
| 49 |
# Fallback via pydub+ffmpeg for formats torchaudio can't handle
|
| 50 |
aseg = AudioSegment.from_file(audio_path)
|