Spaces:
Running on Zero
Running on Zero
update to 0.1.3 version
Browse files
- omnivoice/cli/demo.py +16 -7
- omnivoice/eval/mos/utmos.py +0 -0
- omnivoice/eval/speaker_similarity/sim.py +0 -0
- omnivoice/eval/wer/fleurs.py +0 -0
- omnivoice/eval/wer/hubert.py +0 -0
- omnivoice/eval/wer/minimax.py +0 -0
- omnivoice/eval/wer/seedtts.py +0 -0
- omnivoice/models/omnivoice.py +66 -25
- omnivoice/utils/audio.py +3 -1
omnivoice/cli/demo.py
CHANGED
|
@@ -136,6 +136,13 @@ def build_parser() -> argparse.ArgumentParser:
|
|
| 136 |
parser.add_argument(
|
| 137 |
"--share", action="store_true", default=False, help="Create public link."
|
| 138 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
return parser
|
| 140 |
|
| 141 |
|
|
@@ -198,9 +205,8 @@ def build_demo(
|
|
| 198 |
ref_text=ref_text,
|
| 199 |
)
|
| 200 |
|
| 201 |
-
if
|
| 202 |
-
|
| 203 |
-
kw["instruct"] = instruct.strip()
|
| 204 |
|
| 205 |
try:
|
| 206 |
audio = model.generate(**kw)
|
|
@@ -303,7 +309,7 @@ State-of-the-art text-to-speech model for **600+ languages**, supporting:
|
|
| 303 |
- **Voice Design** — Create custom voices with speaker attributes
|
| 304 |
|
| 305 |
Built with [OmniVoice](https://github.com/k2-fsa/OmniVoice)
|
| 306 |
-
by Xiaomi Next-gen Kaldi team.
|
| 307 |
"""
|
| 308 |
)
|
| 309 |
|
|
@@ -336,6 +342,8 @@ by Xiaomi Next-gen Kaldi team.
|
|
| 336 |
" to auto-transcribe via ASR models.",
|
| 337 |
)
|
| 338 |
vc_lang = _lang_dropdown("Language (optional) / 语种 (可选)")
|
|
|
|
|
|
|
| 339 |
(
|
| 340 |
vc_ns,
|
| 341 |
vc_gs,
|
|
@@ -354,13 +362,13 @@ by Xiaomi Next-gen Kaldi team.
|
|
| 354 |
vc_status = gr.Textbox(label="Status / 状态", lines=2)
|
| 355 |
|
| 356 |
def _clone_fn(
|
| 357 |
-
text, lang, ref_aud, ref_text, ns, gs, dn, sp, du, pp, po
|
| 358 |
):
|
| 359 |
return _gen(
|
| 360 |
text,
|
| 361 |
lang,
|
| 362 |
ref_aud,
|
| 363 |
-
|
| 364 |
ns,
|
| 365 |
gs,
|
| 366 |
dn,
|
|
@@ -379,6 +387,7 @@ by Xiaomi Next-gen Kaldi team.
|
|
| 379 |
vc_lang,
|
| 380 |
vc_ref_audio,
|
| 381 |
vc_ref_text,
|
|
|
|
| 382 |
vc_ns,
|
| 383 |
vc_gs,
|
| 384 |
vc_dn,
|
|
@@ -514,7 +523,7 @@ def main(argv=None) -> int:
|
|
| 514 |
checkpoint,
|
| 515 |
device_map=device,
|
| 516 |
dtype=torch.float16,
|
| 517 |
-
load_asr=
|
| 518 |
)
|
| 519 |
print("Model loaded.")
|
| 520 |
|
|
|
|
| 136 |
parser.add_argument(
|
| 137 |
"--share", action="store_true", default=False, help="Create public link."
|
| 138 |
)
|
| 139 |
+
parser.add_argument(
|
| 140 |
+
"--no-asr",
|
| 141 |
+
action="store_true",
|
| 142 |
+
default=False,
|
| 143 |
+
help="Skip loading Whisper ASR model. Reference text auto-transcription"
|
| 144 |
+
" will be unavailable.",
|
| 145 |
+
)
|
| 146 |
return parser
|
| 147 |
|
| 148 |
|
|
|
|
| 205 |
ref_text=ref_text,
|
| 206 |
)
|
| 207 |
|
| 208 |
+
if instruct and instruct.strip():
|
| 209 |
+
kw["instruct"] = instruct.strip()
|
|
|
|
| 210 |
|
| 211 |
try:
|
| 212 |
audio = model.generate(**kw)
|
|
|
|
| 309 |
- **Voice Design** — Create custom voices with speaker attributes
|
| 310 |
|
| 311 |
Built with [OmniVoice](https://github.com/k2-fsa/OmniVoice)
|
| 312 |
+
by Xiaomi AI Lab Next-gen Kaldi team.
|
| 313 |
"""
|
| 314 |
)
|
| 315 |
|
|
|
|
| 342 |
" to auto-transcribe via ASR models.",
|
| 343 |
)
|
| 344 |
vc_lang = _lang_dropdown("Language (optional) / 语种 (可选)")
|
| 345 |
+
with gr.Accordion("Instruct (optional)", open=False):
|
| 346 |
+
vc_instruct = gr.Textbox(label="Instruct", lines=2)
|
| 347 |
(
|
| 348 |
vc_ns,
|
| 349 |
vc_gs,
|
|
|
|
| 362 |
vc_status = gr.Textbox(label="Status / 状态", lines=2)
|
| 363 |
|
| 364 |
def _clone_fn(
|
| 365 |
+
text, lang, ref_aud, ref_text, instruct, ns, gs, dn, sp, du, pp, po
|
| 366 |
):
|
| 367 |
return _gen(
|
| 368 |
text,
|
| 369 |
lang,
|
| 370 |
ref_aud,
|
| 371 |
+
instruct,
|
| 372 |
ns,
|
| 373 |
gs,
|
| 374 |
dn,
|
|
|
|
| 387 |
vc_lang,
|
| 388 |
vc_ref_audio,
|
| 389 |
vc_ref_text,
|
| 390 |
+
vc_instruct,
|
| 391 |
vc_ns,
|
| 392 |
vc_gs,
|
| 393 |
vc_dn,
|
|
|
|
| 523 |
checkpoint,
|
| 524 |
device_map=device,
|
| 525 |
dtype=torch.float16,
|
| 526 |
+
load_asr=not args.no_asr,
|
| 527 |
)
|
| 528 |
print("Model loaded.")
|
| 529 |
|
omnivoice/eval/mos/utmos.py
CHANGED
|
File without changes
|
omnivoice/eval/speaker_similarity/sim.py
CHANGED
|
File without changes
|
omnivoice/eval/wer/fleurs.py
CHANGED
|
File without changes
|
omnivoice/eval/wer/hubert.py
CHANGED
|
File without changes
|
omnivoice/eval/wer/minimax.py
CHANGED
|
File without changes
|
omnivoice/eval/wer/seedtts.py
CHANGED
|
File without changes
|
omnivoice/models/omnivoice.py
CHANGED
|
@@ -630,15 +630,9 @@ class OmniVoice(PreTrainedModel):
|
|
| 630 |
# Skip trimming when ref_text is user-provided, otherwise the
|
| 631 |
# trimmed audio will no longer match the full transcript.
|
| 632 |
if ref_text is None:
|
| 633 |
-
ref_wav = trim_long_audio(
|
| 634 |
-
|
| 635 |
-
logger.warning(
|
| 636 |
-
"Reference audio is %.1fs long (>20s) and ref_text was "
|
| 637 |
-
"provided, so automatic trimming is skipped. A long reference "
|
| 638 |
-
"may cause slower generation and degraded quality.",
|
| 639 |
-
ref_wav.size(-1) / self.sampling_rate,
|
| 640 |
)
|
| 641 |
-
|
| 642 |
ref_wav = remove_silence(
|
| 643 |
ref_wav,
|
| 644 |
self.sampling_rate,
|
|
@@ -652,6 +646,15 @@ class OmniVoice(PreTrainedModel):
|
|
| 652 |
"Try setting preprocess_prompt=False."
|
| 653 |
)
|
| 654 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 655 |
# Auto-transcribe if ref_text not provided
|
| 656 |
if ref_text is None:
|
| 657 |
if self._asr_pipe is None:
|
|
@@ -1073,12 +1076,10 @@ class OmniVoice(PreTrainedModel):
|
|
| 1073 |
|
| 1074 |
# Build text tokens
|
| 1075 |
full_text = _combine_text(ref_text=ref_text, text=text)
|
|
|
|
| 1076 |
text_tokens = (
|
| 1077 |
-
self.text_tokenizer
|
| 1078 |
-
|
| 1079 |
-
return_tensors="pt",
|
| 1080 |
-
)
|
| 1081 |
-
.input_ids.repeat(self.config.num_audio_codebook, 1)
|
| 1082 |
.unsqueeze(0)
|
| 1083 |
).to(
|
| 1084 |
self.device
|
|
@@ -1490,6 +1491,53 @@ def _get_time_steps(
|
|
| 1490 |
return timesteps
|
| 1491 |
|
| 1492 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1493 |
def _combine_text(text, ref_text: Optional[str] = None) -> str:
|
| 1494 |
|
| 1495 |
# combine with reference text if not None
|
|
@@ -1498,24 +1546,17 @@ def _combine_text(text, ref_text: Optional[str] = None) -> str:
|
|
| 1498 |
else:
|
| 1499 |
full_text = text.strip()
|
| 1500 |
|
| 1501 |
-
#
|
| 1502 |
-
full_text = re.sub(r"[
|
|
|
|
|
|
|
|
|
|
| 1503 |
|
| 1504 |
# remove spaces around chinese characters
|
| 1505 |
chinese_range = r"[\u4e00-\u9fff]"
|
| 1506 |
pattern = rf"(?<={chinese_range})\s+|\s+(?={chinese_range})"
|
| 1507 |
full_text = re.sub(pattern, "", full_text)
|
| 1508 |
|
| 1509 |
-
# Remove whitespace immediately before special emotion tags (except
|
| 1510 |
-
# [laughter]). During training these tags have no preceding space, so
|
| 1511 |
-
# the text tokenizer would mis-tokenise them if spaces were present.
|
| 1512 |
-
_EMOTION_TAGS = (
|
| 1513 |
-
r"sigh|confirmation-en|question-en|question-ah|question-oh|"
|
| 1514 |
-
r"question-ei|question-yi|surprise-ah|surprise-oh|surprise-wa|"
|
| 1515 |
-
r"surprise-yo|dissatisfaction-hnn"
|
| 1516 |
-
)
|
| 1517 |
-
full_text = re.sub(rf"\s+(\[({_EMOTION_TAGS})\])", r"\1", full_text)
|
| 1518 |
-
|
| 1519 |
return full_text
|
| 1520 |
|
| 1521 |
|
|
|
|
| 630 |
# Skip trimming when ref_text is user-provided, otherwise the
|
| 631 |
# trimmed audio will no longer match the full transcript.
|
| 632 |
if ref_text is None:
|
| 633 |
+
ref_wav = trim_long_audio(
|
| 634 |
+
ref_wav, self.sampling_rate, trim_threshold=20.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 635 |
)
|
|
|
|
| 636 |
ref_wav = remove_silence(
|
| 637 |
ref_wav,
|
| 638 |
self.sampling_rate,
|
|
|
|
| 646 |
"Try setting preprocess_prompt=False."
|
| 647 |
)
|
| 648 |
|
| 649 |
+
ref_duration = ref_wav.size(-1) / self.sampling_rate
|
| 650 |
+
if ref_duration > 20.0:
|
| 651 |
+
logger.warning(
|
| 652 |
+
"Reference audio is %.1fs long (>20s). This may cause slower "
|
| 653 |
+
"generation, higher memory usage, and degraded voice cloning "
|
| 654 |
+
"quality. We recommend trimming it to 3-10s.",
|
| 655 |
+
ref_duration,
|
| 656 |
+
)
|
| 657 |
+
|
| 658 |
# Auto-transcribe if ref_text not provided
|
| 659 |
if ref_text is None:
|
| 660 |
if self._asr_pipe is None:
|
|
|
|
| 1076 |
|
| 1077 |
# Build text tokens
|
| 1078 |
full_text = _combine_text(ref_text=ref_text, text=text)
|
| 1079 |
+
wrapped_text = f"<|text_start|>{full_text}<|text_end|>"
|
| 1080 |
text_tokens = (
|
| 1081 |
+
_tokenize_with_nonverbal_tags(wrapped_text, self.text_tokenizer)
|
| 1082 |
+
.repeat(self.config.num_audio_codebook, 1)
|
|
|
|
|
|
|
|
|
|
| 1083 |
.unsqueeze(0)
|
| 1084 |
).to(
|
| 1085 |
self.device
|
|
|
|
| 1491 |
return timesteps
|
| 1492 |
|
| 1493 |
|
| 1494 |
+
_NONVERBAL_PATTERN = re.compile(
|
| 1495 |
+
r"\[(laughter|sigh|confirmation-en|question-en|question-ah|question-oh|"
|
| 1496 |
+
r"question-ei|question-yi|surprise-ah|surprise-oh|surprise-wa|"
|
| 1497 |
+
r"surprise-yo|dissatisfaction-hnn)\]"
|
| 1498 |
+
)
|
| 1499 |
+
|
| 1500 |
+
|
| 1501 |
+
def _tokenize_with_nonverbal_tags(text: str, tokenizer) -> torch.Tensor:
|
| 1502 |
+
"""Tokenize text containing non-verbal tags, handling each tag independently.
|
| 1503 |
+
|
| 1504 |
+
Non-verbal tags are tokenized standalone to guarantee consistent token
|
| 1505 |
+
IDs regardless of surrounding language context (Chinese, English, etc.).
|
| 1506 |
+
|
| 1507 |
+
Args:
|
| 1508 |
+
text: Full text string potentially containing non-verbal tags.
|
| 1509 |
+
tokenizer: HuggingFace text tokenizer instance.
|
| 1510 |
+
Returns:
|
| 1511 |
+
Token IDs tensor of shape (1, seq_len).
|
| 1512 |
+
"""
|
| 1513 |
+
parts = []
|
| 1514 |
+
last_end = 0
|
| 1515 |
+
for m in _NONVERBAL_PATTERN.finditer(text):
|
| 1516 |
+
if m.start() > last_end:
|
| 1517 |
+
segment = text[last_end : m.start()]
|
| 1518 |
+
ids = tokenizer(segment, add_special_tokens=False).input_ids
|
| 1519 |
+
if ids:
|
| 1520 |
+
parts.append(ids)
|
| 1521 |
+
tag_ids = tokenizer(m.group(), add_special_tokens=False).input_ids
|
| 1522 |
+
if tag_ids:
|
| 1523 |
+
parts.append(tag_ids)
|
| 1524 |
+
last_end = m.end()
|
| 1525 |
+
if last_end < len(text):
|
| 1526 |
+
segment = text[last_end:]
|
| 1527 |
+
ids = tokenizer(segment, add_special_tokens=False).input_ids
|
| 1528 |
+
if ids:
|
| 1529 |
+
parts.append(ids)
|
| 1530 |
+
|
| 1531 |
+
if not parts:
|
| 1532 |
+
result = tokenizer(text, return_tensors="pt").input_ids
|
| 1533 |
+
else:
|
| 1534 |
+
combined = []
|
| 1535 |
+
for p in parts:
|
| 1536 |
+
combined.extend(p)
|
| 1537 |
+
result = torch.tensor([combined], dtype=torch.long)
|
| 1538 |
+
return result
|
| 1539 |
+
|
| 1540 |
+
|
| 1541 |
def _combine_text(text, ref_text: Optional[str] = None) -> str:
|
| 1542 |
|
| 1543 |
# combine with reference text if not None
|
|
|
|
| 1546 |
else:
|
| 1547 |
full_text = text.strip()
|
| 1548 |
|
| 1549 |
+
# filter out newline / carriage-return characters
|
| 1550 |
+
full_text = re.sub(r"[\r\n]+", "", full_text)
|
| 1551 |
+
|
| 1552 |
+
# collapse consecutive spaces / tabs into a single space
|
| 1553 |
+
full_text = re.sub(r"[ \t]+", " ", full_text)
|
| 1554 |
|
| 1555 |
# remove spaces around chinese characters
|
| 1556 |
chinese_range = r"[\u4e00-\u9fff]"
|
| 1557 |
pattern = rf"(?<={chinese_range})\s+|\s+(?={chinese_range})"
|
| 1558 |
full_text = re.sub(pattern, "", full_text)
|
| 1559 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1560 |
return full_text
|
| 1561 |
|
| 1562 |
|
omnivoice/utils/audio.py
CHANGED
|
@@ -42,7 +42,9 @@ def load_audio(audio_path: str, sampling_rate: int):
|
|
| 42 |
PyTorch tensor of shape (1, T)
|
| 43 |
"""
|
| 44 |
try:
|
| 45 |
-
waveform, prompt_sampling_rate = torchaudio.load(audio_path)
|
|
|
|
|
|
|
| 46 |
except (RuntimeError, OSError):
|
| 47 |
# Fallback via pydub+ffmpeg for formats torchaudio can't handle
|
| 48 |
aseg = AudioSegment.from_file(audio_path)
|
|
|
|
| 42 |
PyTorch tensor of shape (1, T)
|
| 43 |
"""
|
| 44 |
try:
|
| 45 |
+
waveform, prompt_sampling_rate = torchaudio.load(
|
| 46 |
+
audio_path, backend="soundfile"
|
| 47 |
+
)
|
| 48 |
except (RuntimeError, OSError):
|
| 49 |
# Fallback via pydub+ffmpeg for formats torchaudio can't handle
|
| 50 |
aseg = AudioSegment.from_file(audio_path)
|