zhu-han committed on
Commit
32ffa33
·
1 Parent(s): 7be7394

Update to version 0.1.3

Browse files
omnivoice/cli/demo.py CHANGED
@@ -136,6 +136,13 @@ def build_parser() -> argparse.ArgumentParser:
136
  parser.add_argument(
137
  "--share", action="store_true", default=False, help="Create public link."
138
  )
 
 
 
 
 
 
 
139
  return parser
140
 
141
 
@@ -198,9 +205,8 @@ def build_demo(
198
  ref_text=ref_text,
199
  )
200
 
201
- if mode == "design":
202
- if instruct and instruct.strip():
203
- kw["instruct"] = instruct.strip()
204
 
205
  try:
206
  audio = model.generate(**kw)
@@ -303,7 +309,7 @@ State-of-the-art text-to-speech model for **600+ languages**, supporting:
303
  - **Voice Design** — Create custom voices with speaker attributes
304
 
305
  Built with [OmniVoice](https://github.com/k2-fsa/OmniVoice)
306
- by Xiaomi Next-gen Kaldi team.
307
  """
308
  )
309
 
@@ -336,6 +342,8 @@ by Xiaomi Next-gen Kaldi team.
336
  " to auto-transcribe via ASR models.",
337
  )
338
  vc_lang = _lang_dropdown("Language (optional) / 语种 (可选)")
 
 
339
  (
340
  vc_ns,
341
  vc_gs,
@@ -354,13 +362,13 @@ by Xiaomi Next-gen Kaldi team.
354
  vc_status = gr.Textbox(label="Status / 状态", lines=2)
355
 
356
  def _clone_fn(
357
- text, lang, ref_aud, ref_text, ns, gs, dn, sp, du, pp, po
358
  ):
359
  return _gen(
360
  text,
361
  lang,
362
  ref_aud,
363
- None,
364
  ns,
365
  gs,
366
  dn,
@@ -379,6 +387,7 @@ by Xiaomi Next-gen Kaldi team.
379
  vc_lang,
380
  vc_ref_audio,
381
  vc_ref_text,
 
382
  vc_ns,
383
  vc_gs,
384
  vc_dn,
@@ -514,7 +523,7 @@ def main(argv=None) -> int:
514
  checkpoint,
515
  device_map=device,
516
  dtype=torch.float16,
517
- load_asr=True,
518
  )
519
  print("Model loaded.")
520
 
 
136
  parser.add_argument(
137
  "--share", action="store_true", default=False, help="Create public link."
138
  )
139
+ parser.add_argument(
140
+ "--no-asr",
141
+ action="store_true",
142
+ default=False,
143
+ help="Skip loading Whisper ASR model. Reference text auto-transcription"
144
+ " will be unavailable.",
145
+ )
146
  return parser
147
 
148
 
 
205
  ref_text=ref_text,
206
  )
207
 
208
+ if instruct and instruct.strip():
209
+ kw["instruct"] = instruct.strip()
 
210
 
211
  try:
212
  audio = model.generate(**kw)
 
309
  - **Voice Design** — Create custom voices with speaker attributes
310
 
311
  Built with [OmniVoice](https://github.com/k2-fsa/OmniVoice)
312
+ by Xiaomi AI Lab Next-gen Kaldi team.
313
  """
314
  )
315
 
 
342
  " to auto-transcribe via ASR models.",
343
  )
344
  vc_lang = _lang_dropdown("Language (optional) / 语种 (可选)")
345
+ with gr.Accordion("Instruct (optional)", open=False):
346
+ vc_instruct = gr.Textbox(label="Instruct", lines=2)
347
  (
348
  vc_ns,
349
  vc_gs,
 
362
  vc_status = gr.Textbox(label="Status / 状态", lines=2)
363
 
364
  def _clone_fn(
365
+ text, lang, ref_aud, ref_text, instruct, ns, gs, dn, sp, du, pp, po
366
  ):
367
  return _gen(
368
  text,
369
  lang,
370
  ref_aud,
371
+ instruct,
372
  ns,
373
  gs,
374
  dn,
 
387
  vc_lang,
388
  vc_ref_audio,
389
  vc_ref_text,
390
+ vc_instruct,
391
  vc_ns,
392
  vc_gs,
393
  vc_dn,
 
523
  checkpoint,
524
  device_map=device,
525
  dtype=torch.float16,
526
+ load_asr=not args.no_asr,
527
  )
528
  print("Model loaded.")
529
 
omnivoice/eval/mos/utmos.py CHANGED
File without changes
omnivoice/eval/speaker_similarity/sim.py CHANGED
File without changes
omnivoice/eval/wer/fleurs.py CHANGED
File without changes
omnivoice/eval/wer/hubert.py CHANGED
File without changes
omnivoice/eval/wer/minimax.py CHANGED
File without changes
omnivoice/eval/wer/seedtts.py CHANGED
File without changes
omnivoice/models/omnivoice.py CHANGED
@@ -630,15 +630,9 @@ class OmniVoice(PreTrainedModel):
630
  # Skip trimming when ref_text is user-provided, otherwise the
631
  # trimmed audio will no longer match the full transcript.
632
  if ref_text is None:
633
- ref_wav = trim_long_audio(ref_wav, self.sampling_rate)
634
- elif ref_wav.size(-1) / self.sampling_rate > 20.0:
635
- logger.warning(
636
- "Reference audio is %.1fs long (>20s) and ref_text was "
637
- "provided, so automatic trimming is skipped. A long reference "
638
- "may cause slower generation and degraded quality.",
639
- ref_wav.size(-1) / self.sampling_rate,
640
  )
641
-
642
  ref_wav = remove_silence(
643
  ref_wav,
644
  self.sampling_rate,
@@ -652,6 +646,15 @@ class OmniVoice(PreTrainedModel):
652
  "Try setting preprocess_prompt=False."
653
  )
654
 
 
 
 
 
 
 
 
 
 
655
  # Auto-transcribe if ref_text not provided
656
  if ref_text is None:
657
  if self._asr_pipe is None:
@@ -1073,12 +1076,10 @@ class OmniVoice(PreTrainedModel):
1073
 
1074
  # Build text tokens
1075
  full_text = _combine_text(ref_text=ref_text, text=text)
 
1076
  text_tokens = (
1077
- self.text_tokenizer(
1078
- f"<|text_start|>{full_text}<|text_end|>",
1079
- return_tensors="pt",
1080
- )
1081
- .input_ids.repeat(self.config.num_audio_codebook, 1)
1082
  .unsqueeze(0)
1083
  ).to(
1084
  self.device
@@ -1490,6 +1491,53 @@ def _get_time_steps(
1490
  return timesteps
1491
 
1492
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1493
  def _combine_text(text, ref_text: Optional[str] = None) -> str:
1494
 
1495
  # combine with reference text if not None
@@ -1498,24 +1546,17 @@ def _combine_text(text, ref_text: Optional[str] = None) -> str:
1498
  else:
1499
  full_text = text.strip()
1500
 
1501
- # replace \n with .
1502
- full_text = re.sub(r"[ \t]*\r?\n[\s]*", ".", full_text)
 
 
 
1503
 
1504
  # remove spaces around chinese characters
1505
  chinese_range = r"[\u4e00-\u9fff]"
1506
  pattern = rf"(?<={chinese_range})\s+|\s+(?={chinese_range})"
1507
  full_text = re.sub(pattern, "", full_text)
1508
 
1509
- # Remove whitespace immediately before special emotion tags (except
1510
- # [laughter]). During training these tags have no preceding space, so
1511
- # the text tokenizer would mis-tokenise them if spaces were present.
1512
- _EMOTION_TAGS = (
1513
- r"sigh|confirmation-en|question-en|question-ah|question-oh|"
1514
- r"question-ei|question-yi|surprise-ah|surprise-oh|surprise-wa|"
1515
- r"surprise-yo|dissatisfaction-hnn"
1516
- )
1517
- full_text = re.sub(rf"\s+(\[({_EMOTION_TAGS})\])", r"\1", full_text)
1518
-
1519
  return full_text
1520
 
1521
 
 
630
  # Skip trimming when ref_text is user-provided, otherwise the
631
  # trimmed audio will no longer match the full transcript.
632
  if ref_text is None:
633
+ ref_wav = trim_long_audio(
634
+ ref_wav, self.sampling_rate, trim_threshold=20.0
 
 
 
 
 
635
  )
 
636
  ref_wav = remove_silence(
637
  ref_wav,
638
  self.sampling_rate,
 
646
  "Try setting preprocess_prompt=False."
647
  )
648
 
649
+ ref_duration = ref_wav.size(-1) / self.sampling_rate
650
+ if ref_duration > 20.0:
651
+ logger.warning(
652
+ "Reference audio is %.1fs long (>20s). This may cause slower "
653
+ "generation, higher memory usage, and degraded voice cloning "
654
+ "quality. We recommend trimming it to 3-10s.",
655
+ ref_duration,
656
+ )
657
+
658
  # Auto-transcribe if ref_text not provided
659
  if ref_text is None:
660
  if self._asr_pipe is None:
 
1076
 
1077
  # Build text tokens
1078
  full_text = _combine_text(ref_text=ref_text, text=text)
1079
+ wrapped_text = f"<|text_start|>{full_text}<|text_end|>"
1080
  text_tokens = (
1081
+ _tokenize_with_nonverbal_tags(wrapped_text, self.text_tokenizer)
1082
+ .repeat(self.config.num_audio_codebook, 1)
 
 
 
1083
  .unsqueeze(0)
1084
  ).to(
1085
  self.device
 
1491
  return timesteps
1492
 
1493
 
1494
+ _NONVERBAL_PATTERN = re.compile(
1495
+ r"\[(laughter|sigh|confirmation-en|question-en|question-ah|question-oh|"
1496
+ r"question-ei|question-yi|surprise-ah|surprise-oh|surprise-wa|"
1497
+ r"surprise-yo|dissatisfaction-hnn)\]"
1498
+ )
1499
+
1500
+
1501
+ def _tokenize_with_nonverbal_tags(text: str, tokenizer) -> torch.Tensor:
1502
+ """Tokenize text containing non-verbal tags, handling each tag independently.
1503
+
1504
+ Non-verbal tags are tokenized standalone to guarantee consistent token
1505
+ IDs regardless of surrounding language context (Chinese, English, etc.).
1506
+
1507
+ Args:
1508
+ text: Full text string potentially containing non-verbal tags.
1509
+ tokenizer: HuggingFace text tokenizer instance.
1510
+ Returns:
1511
+ Token IDs tensor of shape (1, seq_len).
1512
+ """
1513
+ parts = []
1514
+ last_end = 0
1515
+ for m in _NONVERBAL_PATTERN.finditer(text):
1516
+ if m.start() > last_end:
1517
+ segment = text[last_end : m.start()]
1518
+ ids = tokenizer(segment, add_special_tokens=False).input_ids
1519
+ if ids:
1520
+ parts.append(ids)
1521
+ tag_ids = tokenizer(m.group(), add_special_tokens=False).input_ids
1522
+ if tag_ids:
1523
+ parts.append(tag_ids)
1524
+ last_end = m.end()
1525
+ if last_end < len(text):
1526
+ segment = text[last_end:]
1527
+ ids = tokenizer(segment, add_special_tokens=False).input_ids
1528
+ if ids:
1529
+ parts.append(ids)
1530
+
1531
+ if not parts:
1532
+ result = tokenizer(text, return_tensors="pt").input_ids
1533
+ else:
1534
+ combined = []
1535
+ for p in parts:
1536
+ combined.extend(p)
1537
+ result = torch.tensor([combined], dtype=torch.long)
1538
+ return result
1539
+
1540
+
1541
  def _combine_text(text, ref_text: Optional[str] = None) -> str:
1542
 
1543
  # combine with reference text if not None
 
1546
  else:
1547
  full_text = text.strip()
1548
 
1549
+ # filter out newline / carriage-return characters
1550
+ full_text = re.sub(r"[\r\n]+", "", full_text)
1551
+
1552
+ # collapse consecutive spaces / tabs into a single space
1553
+ full_text = re.sub(r"[ \t]+", " ", full_text)
1554
 
1555
  # remove spaces around chinese characters
1556
  chinese_range = r"[\u4e00-\u9fff]"
1557
  pattern = rf"(?<={chinese_range})\s+|\s+(?={chinese_range})"
1558
  full_text = re.sub(pattern, "", full_text)
1559
 
 
 
 
 
 
 
 
 
 
 
1560
  return full_text
1561
 
1562
 
omnivoice/utils/audio.py CHANGED
@@ -42,7 +42,9 @@ def load_audio(audio_path: str, sampling_rate: int):
42
  PyTorch tensor of shape (1, T)
43
  """
44
  try:
45
- waveform, prompt_sampling_rate = torchaudio.load(audio_path)
 
 
46
  except (RuntimeError, OSError):
47
  # Fallback via pydub+ffmpeg for formats torchaudio can't handle
48
  aseg = AudioSegment.from_file(audio_path)
 
42
  PyTorch tensor of shape (1, T)
43
  """
44
  try:
45
+ waveform, prompt_sampling_rate = torchaudio.load(
46
+ audio_path, backend="soundfile"
47
+ )
48
  except (RuntimeError, OSError):
49
  # Fallback via pydub+ffmpeg for formats torchaudio can't handle
50
  aseg = AudioSegment.from_file(audio_path)