zhu-han committed on
Commit
c1079c1
·
1 Parent(s): 6179998

update to 0.1.2 version

Browse files
app.py CHANGED
@@ -4,9 +4,16 @@ HuggingFace Space entry point for OmniVoice demo.
4
 
5
  """
6
 
 
7
  import os
8
  from typing import Any, Dict
9
 
 
 
 
 
 
 
10
  import numpy as np
11
  import spaces
12
  import torch
 
4
 
5
  """
6
 
7
+ import logging
8
  import os
9
  from typing import Any, Dict
10
 
11
+ logging.basicConfig(
12
+ level=logging.DEBUG,
13
+ format="%(asctime)s %(name)s %(levelname)s: %(message)s",
14
+ )
15
+ logging.getLogger("omnivoice").setLevel(logging.DEBUG)
16
+
17
  import numpy as np
18
  import spaces
19
  import torch
omnivoice/cli/demo.py CHANGED
@@ -243,8 +243,8 @@ def build_demo(
243
  def _gen_settings():
244
  with gr.Accordion("Generation Settings (optional)", open=False):
245
  sp = gr.Slider(
246
- 0.7,
247
- 1.3,
248
  value=1.0,
249
  step=0.05,
250
  label="Speed",
 
243
  def _gen_settings():
244
  with gr.Accordion("Generation Settings (optional)", open=False):
245
  sp = gr.Slider(
246
+ 0.5,
247
+ 1.5,
248
  value=1.0,
249
  step=0.05,
250
  label="Speed",
omnivoice/models/omnivoice.py CHANGED
@@ -1056,7 +1056,7 @@ class OmniVoice(PreTrainedModel):
1056
  # Build style tokens: <|denoise|> + <|lang_start|>...<|lang_end|>
1057
  # + <|instruct_start|>...<|instruct_end|>
1058
  style_text = ""
1059
- if denoise:
1060
  style_text += "<|denoise|>"
1061
  lang_str = lang if lang else "None"
1062
  instruct_str = instruct if instruct else "None"
@@ -1131,6 +1131,17 @@ class OmniVoice(PreTrainedModel):
1131
 
1132
  B = task.batch_size
1133
 
 
 
 
 
 
 
 
 
 
 
 
1134
  inputs_list = [
1135
  self._prepare_inference_inputs(
1136
  task.texts[i],
@@ -1173,6 +1184,9 @@ class OmniVoice(PreTrainedModel):
1173
  batch_input_ids[B + i, :, :u_len] = inp["input_ids"][..., -u_len:]
1174
  batch_audio_mask[B + i, :u_len] = inp["audio_mask"][..., -u_len:]
1175
  batch_attention_mask[B + i, :, :u_len, :u_len] = True
 
 
 
1176
 
1177
  tokens = torch.full(
1178
  (B, self.config.num_audio_codebook, max(task.target_lens)),
@@ -1491,6 +1505,17 @@ def _combine_text(text, ref_text: Optional[str] = None) -> str:
1491
  chinese_range = r"[\u4e00-\u9fff]"
1492
  pattern = rf"(?<={chinese_range})\s+|\s+(?={chinese_range})"
1493
  full_text = re.sub(pattern, "", full_text)
 
 
 
 
 
 
 
 
 
 
 
1494
  return full_text
1495
 
1496
 
 
1056
  # Build style tokens: <|denoise|> + <|lang_start|>...<|lang_end|>
1057
  # + <|instruct_start|>...<|instruct_end|>
1058
  style_text = ""
1059
+ if denoise and ref_audio_tokens is not None:
1060
  style_text += "<|denoise|>"
1061
  lang_str = lang if lang else "None"
1062
  instruct_str = instruct if instruct else "None"
 
1131
 
1132
  B = task.batch_size
1133
 
1134
+ for i in range(B):
1135
+ logger.debug(
1136
+ "Item %d — text: %s | ref_text: %s | instruct: %s | lang: %s | target_tokens: %d",
1137
+ i,
1138
+ task.texts[i],
1139
+ task.ref_texts[i],
1140
+ task.instructs[i],
1141
+ task.langs[i],
1142
+ task.target_lens[i],
1143
+ )
1144
+
1145
  inputs_list = [
1146
  self._prepare_inference_inputs(
1147
  task.texts[i],
 
1184
  batch_input_ids[B + i, :, :u_len] = inp["input_ids"][..., -u_len:]
1185
  batch_audio_mask[B + i, :u_len] = inp["audio_mask"][..., -u_len:]
1186
  batch_attention_mask[B + i, :, :u_len, :u_len] = True
1187
+ if max_c_len > u_len:
1188
+ pad_diag = torch.arange(u_len, max_c_len, device=self.device)
1189
+ batch_attention_mask[B + i, :, pad_diag, pad_diag] = True
1190
 
1191
  tokens = torch.full(
1192
  (B, self.config.num_audio_codebook, max(task.target_lens)),
 
1505
  chinese_range = r"[\u4e00-\u9fff]"
1506
  pattern = rf"(?<={chinese_range})\s+|\s+(?={chinese_range})"
1507
  full_text = re.sub(pattern, "", full_text)
1508
+
1509
+ # Remove whitespace immediately before special emotion tags (except
1510
+ # [laughter]). During training these tags have no preceding space, so
1511
+ # the text tokenizer would mis-tokenise them if spaces were present.
1512
+ _EMOTION_TAGS = (
1513
+ r"sigh|confirmation-en|question-en|question-ah|question-oh|"
1514
+ r"question-ei|question-yi|surprise-ah|surprise-oh|surprise-wa|"
1515
+ r"surprise-yo|dissatisfaction-hnn"
1516
+ )
1517
+ full_text = re.sub(rf"\s+(\[({_EMOTION_TAGS})\])", r"\1", full_text)
1518
+
1519
  return full_text
1520
 
1521
 
omnivoice/training/trainer.py CHANGED
@@ -45,6 +45,14 @@ from omnivoice.training.checkpoint import save_checkpoint as engine_save_checkpo
45
  logger = logging.getLogger(__name__)
46
 
47
 
 
 
 
 
 
 
 
 
48
  class OmniTrainer:
49
  def __init__(
50
  self,
@@ -211,6 +219,7 @@ class OmniTrainer:
211
 
212
  with torch.no_grad():
213
  for eval_batch in self.eval_dataloader:
 
214
  outputs = self.model(**eval_batch)
215
  local_loss_sum += outputs.loss.detach()
216
  eval_count += 1
@@ -269,6 +278,8 @@ class OmniTrainer:
269
  train_iterator = iter(self.train_dataloader)
270
  batch = next(train_iterator)
271
 
 
 
272
  with self.accelerator.accumulate(self.model):
273
  outputs = self.model(**batch)
274
  loss = outputs.loss
 
45
  logger = logging.getLogger(__name__)
46
 
47
 
48
+ def _to_device(batch, device):
49
+ """Move all tensors in a batch dict to the target device."""
50
+ return {
51
+ k: v.to(device, non_blocking=True) if isinstance(v, torch.Tensor) else v
52
+ for k, v in batch.items()
53
+ }
54
+
55
+
56
  class OmniTrainer:
57
  def __init__(
58
  self,
 
219
 
220
  with torch.no_grad():
221
  for eval_batch in self.eval_dataloader:
222
+ eval_batch = _to_device(eval_batch, self.accelerator.device)
223
  outputs = self.model(**eval_batch)
224
  local_loss_sum += outputs.loss.detach()
225
  eval_count += 1
 
278
  train_iterator = iter(self.train_dataloader)
279
  batch = next(train_iterator)
280
 
281
+ batch = _to_device(batch, self.accelerator.device)
282
+
283
  with self.accelerator.accumulate(self.model):
284
  outputs = self.model(**batch)
285
  loss = outputs.loss