Spaces:
Running on Zero
update to 0.1.2 version
Browse files
- app.py +7 -0
- omnivoice/cli/demo.py +2 -2
- omnivoice/models/omnivoice.py +26 -1
- omnivoice/training/trainer.py +11 -0
app.py
CHANGED

@@ -4,9 +4,16 @@ HuggingFace Space entry point for OmniVoice demo.
 
 """
 
+import logging
 import os
 from typing import Any, Dict
 
+logging.basicConfig(
+    level=logging.DEBUG,
+    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
+)
+logging.getLogger("omnivoice").setLevel(logging.DEBUG)
+
 import numpy as np
 import spaces
 import torch
omnivoice/cli/demo.py
CHANGED

@@ -243,8 +243,8 @@ def build_demo(
     def _gen_settings():
         with gr.Accordion("Generation Settings (optional)", open=False):
             sp = gr.Slider(
-                0.,
-                1.,
+                0.5,
+                1.5,
                 value=1.0,
                 step=0.05,
                 label="Speed",
omnivoice/models/omnivoice.py
CHANGED

@@ -1056,7 +1056,7 @@ class OmniVoice(PreTrainedModel):
         # Build style tokens: <|denoise|> + <|lang_start|>...<|lang_end|>
         # + <|instruct_start|>...<|instruct_end|>
         style_text = ""
-        if denoise:
+        if denoise and ref_audio_tokens is not None:
             style_text += "<|denoise|>"
         lang_str = lang if lang else "None"
         instruct_str = instruct if instruct else "None"

@@ -1131,6 +1131,17 @@
 
         B = task.batch_size
 
+        for i in range(B):
+            logger.debug(
+                "Item %d — text: %s | ref_text: %s | instruct: %s | lang: %s | target_tokens: %d",
+                i,
+                task.texts[i],
+                task.ref_texts[i],
+                task.instructs[i],
+                task.langs[i],
+                task.target_lens[i],
+            )
+
         inputs_list = [
             self._prepare_inference_inputs(
                 task.texts[i],

@@ -1173,6 +1184,9 @@
             batch_input_ids[B + i, :, :u_len] = inp["input_ids"][..., -u_len:]
             batch_audio_mask[B + i, :u_len] = inp["audio_mask"][..., -u_len:]
             batch_attention_mask[B + i, :, :u_len, :u_len] = True
+            if max_c_len > u_len:
+                pad_diag = torch.arange(u_len, max_c_len, device=self.device)
+                batch_attention_mask[B + i, :, pad_diag, pad_diag] = True
 
         tokens = torch.full(
             (B, self.config.num_audio_codebook, max(task.target_lens)),

@@ -1491,6 +1505,17 @@ def _combine_text(text, ref_text: Optional[str] = None) -> str:
     chinese_range = r"[\u4e00-\u9fff]"
     pattern = rf"(?<={chinese_range})\s+|\s+(?={chinese_range})"
     full_text = re.sub(pattern, "", full_text)
+
+    # Remove whitespace immediately before special emotion tags (except
+    # [laughter]). During training these tags have no preceding space, so
+    # the text tokenizer would mis-tokenise them if spaces were present.
+    _EMOTION_TAGS = (
+        r"sigh|confirmation-en|question-en|question-ah|question-oh|"
+        r"question-ei|question-yi|surprise-ah|surprise-oh|surprise-wa|"
+        r"surprise-yo|dissatisfaction-hnn"
+    )
+    full_text = re.sub(rf"\s+(\[({_EMOTION_TAGS})\])", r"\1", full_text)
+
     return full_text
 
 
omnivoice/training/trainer.py
CHANGED

@@ -45,6 +45,14 @@ from omnivoice.training.checkpoint import save_checkpoint as engine_save_checkpoint
 logger = logging.getLogger(__name__)
 
 
+def _to_device(batch, device):
+    """Move all tensors in a batch dict to the target device."""
+    return {
+        k: v.to(device, non_blocking=True) if isinstance(v, torch.Tensor) else v
+        for k, v in batch.items()
+    }
+
+
 class OmniTrainer:
     def __init__(
         self,

@@ -211,6 +219,7 @@ class OmniTrainer:
 
         with torch.no_grad():
             for eval_batch in self.eval_dataloader:
+                eval_batch = _to_device(eval_batch, self.accelerator.device)
                 outputs = self.model(**eval_batch)
                 local_loss_sum += outputs.loss.detach()
                 eval_count += 1

@@ -269,6 +278,8 @@ class OmniTrainer:
         train_iterator = iter(self.train_dataloader)
         batch = next(train_iterator)
 
+        batch = _to_device(batch, self.accelerator.device)
+
        with self.accelerator.accumulate(self.model):
             outputs = self.model(**batch)
             loss = outputs.loss