| from typing import Dict |
| import librosa |
| import numpy as np |
| import torch |
| import pyewts |
| import noisereduce as nr |
| from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan |
| from num2tib.core import convert |
| from num2tib.core import convert2text |
| import re |
# Module-level pyewts converter instance, created once at import time.
# EndpointHandler.__call__ uses its toWylie() method on the incoming text.
converter = pyewts.pyewts()
def replace_numbers_with_convert(sentence, wylie=True):
    """Replace every number in *sentence* with the output of ``convert``.

    A number is a run of digits with an optional decimal part
    (for example ``12`` or ``3.14``). Each matched substring is handed to
    ``convert`` as a string, together with *wylie*.

    Args:
        sentence: Text to scan for numbers.
        wylie: Forwarded unchanged as the second argument of ``convert``.

    Returns:
        The text with each numeric match substituted; text without any
        digits is returned unchanged.
    """
    number_pattern = r'\d+(\.\d+)?'
    return re.sub(
        number_pattern,
        lambda found: convert(found.group(), wylie),
        sentence,
    )
|
|
def cleanup_text(inputs):
    """Apply each (source, target) substitution in ``replacements`` to *inputs*.

    The substitutions are applied one after another in list order, so a
    later pair operates on the output of the earlier ones.

    Args:
        inputs: The string to clean.

    Returns:
        The string after all substitutions have been applied.
    """
    cleaned = inputs
    for pair in replacements:
        old, new = pair
        cleaned = cleaned.replace(old, new)
    return cleaned
|
|
# Maps a speaker display name to the .npy file that EndpointHandler loads
# with np.load() as that speaker's embedding.
speaker_embeddings = {"Lhasa(female)": "female_2.npy"}
|
|
# Ordered (source, target) substitutions applied by cleanup_text().
# NOTE(review): ('_', '_') replaces a character with itself, and the last two
# entries are identical as written -- possibly two distinct characters were
# intended before the file was re-encoded; confirm against the original.
replacements = [
    ('_', '_'), ('*', 'v'), ('`', ';'),
    ('~', ','), ('+', ','),
    ('\\', ';'), ('|', ';'),
    ('β', ''), ('β', ''),
]
|
|
|
|
|
|
|
|
|
|
class EndpointHandler():
    """Text-to-speech request handler built on a SpeechT5 checkpoint.

    The handler transliterates the request text with ``converter.toWylie``,
    normalizes it with ``cleanup_text`` and ``replace_numbers_with_convert``,
    synthesizes a waveform with the SpeechT5 model and HiFi-GAN vocoder, and
    finally runs noise reduction on the result.
    """

    # Sample rate reported to the caller and passed to the noise reducer.
    SAMPLE_RATE = 16000

    def __init__(self, path=""):
        """Load the processor, the acoustic model and the vocoder.

        Args:
            path: Accepted for interface compatibility; it is not used, the
                checkpoints are loaded by their hard-coded hub names.
        """
        # Fall back to CPU so the handler can still be constructed on a host
        # without a GPU instead of failing inside ``.to('cuda')``.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model.to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        # Move the vocoder once here rather than on every request.
        self.vocoder.to(self.device)

    def __call__(self, data: Dict[str, object]) -> tuple:
        """Synthesize speech for the text carried in ``data["inputs"]``.

        Args:
            data: Request payload. The text to speak is read from its
                ``"inputs"`` key; a missing key is treated as empty text.
                NOTE(review): the key name follows the usual convention for
                this handler interface -- confirm against the caller.

        Returns:
            A ``(sample_rate, audio)`` tuple. ``sample_rate`` is always
            16000. ``audio`` is an empty int16 array when the text is blank,
            otherwise the array returned by ``nr.reduce_noise``.

        Raises:
            ValueError: If ``data["inputs"]`` is present but not a string.
        """
        text = data.get("inputs", "")
        if not isinstance(text, str):
            raise ValueError('data["inputs"] must be a string of text to synthesize')

        # Nothing to say: return an empty waveform without touching the model.
        if not text.strip():
            return (self.SAMPLE_RATE, np.zeros(0).astype(np.int16))

        text = converter.toWylie(text)
        text = cleanup_text(text)
        text = replace_numbers_with_convert(text)

        inputs = self.processor(text=text, return_tensors="pt")
        # Truncate to the longest sequence the model is configured to accept.
        input_ids = inputs["input_ids"][..., :self.model.config.max_text_positions]

        speaker_embedding = torch.tensor(np.load(speaker_embeddings['Lhasa(female)']))

        speech = self.model.generate_speech(
            input_ids.to(self.device),
            speaker_embedding.to(self.device),
            vocoder=self.vocoder,
        )
        # The noise reducer works on NumPy data, so convert the tensor
        # explicitly after bringing it back to the CPU.
        speech = nr.reduce_noise(y=speech.detach().cpu().numpy(), sr=self.SAMPLE_RATE)
        return (self.SAMPLE_RATE, speech)
|
|