import io
import re
import base64
import numpy as np
import traceback
from typing import Optional, Tuple, Union


from TTS.utils.synthesizer import Synthesizer
from aksharamukha.transliterate import process as aksharamukha_xlit
from scipy.io.wavfile import write as scipy_wav_write


import nltk
import pysbd


from .models.common import Language
from .models.request import TTSRequest
from .models.response import AudioFile, AudioConfig, TTSResponse, TTSFailureResponse
from .utils.text import TextNormalizer
from .utils.paragraph_handler import ParagraphHandler
from src.postprocessor import PostProcessor
|
|
class TextToSpeechEngine:
    """Text-to-speech pipeline: normalise, transliterate, synthesize, post-process.

    ``models`` maps a language code (e.g. ``"hi"``) or a code-mixed key
    (e.g. ``"en+hi"``) to an object exposing ``tts(text, speaker_name=...,
    style_wav=...)``.
    """

    # Languages that are transliterated with another language's model.
    # Used both when choosing which models to load and when transliterating,
    # so the two places cannot drift apart.
    _XLIT_LANG_FALLBACKS = {"raj": "hi"}

    # A segment consisting solely of punctuation / underscores.
    _NON_SPEECH_PATTERN = re.compile(r'^[_\W]+$')

    def __init__(
        self,
        models: dict,
        allow_transliteration: bool = True,
        enable_denoiser: bool = True,
    ):
        """Set up text helpers, the optional transliterator and audio post-processing.

        Args:
            models: Mapping of language key to synthesizer.
            allow_transliteration: When False no transliteration engine is
                loaded and every transliteration step becomes a no-op.
            enable_denoiser: When True audio is passed through a ``Denoiser``
                and ``target_sr`` is 16000; otherwise ``target_sr`` equals
                ``orig_sr``.
        """
        self.models = models

        self.xlit_engine = None
        code_mixed_found = False
        if allow_transliteration:
            xlit_langs = set()
            for lang in models:
                if lang == 'en':
                    continue

                if '+' in lang:
                    # Code-mixed key such as "en+hi": the part after '+' is
                    # the language that needs transliteration.
                    lang = lang.split('+')[1]
                    code_mixed_found = True
                # Load the model that transliterate_sentence() will really ask for.
                xlit_langs.add(self._XLIT_LANG_FALLBACKS.get(lang, lang))

            # With only English loaded there is nothing to transliterate, so
            # do not build an engine for an empty language set.
            if xlit_langs:
                from ai4bharat.transliteration import XlitEngine
                self.xlit_engine = XlitEngine(xlit_langs, beam_width=6)

        self.text_normalizer = TextNormalizer()
        self.paragraph_handler = ParagraphHandler()
        self.sent_seg = pysbd.Segmenter(language="en", clean=True)

        self.orig_sr = 22050
        self.enable_denoiser = enable_denoiser
        if enable_denoiser:
            from src.postprocessor import Denoiser
            self.target_sr = 16000
            self.denoiser = Denoiser(self.orig_sr, self.target_sr)
        else:
            self.target_sr = self.orig_sr

        self.post_processor = PostProcessor(self.target_sr)

        # Always defined so that code-mixed requests degrade gracefully
        # (text left untouched) when the spell checker was not loaded.
        self.enchant_dicts = None
        self.enchant_tokenizer = None
        if code_mixed_found:
            import enchant
            from enchant.tokenize import get_tokenizer

            self.enchant_dicts = {
                "en_US": enchant.Dict("en_US"),
                "en_GB": enchant.Dict("en_GB"),
            }
            self.enchant_tokenizer = get_tokenizer("en")

    def _resolve_lang(self, lang: str) -> str:
        """Map ``"en"`` to ``"en+hi"`` when only the code-mixed model is loaded."""
        if lang == "en" and lang not in self.models and "en+hi" in self.models:
            return "en+hi"
        return lang

    def concatenate_chunks(
        self,
        wav: Optional[np.ndarray],
        wav_chunk: Union[np.ndarray, list],
    ) -> np.ndarray:
        """Append ``wav_chunk`` to ``wav``; ``wav`` may be None for the first chunk."""
        wav_chunk = np.asarray(wav_chunk)
        if wav is None:
            return wav_chunk
        return np.concatenate([wav, wav_chunk])

    def infer_from_request(
        self,
        request: "TTSRequest",
        transliterate_roman_to_native: bool = True
    ) -> "Union[TTSResponse, TTSFailureResponse]":
        """Synthesize every input sentence of ``request``.

        Returns:
            A ``TTSResponse`` holding one base64-encoded WAV per input
            sentence, or a ``TTSFailureResponse`` when the language or the
            language/speaker combination is not supported.
        """
        config = request.config
        lang = self._resolve_lang(config.language.sourceLanguage)
        gender = config.gender

        if lang not in self.models:
            return TTSFailureResponse(status_text="Unsupported language!")

        if lang == "brx" and gender == "male":
            return TTSFailureResponse(status_text="Sorry, `male` speaker not supported for this language!")

        output_list = []
        for sentence in request.input:
            raw_audio = self.infer_from_text(
                sentence.source,
                lang,
                gender,
                transliterate_roman_to_native=transliterate_roman_to_native,
            )

            byte_io = io.BytesIO()
            scipy_wav_write(byte_io, self.target_sr, raw_audio)

            # getvalue() returns the whole buffer regardless of where the
            # writer left the stream position.
            encoded_string = base64.b64encode(byte_io.getvalue()).decode()
            output_list.append(AudioFile(audioContent=encoded_string))

        audio_config = AudioConfig(language=Language(sourceLanguage=lang))
        return TTSResponse(audio=output_list, config=audio_config)

    def infer_from_text(
        self,
        input_text: str,
        lang: str,
        speaker_name: str,
        transliterate_roman_to_native: bool = True
    ) -> np.ndarray:
        """Synthesize ``input_text`` paragraph by paragraph and join the audio.

        Returns:
            The concatenated waveform. An empty array is returned when the
            text contains nothing speakable, so callers always get an
            ndarray that can be written out.
        """
        lang = self._resolve_lang(lang)
        input_text, primary_lang, _ = self.parse_langs_normalise_text(input_text, lang)

        wav = None
        for paragraph in self.paragraph_handler.split_text(input_text):
            paragraph = self.handle_transliteration(paragraph, primary_lang, transliterate_roman_to_native)

            # Re-join the sentences, dropping blank and punctuation-only ones.
            sentences = (sent.strip() for sent in self.sent_seg.segment(paragraph))
            paragraph = " ".join(
                sent for sent in sentences
                if sent and not self._NON_SPEECH_PATTERN.match(sent)
            )
            if not paragraph:
                # Nothing speakable left; do not ask the model to
                # synthesize an empty string.
                continue

            wav_chunk = self.models[lang].tts(paragraph, speaker_name=speaker_name, style_wav="")
            wav_chunk = self.postprocess_audio(wav_chunk, primary_lang, speaker_name)
            wav = self.concatenate_chunks(wav, wav_chunk)

        if wav is None:
            return np.zeros(0, dtype=np.float32)
        return wav

    def parse_langs_normalise_text(self, input_text: str, lang: str) -> Tuple[str, str, Optional[str]]:
        """Split ``lang`` into primary/secondary language and normalise the text.

        Returns:
            ``(text, primary_lang, secondary_lang)``; ``secondary_lang`` is
            None unless the language is the code-mixed ``"en+hi"``.
        """
        lang = self._resolve_lang(lang)

        if lang == "en+hi":
            primary_lang, secondary_lang = lang.split('+')
        else:
            primary_lang, secondary_lang = lang, None

        input_text = self.text_normalizer.normalize_text(input_text, primary_lang)
        if secondary_lang:
            # Words that are not English are written in the secondary
            # language's script before synthesis.
            input_text = self.transliterate_native_words_using_spell_checker(input_text, secondary_lang)

        return input_text, primary_lang, secondary_lang

    def handle_transliteration(self, input_text: str, primary_lang: str, transliterate_roman_to_native: bool) -> str:
        """Apply the per-language script conversions to ``input_text``."""
        if transliterate_roman_to_native and primary_lang != 'en':
            input_text = self.transliterate_sentence(input_text, primary_lang)

        if primary_lang == "mni":
            # Convert Meetei Mayek script to Bengali script
            # (presumably the script the mni model was trained on; confirm).
            input_text = aksharamukha_xlit("MeeteiMayek", "Bengali", input_text)
        return input_text

    def preprocess_text(
        self,
        input_text: str,
        lang: str,
        transliterate_roman_to_native: bool = True
    ) -> str:
        """Run normalisation and transliteration only, returning the text."""
        input_text, primary_lang, _ = self.parse_langs_normalise_text(input_text, lang)
        return self.handle_transliteration(input_text, primary_lang, transliterate_roman_to_native)

    def postprocess_audio(self, wav_chunk, primary_lang, speaker_name):
        """Denoise (when enabled) and post-process one synthesized chunk."""
        if self.enable_denoiser:
            wav_chunk = self.denoiser.denoise(wav_chunk)
        return self.post_processor.process(wav_chunk, primary_lang, speaker_name)

    def transliterate_native_words_using_spell_checker(self, input_text, lang):
        """Transliterate the non-English words of ``input_text`` into ``lang``.

        A word is kept as-is when either English dictionary accepts it,
        unless it is tagged as a proper noun (``NNP``/``NNPS``), in which case
        it is transliterated regardless. Only the part of a word before its
        first apostrophe is transliterated.

        Returns ``input_text`` unchanged when no transliteration engine or
        spell checker is loaded.
        """
        if not self.xlit_engine or self.enchant_tokenizer is None or self.enchant_dicts is None:
            return input_text

        # Each token is indexable as (word, offset of the word in input_text).
        tokens = list(self.enchant_tokenizer(input_text))
        if not tokens:
            return input_text
        pos_tags = [tagged[1] for tagged in nltk.tag.pos_tag([token[0] for token in tokens])]

        replacements = []
        for token, pos_tag in zip(tokens, pos_tags):
            word, offset = token[0], token[1]
            is_proper_noun = pos_tag in ("NNP", "NNPS")
            if not is_proper_noun and (
                self.enchant_dicts["en_US"].check(word)
                or self.enchant_dicts["en_GB"].check(word)
            ):
                continue

            stem = word.split("'")[0]
            if not stem:
                continue
            replacements.append(
                (offset, offset + len(stem), self.transliterate_sentence(stem, lang))
            )

        # Splice by position: a plain str.replace() would hit the first
        # substring match, which may sit inside an earlier English word.
        return self._splice_replacements(input_text, replacements)

    @staticmethod
    def _splice_replacements(text, replacements):
        """Rebuild ``text`` with each ``(start, end, new_text)`` span swapped in.

        ``replacements`` must be sorted by ``start`` and must not overlap.
        """
        pieces = []
        cursor = 0
        for start, end, new_text in replacements:
            pieces.append(text[cursor:start])
            pieces.append(new_text)
            cursor = end
        pieces.append(text[cursor:])
        return "".join(pieces)

    def transliterate_sentence(self, input_text, lang):
        """Transliterate ``input_text`` into ``lang``; a no-op without an engine."""
        if not self.xlit_engine:
            return input_text

        lang = self._XLIT_LANG_FALLBACKS.get(lang, lang)
        return self.xlit_engine.translit_sentence(input_text, lang)
|
|