Spaces:

openpecha
/

TTS

Runtime error

App Files Files Community

TTS / handle.py

TenzinGayche

Create handle.py

4629510 over 2 years ago

raw

history blame contribute delete

2.32 kB

	from typing import Dict
	import librosa
	import numpy as np
	import torch
	import pyewts
	import noisereduce as nr
	from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
	from num2tib.core import convert
	from num2tib.core import convert2text
	import re
	# Shared pyewts converter instance; EndpointHandler.__call__ passes the
	# incoming text through its toWylie() method before any other cleanup.
	converter = pyewts.pyewts()
	def replace_numbers_with_convert(sentence, wylie=True):
	    """Substitute every number in ``sentence`` with ``convert``'s rendering.

	    A number is a run of digits, optionally followed by a decimal point
	    and more digits. Each matched substring is handed to ``convert``
	    together with ``wylie``, and whatever ``convert`` returns takes the
	    place of the match.

	    Args:
	        sentence: Text that may contain numbers.
	        wylie: Forwarded unchanged as the second argument to ``convert``.

	    Returns:
	        ``sentence`` with every matched number replaced.
	    """
	    number_re = re.compile(r'\d+(\.\d+)?')
	    return number_re.sub(lambda found: convert(found.group(), wylie), sentence)

	def cleanup_text(inputs):
	    """Run ``inputs`` through every substitution listed in ``replacements``.

	    The pairs are applied in list order, each one replacing all
	    occurrences of its first element with its second, so a later pair
	    sees the output of the earlier ones.

	    Args:
	        inputs: The string to clean.

	    Returns:
	        The string after all substitutions have been applied.
	    """
	    cleaned = inputs
	    for pair in replacements:
	        old, new = pair
	        cleaned = cleaned.replace(old, new)
	    return cleaned

	# Maps a voice label to the path of the .npy file holding that voice's
	# speaker embedding. EndpointHandler.__call__ loads the "Lhasa(female)"
	# entry with np.load on every request.
	speaker_embeddings = {
	"Lhasa(female)": "female_2.npy",

	}

	# Ordered (src, dst) substitutions that cleanup_text applies one after
	# another. Order matters: each pair operates on the output of the pairs
	# before it.
	replacements = [
	    ('_', '_'),  # identity mapping: underscores pass through unchanged
	    ('*', 'v'),
	    ('`', ';'),
	    ('~', ','),
	    ('+', ','),
	    ('\\', ';'),
	    # Plain pipe. This entry used to be written '\|', an invalid escape
	    # sequence that spells backslash+pipe; that could never match because
	    # the entry above has already rewritten every backslash.
	    ('|', ';'),
	    ('╚', ''),
	    ('╗', ''),
	]





	class EndpointHandler():
	    """Text-to-speech request handler built on a SpeechT5 checkpoint.

	    Construction loads the processor, the acoustic model and the HiFi-GAN
	    vocoder; calling the instance turns the request text into a waveform.
	    """

	    # Sample rate reported with every waveform and passed to noisereduce.
	    SAMPLE_RATE = 16000

	    def __init__(self, path=""):
	        """Load the processor, model and vocoder and place them on a device.

	        Args:
	            path: Not used by this handler; accepted so the constructor can
	                still be called with a path argument.
	        """
	        # Fall back to CPU instead of crashing when no GPU is present.
	        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	        # load the model
	        self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
	        self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
	        self.model.to(self.device)
	        # Move the vocoder once here rather than on every request.
	        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
	        self.vocoder.to(self.device)

	    def __call__(self, data: Dict[str, object]) -> tuple:
	        """Synthesize speech for the text carried in ``data["inputs"]``.

	        The text is converted with ``converter.toWylie``, cleaned with
	        ``cleanup_text``, has its numbers rewritten by
	        ``replace_numbers_with_convert``, and is then tokenized and fed to
	        the model. The generated audio is passed through noisereduce.

	        Args:
	            data: Request payload. The text to speak is read from the
	                ``"inputs"`` key; a missing key is treated as empty text.

	        Returns:
	            A ``(sample_rate, waveform)`` tuple. ``sample_rate`` is always
	            16000. ``waveform`` is a NumPy array: an empty ``int16`` array
	            when the text is blank, otherwise the noise-reduced model
	            output.

	        Raises:
	            TypeError: If ``data["inputs"]`` is present but not a string.
	        """
	        # process input
	        text = data.get("inputs", "")
	        if not isinstance(text, str):
	            raise TypeError(
	                f'data["inputs"] must be a str, got {type(text).__name__}'
	            )
	        if not text.strip():
	            return (self.SAMPLE_RATE, np.zeros(0).astype(np.int16))

	        text = converter.toWylie(text)
	        text = cleanup_text(text)
	        text = replace_numbers_with_convert(text)
	        inputs = self.processor(text=text, return_tensors="pt")

	        # limit input length
	        input_ids = inputs["input_ids"][..., :self.model.config.max_text_positions]

	        speaker_embedding = torch.tensor(np.load(speaker_embeddings['Lhasa(female)']))
	        # generate_speech expects a leading batch dimension; add it when the
	        # stored embedding is a flat vector.
	        if speaker_embedding.dim() == 1:
	            speaker_embedding = speaker_embedding.unsqueeze(0)

	        with torch.no_grad():
	            speech = self.model.generate_speech(
	                input_ids.to(self.device),
	                speaker_embedding.to(self.device),
	                vocoder=self.vocoder,
	            )

	        # noisereduce works on NumPy arrays, so leave torch before calling it.
	        speech = nr.reduce_noise(y=speech.detach().cpu().numpy(), sr=self.SAMPLE_RATE)
	        return (self.SAMPLE_RATE, speech)