| |
| |
| |
|
|
| import gradio as gr |
| import os |
| import re |
| import soundfile as sf |
|
|
| import json |
| import nltk |
| from underthesea import sent_tokenize as vie_sent_tokenize |
| from underthesea import text_normalize as vie_text_normalize |
| from nltk import sent_tokenize as nltk_sent_tokenize |
| from ttsmms import download |
| from ttsmms import TTS |
|
|
| from collections import OrderedDict |
| import uuid |
| import datetime |
| import shutil |
| from num2words import num2words |
|
|
|
|
# Markdown shown in the Gradio interface header (rendered below the title).
this_description = """Text To Speech for [1000+ languages](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) - using [fairseq MMS TTS](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md) and [ttsmms](https://github.com/wannaphong/ttsmms) wrapper.
Please note that for some languages, it may not pronounce all words correctly (yet).
"""
|
|
# Fetch the NLTK sentence tokenizer data used by nltk_sent_tokenize below.
nltk.download("punkt")

# Pre-download the most commonly requested MMS checkpoints so the first
# request for these languages does not pay the download latency.
tts_models = {}
for _code in ("eng", "vie", "mya"):
    tts_models[_code] = download(_code, "./data")

# Keep the original per-language path names for any external references.
eng_path = tts_models["eng"]
vie_path = tts_models["vie"]
mya_path = tts_models["mya"]
|
|
# Map "Language name (iso)" display labels to ISO 639-3 codes, loaded from
# the tab-separated lang_code.txt shipped with the app. OrderedDict keeps
# the dropdown in file order.
lang_codes = OrderedDict()
with open("lang_code.txt", "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        # Skip separator rows ("----") and blank lines; the original
        # crashed with ValueError on lines without a tab.
        if not line or line.startswith("----"):
            continue
        iso, lang = line.split("\t", 1)
        lang_codes[lang + " (" + iso + ")"] = iso

# Dropdown choices. (The original also assigned this BEFORE the dict was
# populated — a dead, misleading statement, removed here.)
language_names = list(lang_codes.keys())
|
|
| |
| with open("num2words_lang_map.json") as f: |
| num2words_lang_map = json.load(f, object_pairs_hook=OrderedDict) |
|
|
|
|
def convert_numbers_to_words_num2words(text, lang):
    """Spell out every digit run in *text* using num2words.

    Args:
        text: Input string, possibly containing integers written in digits.
        lang: MMS language code; must be a key of num2words_lang_map, whose
            first element is the num2words locale code.

    Returns:
        The text with each digit run replaced by its spelled-out form.
    """
    locale = num2words_lang_map[lang][0]
    # re.sub converts each match in place. The original collected numbers
    # and called str.replace per number, which could rewrite unrelated
    # occurrences elsewhere in the text (e.g. a standalone "2" and the "2"
    # inside another number); substitution by match position cannot.
    return re.sub(r"\d+", lambda m: num2words(int(m.group()), lang=locale), text)
|
|
|
|
def convert_mya_numbers_to_words(text):
    """Replace Burmese numerals in *text* with their spelled-out words."""
    from mm_num2word import mm_num2word, extract_num

    # Longest numbers first, so a shorter number that is a substring of a
    # longer one is not replaced prematurely.
    found = sorted(extract_num(text), key=len, reverse=True)
    print(found)

    for token in found:
        text = text.replace(token, mm_num2word(token))
    return text
|
|
|
|
def prepare_sentences(text, lang="mya"):
    """Normalize *text* and split it into a list of sentences for TTS.

    Burmese input gets numeral spelling and punctuation mapping; languages
    covered by num2words get digit spelling; Vietnamese uses underthesea's
    tokenizer + normalizer, everything else falls back to NLTK.
    """
    if lang.lower() == "mya":
        # Spell out Burmese numerals, then map Myanmar section marks
        # (U+104A, U+104B) onto Latin comma and full stop.
        text = convert_mya_numbers_to_words(text)
        text = text.replace("\u104A", ",").replace("\u104B", ".")

    if lang in num2words_lang_map:
        print("num2words supports this lang", lang)
        text = convert_numbers_to_words_num2words(text, lang)
        print("Processed text", text)

    # MMS models are trained on lowercase text.
    text = text.lower()

    # Drop empty paragraphs before sentence splitting.
    paragraphs = [chunk for chunk in text.split("\n") if chunk.strip()]

    if lang.lower() == "vie":
        normalized = []
        for chunk in paragraphs:
            for raw in vie_sent_tokenize(chunk):
                if raw.strip():
                    normalized.append(vie_text_normalize(raw))
        return normalized

    return [
        candidate
        for chunk in paragraphs
        for candidate in nltk_sent_tokenize(chunk)
        if candidate.strip()
    ]
|
|
|
|
def list_dir(lang):
    """Debug helper: log the .wav files present in the current directory.

    Prints the working directory, the number of .wav files, and — when any
    exist — the lexicographically last file name together with *lang*.

    Args:
        lang: Language code, printed alongside the newest file for tracing.
    """
    current_dir = os.getcwd()
    print(current_dir)

    wav_files = [name for name in os.listdir(current_dir) if name.endswith(".wav")]
    print("Total wav files:", len(wav_files))

    # Guard: the original indexed sorted(...)[-1] unconditionally, raising
    # IndexError when no combined output existed yet.
    if wav_files:
        print(lang, max(wav_files))
|
|
|
|
def combine_wav(source_dir, stamp, lang):
    """Concatenate all per-sentence clips in *source_dir* into one wav file.

    Args:
        source_dir: Directory holding the per-sentence clips; deleted after
            the combined file is written.
        stamp: Timestamp string used to name the combined file.
        lang: Language code appended to the output file name.

    Returns:
        Path of the combined file, "{stamp}_{lang}.wav".

    Raises:
        ValueError: If *source_dir* contains no .wav files. (The original
            hit a NameError on the undefined sample rate in that case.)
    """
    # Sort so clips play back in synthesis order (names are zero-padded).
    wav_files = sorted(
        name for name in os.listdir(source_dir) if name.endswith(".wav")
    )
    if not wav_files:
        raise ValueError(f"no .wav files found in {source_dir}")

    combined_data = []
    sample_rate = None
    for name in wav_files:
        data, sample_rate = sf.read(os.path.join(source_dir, name))
        combined_data.extend(data)

    combined_file_path = f"{stamp}_{lang}.wav"
    # All clips come from the same TTS model, so the last sample rate read
    # is assumed valid for every clip — TODO confirm against ttsmms output.
    sf.write(combined_file_path, combined_data, sample_rate)

    # The per-sentence clips are no longer needed once combined.
    shutil.rmtree(source_dir)
    list_dir(lang)

    return combined_file_path
|
|
|
|
def mms_tts(Input_Text, lang_name="Burmese (mya)"):
    """Synthesize speech for *Input_Text* in the selected language.

    Args:
        Input_Text: Text to synthesize; may contain many sentences.
        lang_name: Dropdown label of the form "Language (iso)". Unknown
            labels fall back to Burmese ("mya").

    Returns:
        Path to the combined wav file covering the whole input.
    """
    try:
        lang_code = lang_codes[lang_name]
    except KeyError:
        lang_code = "mya"

    # Consult the module-level cache first: the original re-ran download()
    # on every request even though tts_models was pre-populated for exactly
    # this purpose.
    if lang_code not in tts_models:
        tts_models[lang_code] = download(lang_code, "./data")
    tts = TTS(tts_models[lang_code])

    sentences = prepare_sentences(Input_Text, lang_code)

    # Per-request working directory; microsecond timestamp (plus a UUID on
    # the unlikely collision) keeps concurrent sessions apart.
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
    user_dir = f"u_{timestamp}"
    if os.path.exists(user_dir):
        session_id = str(uuid.uuid4())
        user_dir = f"u_{session_id}_{timestamp}"
    os.makedirs(user_dir, exist_ok=True)
    print("New user directory", user_dir)

    # Zero-padded indices keep the clips in synthesis order when
    # combine_wav sorts the file names lexicographically.
    for i, sentence in enumerate(sentences):
        tts.synthesis(sentence, wav_path=f"{user_dir}/s_{str(i).zfill(10)}.wav")
    return combine_wav(user_dir, timestamp, lang_code)
|
|
|
|
| |
# Build the web UI: a multi-line text box and a language dropdown feeding
# mms_tts, with a single audio player as output.
_text_input = gr.Textbox(
    lines=5,
    placeholder="Enter text (unlimited sentences)",
    label="Input text (unlimited sentences)",
)
_lang_input = gr.Dropdown(
    choices=language_names,
    label="Select language 1,000+",
    value="Burmese (mya)",
)
iface = gr.Interface(
    fn=mms_tts,
    title="Massively Multilingual Speech (MMS) - Text To Speech",
    description=this_description,
    inputs=[_text_input, _lang_input],
    outputs="audio",
)
| |
| |
| |
| |
|
|
|
|
# Start the Gradio server (blocking call).
iface.launch()
|
|