| from turtle import title |
| import gradio as gr |
|
|
| import git |
| import os |
# Runtime bootstrap: fetch the Coqui-TTS fork and install it (plus a pinned
# torchaudio) before anything from the TTS package is imported further down.
# Exit codes are not checked, so a failing step does not stop the script here.
for _bootstrap_cmd in (
    'git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS',
    'pip install -q -e TTS/',
    'pip install -q torchaudio==0.9.0',
):
    os.system(_bootstrap_cmd)


import sys

# Put the cloned checkout on the module search path.
TTS_PATH = "TTS/"
sys.path.append(TTS_PATH)
|
|
| import os |
| import string |
| import time |
| import argparse |
| import json |
|
|
| import numpy as np |
| import IPython |
| from IPython.display import Audio |
|
|
|
|
| import torch |
|
|
| from TTS.tts.utils.synthesis import synthesis |
| |
# Single import: a fallback that repeats the exact same import cannot succeed
# where the first attempt failed, and a bare ``except`` would only hide the
# original traceback.
from TTS.utils.audio import AudioProcessor
|
|
|
|
| from TTS.tts.models import setup_model |
| from TTS.config import load_config |
| from TTS.tts.models.vits import * |
|
|
# Directory that receives the synthesized audio files.
OUT_PATH = 'out/'
os.makedirs(OUT_PATH, exist_ok=True)

# Checkpoint, config and lookup tables used to build the TTS model.
MODEL_PATH = '/home/user/app/best_model_latest.pth.tar'
CONFIG_PATH = '/home/user/app/config.json'
TTS_LANGUAGES = "/home/user/app/language_ids.json"
TTS_SPEAKERS = "/home/user/app/speakers.json"
USE_CUDA = torch.cuda.is_available()

# Load the model configuration and build the audio processor from its
# ``audio`` section.
C = load_config(CONFIG_PATH)
ap = AudioProcessor(**C.audio)

speaker_embedding = None

# Override two model arguments before the model is constructed.
C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)

# The checkpoint is always deserialized onto the CPU; the model is moved to
# the GPU afterwards when one is available.
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))

# Work on a copy of the stored weights and drop every "speaker_encoder" entry
# before loading (presumably because the model built above, with
# use_speaker_encoder_as_loss disabled, has no such parameters -- TODO confirm).
model_weights = cp['model'].copy()
_encoder_keys = [name for name in model_weights if "speaker_encoder" in name]
for _encoder_key in _encoder_keys:
    del model_weights[_encoder_key]

model.load_state_dict(model_weights)

# Inference only.
model.eval()

if USE_CUDA:
    model = model.cuda()

use_griffin_lim = False
|
|
# Extra runtime dependencies; installed before pydub is imported just below.
os.system('pip install -q pydub ffmpeg-normalize')

from TTS.tts.utils.speakers import SpeakerManager
from pydub import AudioSegment
import librosa

# Speaker-encoder config and checkpoint, given as relative paths.
CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"

# Speaker manager used by the request handler to compute a d-vector from a
# reference clip.
SE_speaker_manager = SpeakerManager(
    encoder_config_path=CONFIG_SE_PATH,
    encoder_model_path=CHECKPOINT_SE_PATH,
    use_cuda=USE_CUDA,
)
|
|
def compute_spec(ref_file):
    """Return the spectrogram of ``ref_file`` as a float tensor.

    The audio is loaded with librosa at the audio processor's sample rate,
    passed through ``ap.spectrogram`` and wrapped in a ``torch.FloatTensor``
    with an extra leading dimension of size 1.
    """
    waveform, _ = librosa.load(ref_file, sr=ap.sample_rate)
    return torch.FloatTensor(ap.spectrogram(waveform)).unsqueeze(0)
| |
|
|
| |
def greet(Text, Voicetoclone, VoiceMicrophone):
    """Synthesize ``Text`` in the voice of a reference recording.

    The uploaded clip (``Voicetoclone``) takes precedence; the microphone
    recording (``VoiceMicrophone``) is used only when no file was uploaded.

    Args:
        Text: Text to speak; must be non-empty and at most 2000 characters.
        Voicetoclone: Path of the uploaded reference audio file, or None.
        VoiceMicrophone: Path of the microphone recording, or None.

    Returns:
        Path of the generated WAV file inside ``OUT_PATH``.

    Raises:
        ValueError: If the text is empty or too long, no reference audio was
            supplied, the reference file does not exist, or it is larger than
            30 MB. A regular exception is used (not ``SystemExit``) so a bad
            request is reported as an error instead of asking the interpreter
            to exit.
    """
    import subprocess  # local import keeps this block self-contained

    max_text_chars = 2000
    max_reference_bytes = 30 * 1000 * 1000  # the "30mb" limit shown in the UI

    # --- validate the text -------------------------------------------------
    text = "" if Text is None else str(Text)
    if not text.strip():
        raise ValueError("No text was provided. Please enter the text to synthesize.")
    if len(text) > max_text_chars:
        raise ValueError(
            "Text inserted is longer than 2000 characters. Please re-try with a shorter text."
        )

    # --- pick and validate the reference clip ------------------------------
    reference = Voicetoclone if Voicetoclone is not None else VoiceMicrophone
    if reference is None:
        raise ValueError(
            "No reference voice was provided. Please upload or record a voice to clone."
        )
    reference_file = str(reference)
    print("path url")
    print(reference_file)

    if not os.path.isfile(reference_file):
        raise ValueError("Reference audio file not found: {}".format(reference_file))
    # Measure the file on disk (not the path string) against the limit.
    if os.path.getsize(reference_file) > max_reference_bytes:
        raise ValueError("File is greater than 30mb. Please re-try with a smaller file.")

    # --- normalize the reference clip in place (best effort) ---------------
    # The arguments are passed as a list so the path is handed to the tool
    # verbatim, with no shell expansion or quoting issues.
    normalize_cmd = [
        "ffmpeg-normalize", reference_file,
        "-nt", "rms", "-t=-27",
        "-o", reference_file,
        "-ar", "16000", "-f",
    ]
    try:
        completed = subprocess.run(normalize_cmd, check=False)
    except OSError as err:
        # e.g. the ffmpeg-normalize executable is not installed
        print(" > Could not run ffmpeg-normalize ({}); using the clip as-is".format(err))
    else:
        if completed.returncode != 0:
            print(" > ffmpeg-normalize exited with code {}; using the clip as-is".format(
                completed.returncode))

    # --- compute the speaker embedding and synthesize ----------------------
    reference_emb = SE_speaker_manager.compute_d_vector_from_clip(reference_file)

    # Inference settings on the shared model object (duration scale and the two
    # noise scales; see the Coqui VITS arguments for their exact meaning).
    model.length_scale = 1
    model.inference_noise_scale = 0.3
    model.inference_noise_scale_dp = 0.3
    # Fixed language id; presumably the first entry of language_ids.json --
    # TODO confirm which language this maps to.
    language_id = 0

    print(" > text: {}".format(text))
    wav, _, _, _ = synthesis(
        model,
        text,
        C,
        "cuda" in str(next(model.parameters()).device),
        ap,
        speaker_id=None,
        d_vector=reference_emb,
        style_wav=None,
        language_id=language_id,
        enable_eos_bos_chars=C.enable_eos_bos_chars,
        use_griffin_lim=True,
        do_trim_silence=False,
    ).values()
    print("Generated Audio")

    # --- write the result --------------------------------------------------
    out_path = os.path.join(OUT_PATH, "Audio.wav")
    print(" > Saving output to {}".format(out_path))
    ap.save_wav(wav, out_path)
    return out_path
|
|
# Gradio UI: a text box plus two alternative voice sources (file upload and
# microphone), wired to ``greet``; the returned file path is rendered as audio.
# ``gr.Textbox`` is used instead of the deprecated ``gr.inputs.Textbox`` so all
# three inputs use the same component API as ``gr.Audio``.
demo = gr.Interface(
    fn=greet,
    inputs=[
        gr.Textbox(label='What would you like the voice to say? (max. 2000 characters per request)'),
        gr.Audio(type="filepath", source="upload", label='Please upload a voice to clone (max. 30mb)'),
        gr.Audio(source="microphone", type="filepath", streaming=True),
    ],
    outputs="audio",
    title="Bilal's Voice Cloning Tool",
)
demo.launch()