| !pip install -U scipy |
| !git clone https://github.com/neonbjb/tortoise-tts.git |
| %cd tortoise-tts |
| !pip install -r requirements.txt |
| !python setup.py install |
| !pip install gradio |
|
|
| import os |
| import gradio as gr |
| import torchaudio |
| import time |
| from datetime import datetime |
| from tortoise.api import TextToSpeech |
| from tortoise.utils.audio import load_audio, load_voice, load_voices |
| import os |
|
|
| |
# Exported to the process environment before the UI is built. Nothing in this
# file reads it back; presumably it is consumed by a launcher outside this
# notebook -- TODO confirm it is still needed.
os.environ.update({"COMMANDLINE_ARGS": "--no-gradio-queue"})

# Special dropdown entries offered in addition to the voices on disk.
# inference() gives "custom_voice" and "disabled" special meaning.
VOICE_OPTIONS = ["random", "custom_voice", "disabled"]
|
|
def _apply_emotion_prompt(text, emotion, prompt):
    """Return ``text`` prefixed with the bracketed emotion or custom prompt, if any."""
    if emotion != "None/Custom":
        return f"[I am really {emotion.lower()},] {text}"
    if prompt.strip() != "":
        return f"[{prompt},] {text}"
    return text


def inference(text, emotion, prompt, voice, mic_audio, voice_b, voice_c, preset, seed):
    """Generate three candidate clips for ``text`` with the global ``tts`` model.

    Args:
        text: Text to synthesize.
        emotion: ``"None/Custom"`` or an emotion name; a named emotion is
            prepended to the text as a bracketed prompt.
        prompt: Free-form prompt, used only when ``emotion`` is
            ``"None/Custom"`` and the prompt is not blank.
        voice: Primary voice name, or ``"custom_voice"`` to use ``mic_audio``.
        mic_audio: File path of the recorded reference clip, or ``None``.
        voice_b: Optional second voice; ``"disabled"`` to skip.
        voice_c: Optional third voice; ``"disabled"`` to skip.
        preset: Preset name forwarded to ``tts.tts_with_preset``.
        seed: Seed forwarded as ``use_deterministic_seed``.

    Returns:
        A 4-tuple for the four ``gr.Audio`` outputs. The first item is
        ``(22050, samples)`` for the first reference clip, or ``None`` when no
        reference clip exists. The other three are ``(24000, samples)`` for
        the generated candidates.

    Raises:
        gr.Error: If no voice is selected, or if ``custom_voice`` is selected
            without a recording.
    """
    # Every slot may hold "disabled" or "custom_voice". Neither is passed to
    # the voice loaders as if it were a real voice name.
    selected = [name for name in (voice, voice_b, voice_c) if name != "disabled"]
    if not selected:
        raise gr.Error("Please select at least one voice")
    use_custom = "custom_voice" in selected
    voices = [name for name in selected if name != "custom_voice"]

    text = _apply_emotion_prompt(text, emotion, prompt)

    custom_clip = None
    if use_custom:
        if mic_audio is None:
            raise gr.Error("Please provide audio from mic when choosing custom voice")
        custom_clip = load_audio(mic_audio, 22050)

    # Resolve the named voices. The branch depends on how many *named* voices
    # remain, so "custom_voice" plus one extra voice keeps the extra voice.
    if not voices:
        voice_samples, conditioning_latents = None, None
    elif len(voices) == 1:
        voice_samples, conditioning_latents = load_voice(voices[0])
    else:
        voice_samples, conditioning_latents = load_voices(voices)

    if custom_clip is not None:
        # NOTE(review): the loaders appear to return None samples for the
        # "random" voice and for voices stored as precomputed latents --
        # confirm against the installed tortoise version. A raw recording
        # cannot be blended with those, so the recording takes precedence.
        voice_samples = list(voice_samples or []) + [custom_clip]
        conditioning_latents = None

    # There may be no reference clip at all (e.g. the "random" voice).
    sample_voice = voice_samples[0] if voice_samples else None

    start_time = time.perf_counter()
    gen, _ = tts.tts_with_preset(
        text,
        voice_samples=voice_samples,
        conditioning_latents=conditioning_latents,
        preset=preset,
        use_deterministic_seed=seed,
        return_deterministic_state=True,
        k=3,
    )
    elapsed = time.perf_counter() - start_time

    # Explicit UTF-8: the logged text is arbitrary user input and must not
    # depend on the platform's default encoding.
    with open("Tortoise_TTS_Runs.log", "a", encoding="utf-8") as f:
        f.write(
            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} | Time Taken (s): {elapsed} | Seed: {seed}\n"
        )

    if sample_voice is None:
        sample_output = None
    else:
        sample_output = (22050, sample_voice.squeeze().cpu().numpy())

    return (
        sample_output,
        (24000, gen[0].squeeze().cpu().numpy()),
        (24000, gen[1].squeeze().cpu().numpy()),
        (24000, gen[2].squeeze().cpu().numpy()),
    )
|
|
def main():
    """Build the Gradio interface around ``inference`` and launch it.

    The voice dropdowns list the sub-directories of ``tortoise/voices``
    (relative to the current working directory) followed by
    ``VOICE_OPTIONS``. The app is launched with ``share=True``.
    """
    title_html = "<h1 style='text-align: center; color: orange; font-weight: bold;'>RJ VOICE CLONING</h1>"

    # Scan the voices folder once instead of once per dropdown, and sort it so
    # the order does not depend on the OS. Only directories are kept.
    # NOTE(review): assumes each voice is a sub-directory -- confirm against
    # the installed tortoise layout.
    voices_dir = os.path.join("tortoise", "voices")
    voice_choices = sorted(
        entry
        for entry in os.listdir(voices_dir)
        if os.path.isdir(os.path.join(voices_dir, entry))
    ) + VOICE_OPTIONS

    def voice_dropdown(label, default):
        """Create one voice selector; each gets its own copy of the choices."""
        return gr.Dropdown(
            list(voice_choices),
            value=default,
            label=label,
            type="value",
        )

    # Inputs, in the positional order expected by inference().
    text = gr.Textbox(lines=4, label="Text:")
    emotion = gr.Radio(
        ["None/Custom", "Happy", "Sad", "Angry", "Disgusted", "Arrogant"],
        value="None/Custom",
        label="Select emotion:",
        type="value",
    )
    prompt = gr.Textbox(lines=1, label="Enter prompt if [Custom] emotion:")
    preset = gr.Radio(
        ["ultra_fast", "fast", "standard", "high_quality"],
        value="fast",
        label="Preset mode (determines quality with tradeoff over speed):",
        type="value",
    )
    voice = voice_dropdown("Select voice:", "angie")
    mic_audio = gr.Audio(
        label="Record voice (when selected custom_voice):",
        type="filepath",
    )
    voice_b = voice_dropdown("(Optional) Select second voice:", "disabled")
    voice_c = voice_dropdown("(Optional) Select third voice:", "disabled")
    seed = gr.Number(value=0, precision=0, label="Seed (for reproducibility):")

    # Outputs, matching the 4-tuple returned by inference().
    selected_voice = gr.Audio(label="Sample of selected voice (first):")
    output_audio_1 = gr.Audio(label="Output [Candidate 1]:")
    output_audio_2 = gr.Audio(label="Output [Candidate 2]:")
    output_audio_3 = gr.Audio(label="Output [Candidate 3]:")

    interface = gr.Interface(
        fn=inference,
        inputs=[text, emotion, prompt, voice, mic_audio, voice_b, voice_c, preset, seed],
        outputs=[selected_voice, output_audio_1, output_audio_2, output_audio_3],
        title="RJ VOICE CLONING",
        description=title_html,
        css=".gradio-container { background-color: black; color: orange; }",
    )

    interface.launch(share=True)
|
|
if __name__ == "__main__":
    # Module-level model instance; inference() looks up the global name `tts`.
    tts = TextToSpeech()

    # Append a banner to the run log to mark the start of this session.
    session_banner = (
        f"\n\n-------------------------Tortoise TTS Logs, {datetime.now()}-------------------------\n"
    )
    with open("Tortoise_TTS_Runs.log", "a") as log_file:
        log_file.write(session_banner)

    # Build the UI and launch it.
    main()
|
|