| import os |
| import gradio as gr |
| import requests |
| import langid |
| import base64 |
| import json |
| import time |
| import re |
| import hashlib |
| import hash_code_for_cached_output |
|
|
|
|
# Endpoint of the remote synthesis service; None when the API_URL
# environment variable is not set.
API_URL = os.environ.get("API_URL")

# Language codes the demo accepts for the input text.
supported_languages = ['zh', 'en', 'ja', 'ko', 'es', 'fr']

# Style identifiers allowed for each language code. Every value is a list so
# that `style in supported_styles[lang]` is an exact-match test; a bare string
# value would make `in` a substring test (accepting "" or "default") and
# raise TypeError for a None style.
supported_styles = {
    'zh': ["zh_default"],
    'en': [
        "en_default",
        "en_us",
        "en_br",
        "en_au",
        "en_in",
    ],
    "es": ["es_default"],
    "fr": ["fr_default"],
    "ja": ["jp_default"],
    "ko": ["kr_default"],
}

# Directory that receives synthesised audio; created when the module loads.
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)
|
|
def audio_to_base64(audio_file):
    """Read the file at `audio_file` and return its bytes base64-encoded as text."""
    with open(audio_file, "rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
|
|
# Group 1 captures a run of CJK ideographs; the alternative matches a run of
# word characters that are NOT CJK ideographs. Excluding the CJK range from
# the second alternative matters because plain `\w` also matches ideographs,
# which would let a word such as "AI" swallow the Chinese text glued to it.
_CJK_RUN_OR_WORD = re.compile(r'([\u4e00-\u9fa5]+)|[^\W\u4e00-\u9fa5]+')


def count_chars_words(sentence):
    """Return the length of `sentence` as CJK characters plus other words.

    Each ideograph in the range U+4E00..U+9FA5 counts as one unit; every other
    run of word characters counts as one unit. Punctuation and whitespace are
    ignored, so an empty or punctuation-only string yields 0.
    """
    total = 0
    for match in _CJK_RUN_OR_WORD.finditer(sentence):
        cjk_run = match.group(1)
        # A CJK run contributes one unit per character, a word exactly one.
        total += len(cjk_run) if cjk_run is not None else 1
    return total
|
|
# Pre-rendered results for the bundled example inputs, keyed by the code that
# hash_code_for_cached_output.get_unique_code() produces for them.
_CACHED_OUTPUTS = {
    "af39e1f1ff_60565a5c20_en_us": "cached_outputs/0.wav",
    "af39e1f1ff_420ab8211d_en_us": "cached_outputs/1.wav",
    "ced034cc22_0f96bf44f5_es_default": "cached_outputs/2.wav",
    "d3172b178d_3fef5adc6f_zh_default": "cached_outputs/3.wav",
    "cda6998e1a_9897b60a4e_jp_default": "cached_outputs/4.wav",
}


def predict(prompt, style, audio_file_pth, speed, agree):
    """Synthesise `prompt` in the reference speaker's voice through the remote API.

    Args:
        prompt: Text to synthesise; its language is detected with langid.
        style: Style identifier sent to the API as the "language" field
            ('en_us' is sent as 'en_newest').
        audio_file_pth: Path to the reference speaker audio file.
        speed: Forwarded unchanged as the "speed" field of the request.
        agree: Whether the user accepted the terms; nothing is generated
            unless this is truthy.

    Returns:
        A 3-tuple `(text_hint, synthesised_audio_path, reference_audio_path)`.
        On any validation, network or API failure both paths are None and
        `text_hint` carries the message.
    """
    import uuid  # local import: only used to name the per-request output file

    text_hint = ''

    if not agree:
        text_hint += '[ERROR] Please accept the Terms & Condition!\n'
        gr.Warning("Please accept the Terms & Condition!")
        return text_hint, None, None

    # Without a reference file the later open() would fail with a TypeError.
    if not audio_file_pth:
        text_hint += '[ERROR] Please provide a reference audio!\n'
        gr.Warning("Please provide a reference audio!")
        return text_hint, None, None

    # Serve the pre-rendered file when the request matches a bundled example.
    unique_code = hash_code_for_cached_output.get_unique_code(audio_file_pth, prompt, style)
    print("audio_file_pth is", audio_file_pth)
    print("unique_code is", unique_code)
    if unique_code in _CACHED_OUTPUTS:
        return (
            'We get the cached output for you, since you are trying to generate an example cloning.',
            _CACHED_OUTPUTS[unique_code],
            audio_file_pth,
        )

    language_predicted = langid.classify(prompt)[0].strip()
    print(f"Detected language:{language_predicted}")

    if language_predicted not in supported_languages:
        unsupported = (
            f"The detected language {language_predicted} for your input text "
            f"is not in our Supported Languages: {supported_languages}"
        )
        text_hint += f"[ERROR] {unsupported}\n"
        gr.Warning(unsupported)
        return text_hint, None, None

    # A mismatching style is only warned about; the request still goes out.
    if style not in supported_styles[language_predicted]:
        style_warning = (
            f"[Warning] The style {style} is not supported for detected language {language_predicted}. "
            f"For language {language_predicted}, we support styles: {supported_styles[language_predicted]}. "
            "Using the wrong style may result in unexpected behavior."
        )
        text_hint += style_warning + "\n"
        gr.Warning(style_warning)

    prompt_length = count_chars_words(prompt)

    if prompt_length < 2:
        text_hint += "[ERROR] Please give a longer prompt text \n"
        gr.Warning("Please give a longer prompt text")
        return text_hint, None, None

    if prompt_length > 50:
        too_long = (
            "Text length limited to 50 words for this demo, please try shorter text. "
            "You can clone our open-source repo or try it on our website "
            "https://app.myshell.ai/robot-workshop/widget/174760057433406749"
        )
        text_hint += f"[ERROR] {too_long} \n"
        gr.Warning(too_long)
        return text_hint, None, None

    data = {
        "text": prompt,
        "reference_speaker": audio_to_base64(audio_file_pth),
        "language": 'en_newest' if style == 'en_us' else style,
        "speed": speed,
    }

    start = time.time()
    try:
        response = requests.post(API_URL, json=data, timeout=60)
    except requests.RequestException as err:
        # Timeouts, connection failures and an unset/invalid API_URL end up
        # here; report them like any other HTTP failure instead of raising.
        text_hint += f"[HTTP ERROR] {err} \n"
        gr.Warning(f"[HTTP ERROR] {err} \n")
        return text_hint, None, None
    print(f'Get response successfully within {time.time() - start}')

    if response.status_code != 200:
        text_hint += f"[HTTP ERROR] {response.status_code} - {response.text} \n"
        gr.Warning(f"[HTTP ERROR] {response.status_code} - {response.text} \n")
        return text_hint, None, None

    # A 200 response is either a JSON object with an "error" entry or the
    # raw audio bytes. Anything that does not parse as such an object is
    # treated as audio.
    try:
        api_error = json.loads(response.content)['error']
    except (ValueError, KeyError, TypeError):
        # One file per request: the queue serves several requests at once, so
        # a single shared output path could be overwritten before it is read.
        save_path = os.path.join(output_dir, f'output_{uuid.uuid4().hex}.wav')
        with open(save_path, 'wb') as f:
            f.write(response.content)
    else:
        text_hint += f"[ERROR] {api_error} \n"
        gr.Warning(f"[ERROR] {api_error} \n")
        return text_hint, None, None

    text_hint += 'Get response successfully \n'
    return text_hint, save_path, audio_file_pth
|
|
|
|
# Page title string; not referenced by the layout code visible in this file.
| title = "MyShell OpenVoice V2" |
|
|
# Markdown blurb about OpenVoice V1; the layout passes it to gr.Markdown.
| description = """ |
| In December 2023, we released [OpenVoice V1](https://huggingface.co/spaces/myshell-ai/OpenVoice), an instant voice cloning approach that replicates a speaker's voice and generates speech in multiple languages using only a short audio clip. OpenVoice V1 enables granular control over voice styles, replicates the tone color of the reference speaker and achieves zero-shot cross-lingual voice cloning. |
| """ |
|
|
# Markdown bullet list of what V2 adds; the layout passes it to gr.Markdown.
| description_v2 = """ |
| In April 2024, we released **OpenVoice V2**, which includes all features in V1 and has: |
| - **Better Audio Quality**. OpenVoice V2 adopts a different training strategy that delivers better audio quality. |
| - **Native Multi-lingual Support**. English, Spanish, French, Chinese, Japanese and Korean are natively supported in OpenVoice V2. |
| - **Free Commercial Use**. Starting from April 2024, both V2 and V1 are released under MIT License. Free for commercial use. |
| """ |
|
|
# Three-column links table (repo / project page / community). Not referenced
# by the layout code visible in this file, which uses markdown_table_v2.
# NOTE(review): the community link has empty link text (`[](...)`), so nothing
# visible may render for it - confirm whether a badge image was intended.
| markdown_table = """ |
| <div align="center" style="margin-bottom: 10px;"> |
| 
| | | | |
| :-----------: | :-----------: | :-----------: |
| **OpenSource Repo** | **Project Page** | **Join the Community** |
| <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [](https://discord.gg/myshell) | |
| 
| </div> |
| """ |
|
|
# Compact two-table variant of the links; the layout passes it to gr.Markdown.
# NOTE(review): same empty-link-text community entry as in markdown_table.
| markdown_table_v2 = """ |
| <div align="center" style="margin-bottom: 2px;"> |
| 
| | | | | |
| :-----------: | :-----------: | :-----------: | :-----------: |
| **Github Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | **Project Page** | [OpenVoice](https://research.myshell.ai/open-voice) | |
| 
| | | |
| :-----------: | :-----------: |
| **Join the Community** | [](https://discord.gg/myshell) | |
| 
| </div> |
| """ |
# HTML notice linking to the QnA document and the self-hosting notebook.
| content = """ |
| <div> |
| <strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>If you want to deploy the model by yourself and perform inference, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part3.ipynb'>this jupyter notebook</a>.</strong> |
| </div> |
| """ |
# `content` wrapped in a bordered, padded <div>; the layout passes it to gr.HTML.
| wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>" |
|
|
|
|
# Rows offered by gr.Examples. Each row is
# [prompt text, style, reference audio path, agree flag]; the n-th row uses
# examples/speaker<n>.mp3 as its reference audio.
examples = [
    [text, style, f"examples/speaker{index}.mp3", True]
    for index, (text, style) in enumerate(
        (
            (
                "Did you ever hear a folk tale about a giant turtle?",
                'en_us',
            ),
            (
                "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.",
                'es_default',
            ),
            (
                "我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。",
                'zh_default',
            ),
            (
                "彼は毎朝ジョギングをして体を健康に保っています。",
                'jp_default',
            ),
        )
    )
]
|
|
# predict() requires a `speed` argument, but the page has no speed control:
# the event wiring supplies only text, style, reference audio and the
# agreement checkbox. Handing predict to Gradio directly would call it with
# four positional arguments and fail with a TypeError, so the UI goes through
# the adapter below.
# NOTE(review): 1.0 is assumed to be the backend's "normal" speed - confirm.
_DEFAULT_SPEED = 1.0


def _predict_with_default_speed(prompt, style, audio_file_pth, agree):
    """Call predict() for the four UI inputs, filling in the default speed."""
    return predict(prompt, style, audio_file_pth, _DEFAULT_SPEED, agree)


_LOGO_MARKDOWN = (
    '\n'
    '## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>\n'
)

with gr.Blocks(analytics_enabled=False) as demo:

    # Header: logo, links and V1 description on the left, intro video on the right.
    with gr.Row():
        with gr.Column():
            with gr.Row():
                gr.Markdown(_LOGO_MARKDOWN)
            with gr.Row():
                gr.Markdown(markdown_table_v2)
            with gr.Row():
                gr.Markdown(description)
        with gr.Column():
            gr.Video('./openvoicev2.mp4', autoplay=True)

    with gr.Row():
        gr.Markdown(description_v2)

    with gr.Row():
        gr.HTML(wrapped_markdown_content)

    with gr.Row():
        # Left column: user inputs.
        with gr.Column():
            input_text_gr = gr.Textbox(
                label="Text Prompt",
                # predict() rejects prompts longer than 50 words, so the hint
                # states that limit.
                info="One or two sentences at a time is better. Up to 50 words.",
                value="The bustling city square bustled with street performers, tourists, and local vendors.",
            )
            style_gr = gr.Dropdown(
                label="Style",
                info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
                choices=["en_default", "en_us", "en_br", "en_au", "en_in", "es_default", "fr_default", "jp_default", "zh_default", "kr_default",],
                max_choices=1,
                value="en_us",
            )
            ref_gr = gr.Audio(
                label="Reference Audio",
                info="Click on the ✎ button to upload your own target speaker audio",
                type="filepath",
                value="examples/speaker0.mp3",
            )
            tos_gr = gr.Checkbox(
                label="Agree",
                value=False,
                info="I agree to the terms of the MIT license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
            )

            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)

        # Right column: results and the clickable examples.
        # NOTE(review): the examples are placed in this column; confirm that
        # matches the intended layout.
        with gr.Column():
            out_text_gr = gr.Text(label="Info")
            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
            ref_audio_gr = gr.Audio(label="Reference Audio Used")

            ui_inputs = [input_text_gr, style_gr, ref_gr, tos_gr]
            ui_outputs = [out_text_gr, audio_gr, ref_audio_gr]

            gr.Examples(
                examples,
                label="Examples",
                inputs=ui_inputs,
                outputs=ui_outputs,
                fn=_predict_with_default_speed,
                cache_examples=False,
            )
            tts_button.click(_predict_with_default_speed, ui_inputs, outputs=ui_outputs)


demo.queue(concurrency_count=6)
demo.launch(debug=True, show_api=True)