# Spaces status (page residue, not part of the script): Runtime error
import os
import tempfile

import gradio as gr
import soundfile as sf

from core.cloner import KokoClone
# 1. Create a single shared KokoClone instance at import time so that the
#    models are loaded once at server start rather than on every request.
print("Loading KokoClone models for the Web UI...")
cloner = KokoClone()
def clone_voice(text, lang, ref_audio_path):
    """Gradio handler: text + reference audio -> cloned speech.

    Args:
        text: Text to synthesize; must contain at least one non-whitespace
            character.
        lang: Language code passed straight through to ``cloner.generate``.
        ref_audio_path: Filesystem path of the reference voice recording.

    Returns:
        Path of the WAV file that ``cloner.generate`` was asked to write.

    Raises:
        gr.Error: If ``text`` or ``ref_audio_path`` is missing, or if
            generation fails for any reason.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text.")
    if not ref_audio_path:
        raise gr.Error("Please upload or record a reference audio file.")

    # A fixed file name in the working directory is shared by every request,
    # so one session's result can be overwritten by another and the handler
    # depends on the working directory being writable. Use a unique file in
    # the system temp directory for each call instead.
    fd, output_file = tempfile.mkstemp(prefix="kokoclone_", suffix=".wav")
    os.close(fd)  # only the path is needed; the cloner writes the file itself

    try:
        cloner.generate(
            text=text,
            lang=lang,
            reference_audio=ref_audio_path,
            output_path=output_file,
        )
    except Exception as e:
        # Best-effort cleanup of the placeholder file; a failure to delete it
        # must not mask the real generation error reported below.
        try:
            os.remove(output_file)
        except OSError:
            pass
        raise gr.Error(f"An error occurred during generation: {e}") from e
    return output_file
def convert_voice(source_audio_path, ref_audio_path):
    """Gradio handler: source audio + reference audio -> re-voiced speech.

    Args:
        source_audio_path: Filesystem path of the speech recording to
            re-voice.
        ref_audio_path: Filesystem path of the target speaker's reference
            recording.

    Returns:
        Path of the WAV file that ``cloner.convert`` was asked to write.

    Raises:
        gr.Error: If either path is missing, or if conversion fails for any
            reason.
    """
    if not source_audio_path:
        raise gr.Error("Please upload or record a source audio file.")
    if not ref_audio_path:
        raise gr.Error("Please upload or record a reference audio file.")

    # A fixed file name in the working directory is shared by every request,
    # so one session's result can be overwritten by another and the handler
    # depends on the working directory being writable. Use a unique file in
    # the system temp directory for each call instead.
    fd, output_file = tempfile.mkstemp(prefix="kokoclone_convert_", suffix=".wav")
    os.close(fd)  # only the path is needed; the cloner writes the file itself

    try:
        cloner.convert(
            source_audio=source_audio_path,
            reference_audio=ref_audio_path,
            output_path=output_file,
        )
    except Exception as e:
        # Best-effort cleanup of the placeholder file; a failure to delete it
        # must not mask the real conversion error reported below.
        try:
            os.remove(output_file)
        except OSError:
            pass
        raise gr.Error(f"An error occurred during conversion: {e}") from e
    return output_file
# 2. Build the Gradio UI using Blocks
# NOTE(review): the nesting below was reconstructed from statement order because
# the indentation was not available; confirm the layout against the running app.
# NOTE(review): several labels contain characters such as "π" and "β" that look
# like mis-decoded emoji/arrows/dashes. They are reproduced exactly as found --
# confirm the intended glyphs before changing them.

# (display name, language code) pairs offered in the language dropdown.
_LANGUAGE_CHOICES = [
    ("English", "en"),
    ("Hindi", "hi"),
    ("French", "fr"),
    ("Japanese", "ja"),
    ("Chinese", "zh"),
    ("Italian", "it"),
    ("Spanish", "es"),
    ("Portuguese", "pt"),
]


def _button_state(label, interactive):
    """Return a no-argument callback that sets a button's label and enabled state."""
    return lambda: gr.update(value=label, interactive=interactive)


with gr.Blocks() as demo:
    gr.Markdown(
        """
        <div style="text-align: center;">
        <h1>π§ KokoClone</h1>
        <p>Voice Cloning, Now Inside Kokoro.<br>
        Generate natural multilingual speech and clone any target voice with ease.<br>
        <i>Built on Kokoro TTS.</i></p>
        </div>
        """
    )

    with gr.Tabs():
        # -- Tab 1: text -> cloned speech ---------------------------------
        with gr.Tab("π€ Text β Clone"):
            with gr.Row():
                # Left column: the three inputs plus the trigger button.
                with gr.Column(scale=1):
                    text_input = gr.Textbox(
                        label="1. Text to Synthesize",
                        lines=4,
                        placeholder="Enter the text you want spoken...",
                    )
                    lang_input = gr.Dropdown(
                        label="2. Language",
                        choices=_LANGUAGE_CHOICES,
                        value="en",
                    )
                    ref_audio_input = gr.Audio(
                        label="3. Reference Voice (Upload or Record)",
                        type="filepath",
                    )
                    submit_btn = gr.Button("π Generate Clone", variant="primary")

                # Right column: the result player and usage tips.
                with gr.Column(scale=1):
                    output_audio = gr.Audio(
                        label="Generated Cloned Audio",
                        interactive=False,
                        autoplay=False,
                    )
                    gr.Markdown(
                        """
                        <br>
                        ### π‘ Tips for Best Results:
                        * **Clean Audio:** Use a reference audio clip without background noise or music.
                        * **Length:** A reference clip of 3 to 10 seconds is usually the sweet spot.
                        * **Language Match:** Make sure the selected language matches the text you typed!
                        * **First Run:** The very first generation might take a few extra seconds while the models allocate memory.
                        """
                    )

            # Three chained steps: show a busy label on the button, run the
            # cloning handler, then restore the button's normal label.
            clone_started = submit_btn.click(
                fn=_button_state("β Generating...", False),
                outputs=submit_btn,
            )
            clone_finished = clone_started.then(
                fn=clone_voice,
                inputs=[text_input, lang_input, ref_audio_input],
                outputs=output_audio,
            )
            clone_finished.then(
                fn=_button_state("π Generate Clone", True),
                outputs=submit_btn,
            )

        # -- Tab 2: audio -> re-voiced speech -----------------------------
        with gr.Tab("π Audio β Clone"):
            with gr.Row():
                # Left column: source and reference recordings plus the button.
                with gr.Column(scale=1):
                    source_audio_input = gr.Audio(
                        label="1. Source Audio (speech to re-voice)",
                        type="filepath",
                    )
                    ref_audio_convert_input = gr.Audio(
                        label="2. Reference Voice (target speaker)",
                        type="filepath",
                    )
                    convert_btn = gr.Button("π Convert Voice", variant="primary")

                # Right column: the converted result and a short explainer.
                with gr.Column(scale=1):
                    convert_output_audio = gr.Audio(
                        label="Converted Audio",
                        interactive=False,
                        autoplay=False,
                    )
                    gr.Markdown(
                        """
                        <br>
                        ### π‘ How it works:
                        * Upload any speech recording as the **source**.
                        * Upload a short clip of the **target speaker** as the reference.
                        * KokoClone re-voices the source speech to sound like the reference β no transcription needed.
                        ### Tips:
                        * Clean, noise-free audio works best for both inputs.
                        * Reference clips of 3β10 seconds give the best voice transfer.
                        """
                    )

            # Same three-step chain as the first tab, wired to the converter.
            convert_started = convert_btn.click(
                fn=_button_state("β Converting...", False),
                outputs=convert_btn,
            )
            convert_finished = convert_started.then(
                fn=convert_voice,
                inputs=[source_audio_input, ref_audio_convert_input],
                outputs=convert_output_audio,
            )
            convert_finished.then(
                fn=_button_state("π Convert Voice", True),
                outputs=convert_btn,
            )
# 4. Launch the app
if __name__ == "__main__":
    # Original author's note: "Gradio 6.0 fix: Moved theme here and removed show_api".
    # NOTE(review): no theme argument is actually passed to launch() -- confirm
    # whether one was meant to be supplied here.
    bind_address = "0.0.0.0"  # listen on all network interfaces
    demo.launch(server_name=bind_address)