Spaces:
Running
Running
File size: 4,394 Bytes
e791b3f c9f7c1d e791b3f c9f7c1d e791b3f c9f7c1d e791b3f c9f7c1d e791b3f c9f7c1d e791b3f c9f7c1d e791b3f c9f7c1d e791b3f c9f7c1d 98e26c7 c9f7c1d e791b3f 98e26c7 e791b3f c9f7c1d e791b3f c9f7c1d 98e26c7 c9f7c1d e791b3f 98e26c7 c9f7c1d 98e26c7 c9f7c1d e791b3f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 | import gradio as gr
from supertonic import TTS
import os
import numpy as np
# Initialize TTS - auto_download=True handles the HF model fetching automatically
# This runs on CPU by default via ONNX
try:
    tts = TTS(auto_download=True)
except Exception as e:
    print(f"Error initializing TTS: {e}")
    # Keep the name bound even when start-up fails: without this, the first
    # use of `tts` elsewhere in the module dies with a NameError that hides
    # the real cause printed above.
    tts = None
# Voice style names offered in the UI: M1..M5 followed by F1..F5
# (all voices found in the repository tree).
VOICES = [f"{prefix}{index}" for prefix in ("M", "F") for index in range(1, 6)]

# The 31 supported languages, keyed by display name; the value is the
# language code passed to the synthesizer.
LANGUAGES = {
    "English": "en",
    "Korean": "ko",
    "Japanese": "ja",
    "Arabic": "ar",
    "Bulgarian": "bg",
    "Czech": "cs",
    "Danish": "da",
    "German": "de",
    "Greek": "el",
    "Spanish": "es",
    "Estonian": "et",
    "Finnish": "fi",
    "French": "fr",
    "Hindi": "hi",
    "Croatian": "hr",
    "Hungarian": "hu",
    "Indonesian": "id",
    "Italian": "it",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Dutch": "nl",
    "Polish": "pl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Swedish": "sv",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Vietnamese": "vi",
}
def generate_speech(text, voice, language_name):
    """Synthesize ``text`` and return the audio file path plus a status line.

    Args:
        text: Text to speak; must contain at least one non-whitespace
            character.
        voice: Voice style name forwarded to ``tts.get_voice_style``.
        language_name: Display name of the language; must be a key of
            ``LANGUAGES``.

    Returns:
        A ``(wav_path, status_message)`` tuple, in the order of the two
        output components this callback is wired to.

    Raises:
        gr.Error: If the text is empty, the language is unknown, the TTS
            engine is unavailable, or synthesis/saving fails.
    """
    # Local import keeps this block self-contained (the module header does
    # not import tempfile).
    import tempfile

    # `not text` also covers None, which would otherwise crash on .strip().
    if not text or not text.strip():
        raise gr.Error("Please enter some text.")

    lang_code = LANGUAGES.get(language_name)
    if lang_code is None:
        raise gr.Error(f"Unsupported language: {language_name}")

    # Look the engine up defensively: start-up may have failed and left the
    # module-level `tts` unbound or set to None.
    engine = globals().get("tts")
    if engine is None:
        raise gr.Error("The TTS engine is not available; check the startup logs.")

    output_path = None
    try:
        # Get the voice style object
        style = engine.get_voice_style(voice_name=voice)

        # Synthesize; per the original author's note this returns the wav
        # data and a numpy array holding the duration.
        wav, duration = engine.synthesize(text, voice_style=style, lang=lang_code)

        # Write to a unique file per request. A fixed "output.wav" is shared
        # by every concurrent request, so one user's audio could be
        # overwritten by (or served to) another.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name
        engine.save_audio(wav, output_path)

        # .item() extracts the single element regardless of array rank;
        # float() on an array with ndim > 0 is deprecated in NumPy >= 1.25.
        readable_duration = float(np.asarray(duration).item())
    except Exception as e:
        # Do not leave a partial/orphaned file behind when generation fails.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
        raise gr.Error(f"Generation failed: {str(e)}") from e

    return output_path, f"Generation Successful! \nDuration: {readable_duration:.2f}s"
# Define the Gradio Interface
# Layout: a two-column row (inputs on the left, outputs on the right),
# followed by clickable examples. Nesting is reconstructed from the component
# order; NOTE(review): confirm the HTML footer belongs in the output column.
with gr.Blocks(theme='soft', title="Supertonic 3 TTS") as demo:
    gr.Markdown("# 🎙️ Supertonic 3: Multilingual TTS")
    gr.Markdown("An on-device, lightweight Text-to-Speech system by **Supertone**. Running on CPU via ONNX.")

    with gr.Row():
        # Left column: text, voice and language inputs plus the trigger button
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Type your message here...",
                value="The train delay was announced at 4:45 PM <breath> due to track maintenance.",
                lines=4
            )
            with gr.Row():
                voice_opt = gr.Dropdown(
                    choices=VOICES,
                    value="M1",
                    label="Voice Style"
                )
                lang_opt = gr.Dropdown(
                    # Iterating a dict yields its keys, so sorted() alone
                    # gives the alphabetised language names.
                    choices=sorted(LANGUAGES),
                    value="English",
                    label="Language"
                )
            btn = gr.Button("Synthesize Speech", variant="primary")

        # Right column: synthesized audio, status text and the info footer
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Synthesized Audio", type="filepath")
            status_box = gr.Textbox(label="Status", interactive=False)
            # The tag names are written as HTML entities: a raw <laugh> inside
            # <code> is parsed by the browser as an unknown element and
            # renders as nothing.
            gr.HTML("""
            <div style="margin-top: 20px; text-align: center;">
            <p>Supported Expression Tags: <code>&lt;laugh&gt;</code>, <code>&lt;breath&gt;</code>, <code>&lt;sigh&gt;</code></p>
            <a href="https://huggingface.co/Supertone/supertonic-3">
            <img src="https://img.shields.io/badge/Model-Supertone%20Supertonic--3-blue?logo=huggingface" alt="Model Page">
            </a>
            </div>
            """)

    # Setup Examples
    # Each row supplies values for (input_text, voice_opt, lang_opt), in that order.
    gr.Examples(
        examples=[
            ["Hello! This is a test of the Supertonic 3 system running locally.", "M1", "English"],
            ["こんにちは、これは日本語の音声合成のテストです。", "F1", "Japanese"],
            ["¡Hola! Esta es una prueba de voz en español.", "F3", "Spanish"],
            ["C'est un plaisir de vous rencontrer.", "M4", "French"]
        ],
        inputs=[input_text, voice_opt, lang_opt]
    )

    # Clicking the button calls generate_speech with the three inputs and
    # routes its two return values to the audio player and the status box.
    btn.click(
        fn=generate_speech,
        inputs=[input_text, voice_opt, lang_opt],
        outputs=[audio_output, status_box]
    )
# Launch the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()
|