# NOTE: extraction residue (file-size banner, commit hashes, line-number gutter) removed; the script starts below.
import os
import tempfile

import gradio as gr
import numpy as np
from supertonic import TTS

# Initialize TTS once at import time - auto_download=True handles the HF model
# fetching automatically. This runs on CPU by default via ONNX.
try:
    tts = TTS(auto_download=True)
except Exception as e:
    # Best-effort startup: report the problem but keep the module importable.
    # Bind `tts` to None so the name is always defined; otherwise any later
    # use would fail with an unrelated-looking NameError.
    print(f"Error initializing TTS: {e}")
    tts = None

# Voice style identifiers offered in the UI: five male (M1-M5) followed by
# five female (F1-F5).
VOICES = [f"{prefix}{index}" for prefix in ("M", "F") for index in range(1, 6)]

# Display name -> language code for the 31 supported languages.
LANGUAGES = dict(
    [
        ("English", "en"),
        ("Korean", "ko"),
        ("Japanese", "ja"),
        ("Arabic", "ar"),
        ("Bulgarian", "bg"),
        ("Czech", "cs"),
        ("Danish", "da"),
        ("German", "de"),
        ("Greek", "el"),
        ("Spanish", "es"),
        ("Estonian", "et"),
        ("Finnish", "fi"),
        ("French", "fr"),
        ("Hindi", "hi"),
        ("Croatian", "hr"),
        ("Hungarian", "hu"),
        ("Indonesian", "id"),
        ("Italian", "it"),
        ("Lithuanian", "lt"),
        ("Latvian", "lv"),
        ("Dutch", "nl"),
        ("Polish", "pl"),
        ("Portuguese", "pt"),
        ("Romanian", "ro"),
        ("Russian", "ru"),
        ("Slovak", "sk"),
        ("Slovenian", "sl"),
        ("Swedish", "sv"),
        ("Turkish", "tr"),
        ("Ukrainian", "uk"),
        ("Vietnamese", "vi"),
    ]
)

def generate_speech(text, voice, language_name):
    """Synthesize ``text`` to a WAV file and return it with a status message.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only input is
            rejected.
        voice: Voice style name passed to ``tts.get_voice_style``.
        language_name: Display name of the language; must be a key of
            ``LANGUAGES``.

    Returns:
        A ``(path, status)`` tuple: the path of the written WAV file and a
        human-readable status string including the reported duration.

    Raises:
        gr.Error: If the input is empty, the language is unknown, the TTS
            engine is unavailable, or synthesis/saving fails.
    """
    # Guard against None as well as blank input so we never call .strip()
    # on a non-string.
    if not text or not text.strip():
        raise gr.Error("Please enter some text.")

    # Report an unknown language explicitly instead of surfacing a bare
    # KeyError message such as "Generation failed: 'Foo'".
    lang_code = LANGUAGES.get(language_name)
    if lang_code is None:
        raise gr.Error(f"Unsupported language: {language_name}")

    # The module-level engine may be missing or None if start-up
    # initialisation failed; look it up defensively so the user gets a clear
    # message either way.
    engine = globals().get("tts")
    if engine is None:
        raise gr.Error("TTS engine is not available; check the startup logs.")

    try:
        # Get the voice style object
        style = engine.get_voice_style(voice_name=voice)

        # Synthesize (returns wav data and a numpy array for duration)
        wav, duration = engine.synthesize(text, voice_style=style, lang=lang_code)

        # Write to a unique temp file: a fixed "output.wav" would be
        # overwritten by concurrent requests. The handle is closed before
        # saving so the path can be reopened on every platform. The file is
        # intentionally not deleted here because the caller still needs it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name
        engine.save_audio(wav, output_path)

        # float(ndarray) is deprecated for ndim > 0 and fails for more than
        # one element, so flatten and take the first entry explicitly.
        # NOTE(review): assumes one duration per input text - confirm against
        # the supertonic API.
        readable_duration = float(np.asarray(duration).reshape(-1)[0])
    except Exception as e:
        # Chain the cause so the original traceback stays visible in logs.
        raise gr.Error(f"Generation failed: {e}") from e

    return output_path, f"Generation Successful! \nDuration: {readable_duration:.2f}s"

# Define the Gradio interface: inputs on the left, outputs on the right,
# example rows underneath, and a single button wired to generate_speech.
with gr.Blocks(theme='soft', title="Supertonic 3 TTS") as demo:
    gr.Markdown("# 🎙️ Supertonic 3: Multilingual TTS")
    gr.Markdown("An on-device, lightweight Text-to-Speech system by **Supertone**. Running on CPU via ONNX.")

    with gr.Row():
        # Left column: text, voice and language inputs plus the trigger button.
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Type your message here...",
                value="The train delay was announced at 4:45 PM <breath> due to track maintenance.",
                lines=4
            )

            with gr.Row():
                voice_opt = gr.Dropdown(
                    choices=VOICES,
                    value="M1",
                    label="Voice Style"
                )
                lang_opt = gr.Dropdown(
                    # Iterating a dict yields its keys, so sorted() needs no
                    # intermediate list(...keys()) wrapper.
                    choices=sorted(LANGUAGES),
                    value="English",
                    label="Language"
                )

            btn = gr.Button("Synthesize Speech", variant="primary")

        # Right column: the generated audio, a status line and a help footer.
        with gr.Column(scale=1):
            # type="filepath" matches generate_speech, which returns a path.
            audio_output = gr.Audio(label="Synthesized Audio", type="filepath")
            status_box = gr.Textbox(label="Status", interactive=False)

            gr.HTML("""
                <div style="margin-top: 20px; text-align: center;">
                    <p>Supported Expression Tags: <code>&lt;laugh&gt;</code>, <code>&lt;breath&gt;</code>, <code>&lt;sigh&gt;</code></p>
                    <a href="https://huggingface.co/Supertone/supertonic-3">
                        <img src="https://img.shields.io/badge/Model-Supertone%20Supertonic--3-blue?logo=huggingface" alt="Model Page">
                    </a>
                </div>
            """)

    # Setup Examples: each row is (text, voice, language display name), in the
    # same order as the `inputs` list below.
    gr.Examples(
        examples=[
            ["Hello! This is a test of the Supertonic 3 system running locally.", "M1", "English"],
            ["こんにちは、これは日本語の音声合成のテストです。", "F1", "Japanese"],
            ["¡Hola! Esta es una prueba de voz en español.", "F3", "Spanish"],
            ["C'est un plaisir de vous rencontrer.", "M4", "French"]
        ],
        inputs=[input_text, voice_opt, lang_opt]
    )

    # Clicking the button runs synthesis and fills the audio and status boxes.
    btn.click(
        fn=generate_speech,
        inputs=[input_text, voice_opt, lang_opt],
        outputs=[audio_output, status_box]
    )

# Start the Gradio server only when this file is run as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    demo.launch()