Spaces:

stano03
/

jambogpt

Sleeping

App Files Files Community

JamboGPT Bot committed on 4 days ago

Commit

6ddb91c

1 Parent(s): 504b4e1

Add multiple TTS models for Kiswahili and Kikuyu with voice selection

Browse files

Files changed (1) hide show

app.py +87 -40

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
 JamboGPT - African Language AI Voice Agent
-Using Free Hugging Face TTS Models
 """
 import gradio as gr
@@ -15,13 +15,19 @@ import tempfile
 # Set device
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# Language configurations with free HF TTS models
 LANGUAGES = {
     "Swahili": {
         "emoji": "🇰🇪",
         "speakers": "100M+",
         "region": "East Africa",
-        "tts_model": "facebook/mms-tts-swh",
         "keywords": {
             "greeting": ["habari", "jambo", "salaam", "hello", "hi"],
             "thanks": ["asante", "thank", "shukran"],
@@ -40,7 +46,12 @@ LANGUAGES = {
         "emoji": "🇰🇪",
         "speakers": "7M",
         "region": "Kenya",
-        "tts_model": "facebook/mms-tts-kin",
         "keywords": {
             "greeting": ["wĩ", "mwega", "hello", "hi", "salaam"],
             "thanks": ["mwega", "thank", "asante"],
@@ -59,7 +70,10 @@ LANGUAGES = {
         "emoji": "🇳🇬",
         "speakers": "45M",
         "region": "West Africa",
-        "tts_model": "facebook/mms-tts-yor",
         "keywords": {
             "greeting": ["pele", "hello", "hi", "bawo"],
             "thanks": ["e ku", "thank", "ope"],
@@ -78,7 +92,10 @@ LANGUAGES = {
         "emoji": "🇳🇬",
         "speakers": "90M",
         "region": "West Africa",
-        "tts_model": "facebook/mms-tts-hau",
         "keywords": {
             "greeting": ["sannu", "hello", "hi", "ina"],
             "thanks": ["nagode", "thank"],
@@ -97,7 +114,10 @@ LANGUAGES = {
         "emoji": "🇪🇹",
         "speakers": "32M",
         "region": "Horn of Africa",
-        "tts_model": "facebook/mms-tts-amh",
         "keywords": {
             "greeting": ["ሰላም", "hello", "hi", "ሳላም"],
             "thanks": ["አመሰግናለሁ", "thank"],
@@ -116,7 +136,10 @@ LANGUAGES = {
         "emoji": "🇧🇯",
         "speakers": "2M",
         "region": "West Africa",
-        "tts_model": "facebook/mms-tts-fon",
         "keywords": {
             "greeting": ["bonjour", "hello", "hi"],
             "thanks": ["merci", "thank"],
@@ -135,7 +158,10 @@ LANGUAGES = {
         "emoji": "🇪🇹",
         "speakers": "40M",
         "region": "East Africa",
-        "tts_model": "facebook/mms-tts-orm",
         "keywords": {
             "greeting": ["salaam", "hello", "hi"],
             "thanks": ["galataa", "thank"],
@@ -154,7 +180,10 @@ LANGUAGES = {
         "emoji": "🇸🇴",
         "speakers": "20M",
         "region": "East Africa",
-        "tts_model": "facebook/mms-tts-som",
         "keywords": {
             "greeting": ["salaam", "hello", "hi"],
             "thanks": ["mahadsanid", "thank"],
@@ -173,7 +202,10 @@ LANGUAGES = {
         "emoji": "🇪🇷",
         "speakers": "7M",
         "region": "Horn of Africa",
-        "tts_model": "facebook/mms-tts-tir",
         "keywords": {
             "greeting": ["ሰላም", "hello", "hi"],
             "thanks": ["አመሰግናለሁ", "thank"],
@@ -192,7 +224,10 @@ LANGUAGES = {
         "emoji": "🌍",
         "speakers": "1.5B",
         "region": "Global",
-        "tts_model": "facebook/mms-tts-eng",
         "keywords": {
             "greeting": ["hello", "hi", "hey", "greetings"],
             "thanks": ["thank", "thanks", "appreciate"],
@@ -212,19 +247,13 @@ LANGUAGES = {
 conversation_history = []
 model_cache = {}
-def load_tts_model(language_name):
-    """Load TTS model for the specified language."""
-    if language_name not in LANGUAGES:
-        return None
-    lang_config = LANGUAGES[language_name]
-    model_id = lang_config["tts_model"]
     if model_id in model_cache:
         return model_cache[model_id]
     try:
-        print(f"Loading TTS model for {language_name}: {model_id}")
         synthesizer = pipeline(
             "text-to-speech",
             model=model_id,
@@ -255,7 +284,6 @@ def generate_response(text, language):
         lang_config = LANGUAGES.get(language, {})
         responses = lang_config.get("responses", {})
-        # Detect intent
         intent = detect_intent(text, language)
         response = responses.get(intent, responses.get("default", "I understand."))
@@ -264,17 +292,17 @@ def generate_response(text, language):
         print(f"Error generating response: {e}")
         return "I understand. Can you say more?"
-def synthesize_speech(text, language):
-    """Convert text to speech using HF models."""
     if not text or not text.strip():
         return None
     try:
-        synthesizer = load_tts_model(language)
         if synthesizer is None:
             return None
-        print(f"Generating speech for: {text[:50]}...")
         speech = synthesizer(text)
         audio_array = np.array(speech["audio"]).flatten()
@@ -289,36 +317,32 @@ def synthesize_speech(text, language):
         print(f"Error synthesizing: {e}")
         return None
-def process_text_input(text, language):
     """Process text input: generate response -> synthesize."""
     try:
         if not text:
             return None, "Please enter some text!", ""
-        # Generate response
         response_text = generate_response(text, language)
         if response_text is None:
             return None, "Error generating response", ""
-        # Synthesize response
-        audio_output = synthesize_speech(response_text, language)
-        # Add to history
         conversation_history.append({
             "user": text,
             "agent": response_text,
             "language": language,
             "timestamp": datetime.now().strftime("%H:%M:%S")
         })
-        # Format history
         history_text = ""
         for msg in conversation_history[-5:]:
             history_text += f"[{msg['timestamp']}] {msg['language']}\n"
             history_text += f"You: {msg['user']}\n"
             history_text += f"Agent: {msg['agent']}\n\n"
-        status = "✅ Speech generated!" if audio_output else "⚠️ Text response only"
         return audio_output, response_text, history_text
     except Exception as e:
         print(f"Error processing: {e}")
@@ -335,7 +359,7 @@ def create_interface():
         gr.Markdown("""
         # 🌍 JamboGPT - African Language AI Voice Agent
-        **Chat with AI in 10 African languages with voice responses**
         Swahili • Kikuyu • Yoruba • Hausa • Amharic • Fon • Oromo • Somali • Tigrinya • English
         """)
@@ -354,13 +378,37 @@ def create_interface():
                 f"🇰🇪 **Swahili** • 100M+ speakers • East Africa"
             )
             def update_language_info(language):
                 if language in LANGUAGES:
                     lang_data = LANGUAGES[language]
-                    return f"{lang_data['emoji']} **{language}** • {lang_data['speakers']} speakers • {lang_data['region']}"
-                return ""
-            language_choice.change(update_language_info, inputs=language_choice, outputs=language_info)
             # Text input
             text_input = gr.Textbox(
@@ -391,7 +439,6 @@ def create_interface():
                 interactive=False
             )
-            # Conversation history
             history_display = gr.Textbox(
                 label="📝 Conversation History",
                 interactive=False,
@@ -402,7 +449,7 @@ def create_interface():
         # Connect process button
         process_btn.click(
             fn=process_text_input,
-            inputs=[text_input, language_choice],
             outputs=[audio_output, agent_response, history_display]
         )
@@ -424,7 +471,7 @@ def create_interface():
         ---
         **JamboGPT** - Making AI Accessible to African Languages
-        🔗 [GitHub](https://github.com/stano03/jambogpt) | 📊 [Dataset](https://huggingface.co/datasets/stano03/jambogpt-real-dataset) | 🤖 [Model](https://huggingface.co/stano03/jambogpt-swahili-tts-v1)
         """)
     return demo

 #!/usr/bin/env python3
 """
 JamboGPT - African Language AI Voice Agent
+Multiple TTS Models for Kiswahili & Kikuyu
 """
 import gradio as gr
 # Set device
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# Language configurations with multiple TTS models
 LANGUAGES = {
     "Swahili": {
         "emoji": "🇰🇪",
         "speakers": "100M+",
         "region": "East Africa",
+        "tts_models": [
+            ("Benjamin-png/swahili-mms-tts-finetuned", "🌟 Benjamin TTS (Best Quality)"),
+            ("facebook/mms-tts-swh", "Meta MMS Swahili"),
+            ("multilingual-tts/F5-TTS-OpenBible-Swahili", "F5 TTS OpenBible"),
+            ("stano03/jambogpt-swahili-tts-v1", "JamboGPT Custom Model"),
+        ],
+        "default_model": "Benjamin-png/swahili-mms-tts-finetuned",
         "keywords": {
             "greeting": ["habari", "jambo", "salaam", "hello", "hi"],
             "thanks": ["asante", "thank", "shukran"],
         "emoji": "🇰🇪",
         "speakers": "7M",
         "region": "Kenya",
+        "tts_models": [
+            ("multilingual-tts/F5-TTS-OpenBible-Kikuyu", "🌟 F5 TTS OpenBible (Best)"),
+            ("facebook/mms-tts-kin", "Meta MMS Kikuyu"),
+            ("multilingual-tts/VITS-OpenBible-Kikuyu", "VITS OpenBible"),
+        ],
+        "default_model": "multilingual-tts/F5-TTS-OpenBible-Kikuyu",
         "keywords": {
             "greeting": ["wĩ", "mwega", "hello", "hi", "salaam"],
             "thanks": ["mwega", "thank", "asante"],
         "emoji": "🇳🇬",
         "speakers": "45M",
         "region": "West Africa",
+        "tts_models": [
+            ("facebook/mms-tts-yor", "Meta MMS Yoruba"),
+        ],
+        "default_model": "facebook/mms-tts-yor",
         "keywords": {
             "greeting": ["pele", "hello", "hi", "bawo"],
             "thanks": ["e ku", "thank", "ope"],
         "emoji": "🇳🇬",
         "speakers": "90M",
         "region": "West Africa",
+        "tts_models": [
+            ("facebook/mms-tts-hau", "Meta MMS Hausa"),
+        ],
+        "default_model": "facebook/mms-tts-hau",
         "keywords": {
             "greeting": ["sannu", "hello", "hi", "ina"],
             "thanks": ["nagode", "thank"],
         "emoji": "🇪🇹",
         "speakers": "32M",
         "region": "Horn of Africa",
+        "tts_models": [
+            ("facebook/mms-tts-amh", "Meta MMS Amharic"),
+        ],
+        "default_model": "facebook/mms-tts-amh",
         "keywords": {
             "greeting": ["ሰላም", "hello", "hi", "ሳላም"],
             "thanks": ["አመሰግናለሁ", "thank"],
         "emoji": "🇧🇯",
         "speakers": "2M",
         "region": "West Africa",
+        "tts_models": [
+            ("facebook/mms-tts-fon", "Meta MMS Fon"),
+        ],
+        "default_model": "facebook/mms-tts-fon",
         "keywords": {
             "greeting": ["bonjour", "hello", "hi"],
             "thanks": ["merci", "thank"],
         "emoji": "🇪🇹",
         "speakers": "40M",
         "region": "East Africa",
+        "tts_models": [
+            ("facebook/mms-tts-orm", "Meta MMS Oromo"),
+        ],
+        "default_model": "facebook/mms-tts-orm",
         "keywords": {
             "greeting": ["salaam", "hello", "hi"],
             "thanks": ["galataa", "thank"],
         "emoji": "🇸🇴",
         "speakers": "20M",
         "region": "East Africa",
+        "tts_models": [
+            ("facebook/mms-tts-som", "Meta MMS Somali"),
+        ],
+        "default_model": "facebook/mms-tts-som",
         "keywords": {
             "greeting": ["salaam", "hello", "hi"],
             "thanks": ["mahadsanid", "thank"],
         "emoji": "🇪🇷",
         "speakers": "7M",
         "region": "Horn of Africa",
+        "tts_models": [
+            ("facebook/mms-tts-tir", "Meta MMS Tigrinya"),
+        ],
+        "default_model": "facebook/mms-tts-tir",
         "keywords": {
             "greeting": ["ሰላም", "hello", "hi"],
             "thanks": ["አመሰግናለሁ", "thank"],
         "emoji": "🌍",
         "speakers": "1.5B",
         "region": "Global",
+        "tts_models": [
+            ("facebook/mms-tts-eng", "Meta MMS English"),
+        ],
+        "default_model": "facebook/mms-tts-eng",
         "keywords": {
             "greeting": ["hello", "hi", "hey", "greetings"],
             "thanks": ["thank", "thanks", "appreciate"],
 conversation_history = []
 model_cache = {}
+def load_tts_model(model_id):
+    """Load TTS model."""
     if model_id in model_cache:
         return model_cache[model_id]
     try:
+        print(f"Loading TTS model: {model_id}")
         synthesizer = pipeline(
             "text-to-speech",
             model=model_id,
         lang_config = LANGUAGES.get(language, {})
         responses = lang_config.get("responses", {})
         intent = detect_intent(text, language)
         response = responses.get(intent, responses.get("default", "I understand."))
         print(f"Error generating response: {e}")
         return "I understand. Can you say more?"
+def synthesize_speech(text, language, model_name):
+    """Convert text to speech using selected model."""
     if not text or not text.strip():
         return None
     try:
+        synthesizer = load_tts_model(model_name)
         if synthesizer is None:
             return None
+        print(f"Generating speech with {model_name}: {text[:50]}...")
         speech = synthesizer(text)
         audio_array = np.array(speech["audio"]).flatten()
         print(f"Error synthesizing: {e}")
         return None
+def process_text_input(text, language, tts_model):
     """Process text input: generate response -> synthesize."""
     try:
         if not text:
             return None, "Please enter some text!", ""
         response_text = generate_response(text, language)
         if response_text is None:
             return None, "Error generating response", ""
+        audio_output = synthesize_speech(response_text, language, tts_model)
         conversation_history.append({
             "user": text,
             "agent": response_text,
             "language": language,
+            "model": tts_model,
             "timestamp": datetime.now().strftime("%H:%M:%S")
         })
         history_text = ""
         for msg in conversation_history[-5:]:
             history_text += f"[{msg['timestamp']}] {msg['language']}\n"
             history_text += f"You: {msg['user']}\n"
             history_text += f"Agent: {msg['agent']}\n\n"
         return audio_output, response_text, history_text
     except Exception as e:
         print(f"Error processing: {e}")
         gr.Markdown("""
         # 🌍 JamboGPT - African Language AI Voice Agent
+        **Chat with AI in 10 African languages with multiple voice options**
         Swahili • Kikuyu • Yoruba • Hausa • Amharic • Fon • Oromo • Somali • Tigrinya • English
         """)
                 f"🇰🇪 **Swahili** • 100M+ speakers • East Africa"
             )
+            # TTS Model selector (dynamic based on language)
+            tts_model_choice = gr.Dropdown(
+                choices=[("🌟 Benjamin TTS (Best Quality)", "Benjamin-png/swahili-mms-tts-finetuned"),
+                        ("Meta MMS Swahili", "facebook/mms-tts-swh"),
+                        ("F5 TTS OpenBible", "multilingual-tts/F5-TTS-OpenBible-Swahili"),
+                        ("JamboGPT Custom Model", "stano03/jambogpt-swahili-tts-v1")],
+                value="Benjamin-png/swahili-mms-tts-finetuned",
+                label="Select Voice Model",
+                interactive=True
+            )
             def update_language_info(language):
                 if language in LANGUAGES:
                     lang_data = LANGUAGES[language]
+                    models = lang_data.get("tts_models", [])
+                    # Update language info
+                    info_text = f"{lang_data['emoji']} **{language}** • {lang_data['speakers']} speakers • {lang_data['region']}"
+                    # Update model choices
+                    model_choices = models
+                    default_model = lang_data.get("default_model", models[0][0])
+                    return info_text, gr.Dropdown(choices=model_choices, value=default_model)
+                return "", gr.Dropdown(choices=[])
+            language_choice.change(
+                update_language_info,
+                inputs=language_choice,
+                outputs=[language_info, tts_model_choice]
+            )
             # Text input
             text_input = gr.Textbox(
                 interactive=False
             )
             history_display = gr.Textbox(
                 label="📝 Conversation History",
                 interactive=False,
         # Connect process button
         process_btn.click(
             fn=process_text_input,
+            inputs=[text_input, language_choice, tts_model_choice],
             outputs=[audio_output, agent_response, history_display]
         )
         ---
         **JamboGPT** - Making AI Accessible to African Languages
+        🔗 [GitHub](https://github.com/stano03/jambogpt) | 📊 [Dataset](https://huggingface.co/datasets/stano03/jambogpt-real-dataset) | 🤖 [Models](https://huggingface.co/stano03)
         """)
     return demo