#!/usr/bin/env python3
"""
JamboGPT - African Language AI Voice Agent
Multiple TTS Models for Kiswahili & Kikuyu
"""

import re
import tempfile
from datetime import datetime

import gradio as gr
import numpy as np
import torch
from scipy.io import wavfile
from transformers import pipeline

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Language configurations with multiple TTS models.
#
# Each entry maps a language name to:
#   emoji / speakers / region - display metadata
#   tts_models    - list of (model_id, display_label) pairs, in that order
#   default_model - model_id selected by default for the language
#   keywords      - intent name -> trigger words scanned by detect_intent()
#   responses     - intent name -> canned reply, plus a "default" fallback
#
# NOTE(review): many non-ASCII literals below (flag emoji, Kikuyu, Yoruba,
# Amharic, Tigrinya, French accents) look like UTF-8 text that was decoded
# through a single-byte code page. They are kept exactly as found; restore
# them from a known-good copy of this file rather than by hand.
LANGUAGES = {
    "Swahili": {
        "emoji": "πŸ‡°πŸ‡ͺ",
        "speakers": "100M+",
        "region": "East Africa",
        "tts_models": [
            ("Benjamin-png/swahili-mms-tts-finetuned", "🌟 JamboGPT Voice 1 (Best Quality)"),
            ("facebook/mms-tts-swh", "JamboGPT Voice 2"),
            ("multilingual-tts/F5-TTS-OpenBible-Swahili", "JamboGPT Voice 3"),
            ("stano03/jambogpt-swahili-tts-v1", "JamboGPT Voice 4 (Custom)"),
        ],
        "default_model": "Benjamin-png/swahili-mms-tts-finetuned",
        "keywords": {
            "greeting": ["habari", "jambo", "salaam", "hello", "hi"],
            "thanks": ["asante", "thank", "shukran"],
            "help": ["help", "msaada", "niweza"],
            "bye": ["kwaheri", "goodbye", "bye", "ciao"]
        },
        "responses": {
            "greeting": "Habari! Niko hapa kusaidia. Unajifunza nini leo?",
            "help": "Niweza kusaidia kwa swahili. Tafadhali niambie unajifunza nini.",
            "thanks": "Asante sana! Niko hapa kila wakati.",
            "bye": "Kwaheri! Karibu tena mwingine wakati.",
            "default": "Ndiyo, nimeelewa. Unaweza kusema zaidi?"
        }
    },
    "Kikuyu": {
        "emoji": "πŸ‡°πŸ‡ͺ",
        "speakers": "7M",
        "region": "Kenya",
        # NOTE(review): "kin" is the ISO 639-3 code for Kinyarwanda; Kikuyu
        # is "kik". Confirm that mms-tts-kin is the intended checkpoint.
        "tts_models": [
            ("facebook/mms-tts-kin", "🌟 JamboGPT Voice 1 (Best Quality)"),
            ("multilingual-tts/F5-TTS-OpenBible-Kikuyu", "JamboGPT Voice 2"),
            ("multilingual-tts/VITS-OpenBible-Kikuyu", "JamboGPT Voice 3"),
        ],
        "default_model": "facebook/mms-tts-kin",
        "keywords": {
            "greeting": ["wΔ©", "mwega", "hello", "hi", "salaam"],
            "thanks": ["mwega", "thank", "asante"],
            "help": ["help", "msaada"],
            "bye": ["rΔ©a", "goodbye", "bye"]
        },
        "responses": {
            "greeting": "WΔ© mwega! NΔ© Ε©ndΕ© Ε©rΔ©kΕ©?",
            "help": "NΔ© mwega! NΔ©kΔ©o kΔ©ndΕ© kΔ©rΔ©a Ε©rΔ© na kΔ©o?",
            "thanks": "Mwega muno! NΔ© mwega.",
            "bye": "RΔ©a rΔ©u! WΔ© mwega!",
            "default": "NΔ©guo mwega! WΔ© Ε©rΔ©a mwega?"
        }
    },
    "Yoruba": {
        "emoji": "πŸ‡³πŸ‡¬",
        "speakers": "45M",
        "region": "West Africa",
        "tts_models": [
            ("facebook/mms-tts-yor", "JamboGPT Voice 1"),
        ],
        "default_model": "facebook/mms-tts-yor",
        "keywords": {
            "greeting": ["pele", "hello", "hi", "bawo"],
            "thanks": ["e ku", "thank", "ope"],
            "help": ["help", "lowo"],
            "bye": ["daabo", "goodbye", "bye"]
        },
        "responses": {
            "greeting": "PαΊΉlαΊΉ o! Bawo ni o se?",
            "help": "Mo le lọwọ rαΊΉ. Kini nkan ti o nilo?",
            "thanks": "E ku ọpαΊΉ! αΊΈ kΓΊ Γ rọ!",
            "bye": "Γ“ dÑàbΓ²! αΊΈ kΓΊ ọjọ́!",
            "default": "Yoo, mo gbe e. Kini nkan ti o nilo?"
        }
    },
    "Hausa": {
        "emoji": "πŸ‡³πŸ‡¬",
        "speakers": "90M",
        "region": "West Africa",
        "tts_models": [
            ("facebook/mms-tts-hau", "JamboGPT Voice 1"),
        ],
        "default_model": "facebook/mms-tts-hau",
        "keywords": {
            "greeting": ["sannu", "hello", "hi", "ina"],
            "thanks": ["nagode", "thank"],
            "help": ["taimaka", "help"],
            "bye": ["sai", "goodbye", "bye"]
        },
        "responses": {
            "greeting": "Sannu! Ina kwana?",
            "help": "Ina iya taimakawa ka. Me na gida!",
            "thanks": "Nagode! Na gida!",
            "bye": "Sai anjima! Jiya!",
            "default": "I na gida. Me na gida?"
        }
    },
    "Amharic": {
        "emoji": "πŸ‡ͺπŸ‡Ή",
        "speakers": "32M",
        "region": "Horn of Africa",
        "tts_models": [
            ("facebook/mms-tts-amh", "JamboGPT Voice 1"),
        ],
        "default_model": "facebook/mms-tts-amh",
        "keywords": {
            "greeting": ["αˆ°αˆ‹αˆ", "hello", "hi", "αˆ³αˆ‹αˆ"],
            "thanks": ["αŠ αˆ˜αˆ°αŒαŠ“αˆˆαˆ", "thank"],
            "help": ["αˆšα‹›αŠ•", "help"],
            "bye": ["α‹°αˆ…αŠ“", "goodbye", "bye"]
        },
        "responses": {
            "greeting": "αˆ°αˆ‹αˆ! α‹°αˆ…αŠ“! αˆαŠ• α‹«αˆ΅αˆαˆαŒαˆƒαˆ?",
            "help": "αˆšα‹›αŠ•! αŠ₯αŠ•α‰³α‹­ α‰΅α‹°αˆα‹©?",
            "thanks": "αŠ αˆ˜αˆ°αŒαŠ“αˆˆαˆ! α‹°αˆ…αŠ“!",
            "bye": "αˆ°αˆ‹αˆ! αˆšα‹›αŠ•!",
            "default": "αˆ™αˆŠα’ αŠ₯αŠ•α‰³α‹­ α‰°α‹ˆαˆ³αŠΊ?"
        }
    },
    "Fon": {
        "emoji": "πŸ‡§πŸ‡―",
        "speakers": "2M",
        "region": "West Africa",
        "tts_models": [
            ("facebook/mms-tts-fon", "JamboGPT Voice 1"),
        ],
        "default_model": "facebook/mms-tts-fon",
        # NOTE(review): the keywords and replies for Fon are written in French.
        "keywords": {
            "greeting": ["bonjour", "hello", "hi"],
            "thanks": ["merci", "thank"],
            "help": ["aide", "help"],
            "bye": ["au revoir", "goodbye", "bye"]
        },
        "responses": {
            "greeting": "Bonjour! Comment allez-vous?",
            "help": "Je peux vous aider. Qu'est-ce que vous voulez?",
            "thanks": "Merci beaucoup! De rien!",
            "bye": "Au revoir! Γ€ bientΓ΄t!",
            "default": "Oui, je comprends. Quoi d'autre?"
        }
    },
    "Oromo": {
        "emoji": "πŸ‡ͺπŸ‡Ή",
        "speakers": "40M",
        "region": "East Africa",
        "tts_models": [
            ("facebook/mms-tts-orm", "JamboGPT Voice 1"),
        ],
        "default_model": "facebook/mms-tts-orm",
        "keywords": {
            "greeting": ["salaam", "hello", "hi"],
            "thanks": ["galataa", "thank"],
            "help": ["gargaarsa", "help"],
            "bye": ["nagaa", "goodbye", "bye"]
        },
        "responses": {
            "greeting": "Salaam! Akkam jirtaa?",
            "help": "Gargaarsa nan geedaru. Maal barbaadda?",
            "thanks": "Galataa! Nagaa!",
            "bye": "Nagaa! Haa jiraatin!",
            "default": "Eeyyee, hubadha. Maal biraa?"
        }
    },
    "Somali": {
        "emoji": "πŸ‡ΈπŸ‡΄",
        "speakers": "20M",
        "region": "East Africa",
        "tts_models": [
            ("facebook/mms-tts-som", "JamboGPT Voice 1"),
        ],
        "default_model": "facebook/mms-tts-som",
        "keywords": {
            "greeting": ["salaam", "hello", "hi"],
            "thanks": ["mahadsanid", "thank"],
            "help": ["caawi", "help"],
            "bye": ["nabad", "goodbye", "bye"]
        },
        "responses": {
            "greeting": "Salaam! Sidee tahay?",
            "help": "Waan kaa caawin karaa. Maxaa baahan?",
            "thanks": "Mahadsanid! Nabad!",
            "bye": "Nabad! Halkaa ku joog!",
            "default": "Hah, waan fahmay. Maxaa kale?"
        }
    },
    "Tigrinya": {
        "emoji": "πŸ‡ͺπŸ‡·",
        "speakers": "7M",
        "region": "Horn of Africa",
        "tts_models": [
            ("facebook/mms-tts-tir", "JamboGPT Voice 1"),
        ],
        "default_model": "facebook/mms-tts-tir",
        "keywords": {
            "greeting": ["αˆ°αˆ‹αˆ", "hello", "hi"],
            "thanks": ["αŠ αˆ˜αˆ°αŒαŠ“αˆˆαˆ", "thank"],
            "help": ["αˆšα‹›αŠ•", "help"],
            "bye": ["α‹°αˆ…αŠ“", "goodbye", "bye"]
        },
        "responses": {
            "greeting": "αˆ°αˆ‹αˆ! α‹΄αˆŒ ኒካ?",
            "help": "αˆšα‹›αŠ•! αŠ₯αŠ•α‰³α‹­ α‰΅α‹°αˆα‹©?",
            "thanks": "αŠ αˆ˜αˆ°αŒαŠ“αˆˆαˆ! α‹°αˆ…αŠ“!",
            "bye": "αˆ°αˆ‹αˆ! αˆšα‹›αŠ•!",
            "default": "αˆ™αˆŠα’ αŠ₯αŠ•α‰³α‹­ α‰°α‹ˆαˆ³αŠΊ?"
        }
    },
    "English": {
        "emoji": "🌍",
        "speakers": "1.5B",
        "region": "Global",
        "tts_models": [
            ("facebook/mms-tts-eng", "JamboGPT Voice 1"),
        ],
        "default_model": "facebook/mms-tts-eng",
        "keywords": {
            "greeting": ["hello", "hi", "hey", "greetings"],
            "thanks": ["thank", "thanks", "appreciate"],
            "help": ["help", "assist"],
            "bye": ["bye", "goodbye", "farewell"]
        },
        "responses": {
            "greeting": "Hello! How can I help you today?",
            "help": "I can help you with English. What would you like to know?",
            "thanks": "Thank you! Happy to help!",
            "bye": "Goodbye! See you later!",
            "default": "I understand. What else can I help you with?"
        }
    }
}

# Turns recorded by process_text_input(); module-level, so one list per process.
conversation_history = []
# model_id -> loaded text-to-speech pipeline, filled lazily by load_tts_model().
model_cache = {}


def load_tts_model(model_id):
    """Return a text-to-speech pipeline for *model_id*, loading it on first use.

    Successfully loaded pipelines are kept in ``model_cache`` so each model is
    only loaded once per process. Failed loads are not cached, so a later call
    tries again.

    Args:
        model_id: Model identifier passed to ``transformers.pipeline``.

    Returns:
        The pipeline object, or ``None`` if loading raised an exception (the
        error is printed rather than propagated).
    """
    if model_id in model_cache:
        return model_cache[model_id]

    try:
        print(f"Loading TTS model: {model_id}")
        synthesizer = pipeline(
            "text-to-speech",
            model=model_id,
            # pipeline() is given "cuda" when a GPU is available, else -1.
            device=device if device == "cuda" else -1
        )
        model_cache[model_id] = synthesizer
        return synthesizer
    except Exception as e:
        print(f"Error loading model {model_id}: {e}")
        return None


def detect_intent(text, language):
    """Detect user intent from text.

    Matching is case-insensitive. A keyword only counts when it begins at the
    start of a word, so short keywords such as "hi" no longer fire inside
    unrelated words like "this" or "nothing". Keywords still act as stems
    ("thank" matches "thanks"), and multi-word keywords such as "au revoir"
    are matched as written.

    Args:
        text: The user's message. ``None`` is treated as an empty message.
        language: Key into ``LANGUAGES``; unknown languages have no keywords.

    Returns:
        The first intent (in the order the keywords are declared) with a
        matching keyword, or ``"default"`` when nothing matches.
    """
    text_lower = (text or "").lower()
    keywords = LANGUAGES.get(language, {}).get("keywords", {})

    for intent, words in keywords.items():
        for word in words:
            # (?<!\w) rejects matches that start in the middle of a word.
            if re.search(r"(?<!\w)" + re.escape(word.lower()), text_lower):
                return intent

    return "default"


def generate_response(text, language):
    """Generate a canned response based on user input.

    Args:
        text: The user's message.
        language: Key into ``LANGUAGES`` selecting the reply table.

    Returns:
        The reply registered for the detected intent, falling back to the
        language's "default" reply, then to "I understand." when the language
        has no reply table. If anything raises, a generic English sentence is
        returned instead.
    """
    try:
        responses = LANGUAGES.get(language, {}).get("responses", {})
        intent = detect_intent(text, language)
        return responses.get(intent, responses.get("default", "I understand."))
    except Exception as e:
        print(f"Error generating response: {e}")
        return "I understand. Can you say more?"

def synthesize_speech(text, language, model_name):
    """Convert text to speech using the selected model.

    Args:
        text: Text to speak. Empty or whitespace-only text yields ``None``.
        language: Selected language name. Not used here; kept so the
            signature stays stable for callers.
        model_name: Model id handed to ``load_tts_model``.

    Returns:
        Path of a temporary 16-bit WAV file holding the audio, or ``None``
        when there is nothing to say, the model could not be loaded, or
        synthesis raised (the error is printed, not propagated).
    """
    if not text or not text.strip():
        return None

    try:
        synthesizer = load_tts_model(model_name)
        if synthesizer is None:
            return None

        print(f"Generating speech with {model_name}: {text[:50]}...")
        speech = synthesizer(text)

        audio_array = np.asarray(speech["audio"]).flatten()
        sample_rate = speech["sampling_rate"]
        # Clip first: samples outside [-1, 1] would overflow the int16 range
        # once scaled.
        pcm = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)

        # Reserve a path that outlives this call (delete=False), close our own
        # handle, then let scipy write the file.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            temp_path = f.name
        wavfile.write(temp_path, sample_rate, pcm)

        return temp_path
    except Exception as e:
        print(f"Error synthesizing: {e}")
        return None


def process_text_input(text, language, tts_model):
    """Process text input: generate response -> synthesize.

    Each successful turn is appended to the module-level
    ``conversation_history`` list. That list is shared by every call in the
    process, so all callers see the same history.

    Args:
        text: The user's message. Empty or whitespace-only input is rejected.
        language: Selected language name.
        tts_model: Model id used to voice the reply.

    Returns:
        ``(audio_path, response_text, history_text)``. ``audio_path`` is
        ``None`` when synthesis failed or input was rejected; ``history_text``
        renders the five most recent turns.
    """
    try:
        if not text or not text.strip():
            return None, "Please enter some text!", ""

        response_text = generate_response(text, language)
        if response_text is None:
            return None, "Error generating response", ""

        audio_output = synthesize_speech(response_text, language, tts_model)

        conversation_history.append({
            "user": text,
            "agent": response_text,
            "language": language,
            "model": tts_model,
            "timestamp": datetime.now().strftime("%H:%M:%S")
        })

        history_text = "".join(
            f"[{msg['timestamp']}] {msg['language']}\n"
            f"You: {msg['user']}\n"
            f"Agent: {msg['agent']}\n\n"
            for msg in conversation_history[-5:]
        )

        return audio_output, response_text, history_text
    except Exception as e:
        print(f"Error processing: {e}")
        return None, f"Error: {str(e)}", ""


def _language_summary(language):
    """Return the Markdown info line (emoji, name, speakers, region)."""
    lang_data = LANGUAGES[language]
    return f"{lang_data['emoji']} **{language}** β€’ {lang_data['speakers']} speakers β€’ {lang_data['region']}"


def _voice_choices(language):
    """Return Dropdown choices for *language* as (label, model_id) pairs.

    ``LANGUAGES`` stores each voice as (model_id, label); a Gradio Dropdown
    wants (displayed name, value), so the pair is swapped here.
    """
    models = LANGUAGES[language].get("tts_models", [])
    return [(label, model_id) for model_id, label in models]


def _default_voice(language):
    """Return the default model id for *language*, or ``None`` if it has none."""
    lang_data = LANGUAGES[language]
    models = lang_data.get("tts_models", [])
    fallback = models[0][0] if models else None
    return lang_data.get("default_model", fallback)


def create_interface():
    """Create the voice agent interface.

    Builds a Gradio Blocks app with a language selector, a voice-model
    selector that follows the chosen language, a text box, and outputs for
    the agent's reply, its audio and the recent history.

    Returns:
        The ``gr.Blocks`` instance, ready for ``launch()``.
    """
    initial_language = "Swahili"

    with gr.Blocks(
        title="JamboGPT - African Language AI Voice Agent",
        theme=gr.themes.Soft(primary_hue="purple")
    ) as demo:

        gr.Markdown("""
        # 🌍 JamboGPT - African Language AI Voice Agent
        **Chat with AI in 10 African languages with multiple voice options**

        Swahili β€’ Kikuyu β€’ Yoruba β€’ Hausa β€’ Amharic β€’ Fon β€’ Oromo β€’ Somali β€’ Tigrinya β€’ English
        """)

        with gr.Group():
            # Language selector
            language_choice = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value=initial_language,
                label="Select Language",
                interactive=True
            )

            # Language info
            language_info = gr.Markdown(_language_summary(initial_language))

            # TTS Model selector (dynamic based on language). The initial
            # state comes from the same helpers as later updates, so the two
            # cannot drift apart.
            tts_model_choice = gr.Dropdown(
                choices=_voice_choices(initial_language),
                value=_default_voice(initial_language),
                label="Select Voice Model",
                interactive=True
            )

            def update_language_info(language):
                """Refresh the info line and voice list for *language*."""
                if language not in LANGUAGES:
                    return "", gr.Dropdown(choices=[])

                return (
                    _language_summary(language),
                    gr.Dropdown(
                        choices=_voice_choices(language),
                        value=_default_voice(language),
                    ),
                )

            language_choice.change(
                update_language_info,
                inputs=language_choice,
                outputs=[language_info, tts_model_choice]
            )

            # Text input
            text_input = gr.Textbox(
                label="Type your message",
                placeholder="Type in your selected language...",
                lines=3,
                interactive=True
            )

            # Process button
            process_btn = gr.Button(
                "🎀 Generate Response",
                variant="primary",
                size="lg"
            )

        # Output section
        with gr.Group():
            agent_response = gr.Textbox(
                label="πŸ€– Agent Response",
                interactive=False,
                placeholder="The agent's response will appear here"
            )

            audio_output = gr.Audio(
                label="πŸ”Š Agent Voice",
                type="filepath",
                interactive=False
            )

            history_display = gr.Textbox(
                label="πŸ“ Conversation History",
                interactive=False,
                lines=4,
                placeholder="Your conversation history will appear here"
            )

        # Connect process button
        process_btn.click(
            fn=process_text_input,
            inputs=[text_input, language_choice, tts_model_choice],
            outputs=[audio_output, agent_response, history_display]
        )

        # Examples
        # NOTE(review): `fn` takes three arguments and returns three values,
        # but only two inputs and two outputs are wired here. With
        # cache_examples=False this presumably only fills the inputs; fix the
        # wiring before enabling example caching or run-on-click.
        gr.Examples(
            examples=[
                ["Habari, karibu sana!", "Swahili"],
                ["WΔ© mwega, karibu!", "Kikuyu"],
                ["PαΊΉlαΊΉ o, bawo ni o se?", "Yoruba"],
                ["Hello, how are you?", "English"],
            ],
            inputs=[text_input, language_choice],
            outputs=[audio_output, agent_response],
            fn=process_text_input,
            cache_examples=False,
        )

        gr.Markdown("""
        ---
        **JamboGPT** - Making AI Accessible to African Languages

        πŸ”— [GitHub](https://github.com/stano03/jambogpt) | πŸ“Š [Dataset](https://huggingface.co/datasets/stano03/jambogpt-real-dataset) | πŸ€– [Models](https://huggingface.co/stano03)
        """)

    return demo


if __name__ == "__main__":
    print("πŸš€ Creating JamboGPT Voice Agent Interface...")
    demo = create_interface()

    print("=" * 50)
    print("βœ… JamboGPT Voice Agent is ready!")
    print("=" * 50)

    # Listen on all interfaces on port 7860.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )