JamboGPT Bot
Rename all models to JamboGPT Voice 1-4 and fix Kikuyu default model to facebook/mms-tts-kin
eaa9481 | #!/usr/bin/env python3 | |
| """ | |
| JamboGPT - African Language AI Voice Agent | |
| Multiple TTS Models for Kiswahili & Kikuyu | |
| """ | |
| import gradio as gr | |
| from datetime import datetime | |
| import torch | |
| from transformers import pipeline | |
| import numpy as np | |
| from scipy.io import wavfile | |
| import tempfile | |
# Pick the compute device: CUDA when torch reports a usable GPU, otherwise CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
# Language configurations with multiple TTS models.
#
# Maps a language display name to its settings:
#   "emoji", "speakers", "region" - strings shown in the language info line
#   "tts_models"    - list of (model_id, display label) tuples
#   "default_model" - model id selected by default for the language
#   "keywords"      - intent name -> trigger words searched for in user text
#   "responses"     - intent name -> canned reply; "default" is the fallback
#
# NOTE(review): the non-ASCII literals below look mis-encoded (UTF-8 read
# through a single-byte code page) - confirm against the original file.
LANGUAGES = {
    "Swahili": {
        "emoji": "π°πͺ",
        "speakers": "100M+",
        "region": "East Africa",
        "tts_models": [
            ("Benjamin-png/swahili-mms-tts-finetuned", "π JamboGPT Voice 1 (Best Quality)"),
            ("facebook/mms-tts-swh", "JamboGPT Voice 2"),
            ("multilingual-tts/F5-TTS-OpenBible-Swahili", "JamboGPT Voice 3"),
            ("stano03/jambogpt-swahili-tts-v1", "JamboGPT Voice 4 (Custom)"),
        ],
        "default_model": "Benjamin-png/swahili-mms-tts-finetuned",
        "keywords": {
            "greeting": ["habari", "jambo", "salaam", "hello", "hi"],
            "thanks": ["asante", "thank", "shukran"],
            "help": ["help", "msaada", "niweza"],
            "bye": ["kwaheri", "goodbye", "bye", "ciao"]
        },
        "responses": {
            "greeting": "Habari! Niko hapa kusaidia. Unajifunza nini leo?",
            "help": "Niweza kusaidia kwa swahili. Tafadhali niambie unajifunza nini.",
            "thanks": "Asante sana! Niko hapa kila wakati.",
            "bye": "Kwaheri! Karibu tena mwingine wakati.",
            "default": "Ndiyo, nimeelewa. Unaweza kusema zaidi?"
        }
    },
    "Kikuyu": {
        "emoji": "π°πͺ",
        "speakers": "7M",
        "region": "Kenya",
        # NOTE(review): "kin" is the ISO 639-3 code for Kinyarwanda, while
        # Kikuyu is "kik" - confirm that mms-tts-kin is the intended checkpoint.
        "tts_models": [
            ("facebook/mms-tts-kin", "π JamboGPT Voice 1 (Best Quality)"),
            ("multilingual-tts/F5-TTS-OpenBible-Kikuyu", "JamboGPT Voice 2"),
            ("multilingual-tts/VITS-OpenBible-Kikuyu", "JamboGPT Voice 3"),
        ],
        "default_model": "facebook/mms-tts-kin",
        # "mwega" is listed under both "greeting" and "thanks"; whichever
        # intent is iterated first wins.
        "keywords": {
            "greeting": ["wΔ©", "mwega", "hello", "hi", "salaam"],
            "thanks": ["mwega", "thank", "asante"],
            "help": ["help", "msaada"],
            "bye": ["rΔ©a", "goodbye", "bye"]
        },
        "responses": {
            "greeting": "WΔ© mwega! NΔ© Ε©ndΕ© Ε©rΔ©kΕ©?",
            "help": "NΔ© mwega! NΔ©kΔ©o kΔ©ndΕ© kΔ©rΔ©a Ε©rΔ© na kΔ©o?",
            "thanks": "Mwega muno! NΔ© mwega.",
            "bye": "RΔ©a rΔ©u! WΔ© mwega!",
            "default": "NΔ©guo mwega! WΔ© Ε©rΔ©a mwega?"
        }
    },
    "Yoruba": {
        "emoji": "π³π¬",
        "speakers": "45M",
        "region": "West Africa",
        "tts_models": [
            ("facebook/mms-tts-yor", "JamboGPT Voice 1"),
        ],
        "default_model": "facebook/mms-tts-yor",
        "keywords": {
            "greeting": ["pele", "hello", "hi", "bawo"],
            "thanks": ["e ku", "thank", "ope"],
            "help": ["help", "lowo"],
            "bye": ["daabo", "goodbye", "bye"]
        },
        "responses": {
            "greeting": "PαΊΉlαΊΉ o! Bawo ni o se?",
            "help": "Mo le lα»wα» rαΊΉ. Kini nkan ti o nilo?",
            "thanks": "E ku α»pαΊΉ! αΊΈ kΓΊ Γ rα»!",
            "bye": "Γ dÑà bΓ²! αΊΈ kΓΊ α»jα»Μ!",
            "default": "Yoo, mo gbe e. Kini nkan ti o nilo?"
        }
    },
    "Hausa": {
        "emoji": "π³π¬",
        "speakers": "90M",
        "region": "West Africa",
        "tts_models": [
            ("facebook/mms-tts-hau", "JamboGPT Voice 1"),
        ],
        "default_model": "facebook/mms-tts-hau",
        "keywords": {
            "greeting": ["sannu", "hello", "hi", "ina"],
            "thanks": ["nagode", "thank"],
            "help": ["taimaka", "help"],
            "bye": ["sai", "goodbye", "bye"]
        },
        "responses": {
            "greeting": "Sannu! Ina kwana?",
            "help": "Ina iya taimakawa ka. Me na gida!",
            "thanks": "Nagode! Na gida!",
            "bye": "Sai anjima! Jiya!",
            "default": "I na gida. Me na gida?"
        }
    },
    "Amharic": {
        "emoji": "πͺπΉ",
        "speakers": "32M",
        "region": "Horn of Africa",
        "tts_models": [
            ("facebook/mms-tts-amh", "JamboGPT Voice 1"),
        ],
        "default_model": "facebook/mms-tts-amh",
        "keywords": {
            "greeting": ["α°αα", "hello", "hi", "α³αα"],
            "thanks": ["α αα°αααα", "thank"],
            "help": ["ααα", "help"],
            "bye": ["α°α α", "goodbye", "bye"]
        },
        "responses": {
            "greeting": "α°αα! α°α α! αα α«α΅ααααα?",
            "help": "ααα! α₯αα³α α΅α°αα©?",
            "thanks": "α αα°αααα! α°α α!",
            "bye": "α°αα! ααα!",
            "default": "ααα’ α₯αα³α α°αα³αΊ?"
        }
    },
    "Fon": {
        "emoji": "π§π―",
        "speakers": "2M",
        "region": "West Africa",
        "tts_models": [
            ("facebook/mms-tts-fon", "JamboGPT Voice 1"),
        ],
        "default_model": "facebook/mms-tts-fon",
        # NOTE(review): the keywords and replies for Fon are written in French
        # but are spoken with the Fon TTS model - confirm this is intended.
        "keywords": {
            "greeting": ["bonjour", "hello", "hi"],
            "thanks": ["merci", "thank"],
            "help": ["aide", "help"],
            "bye": ["au revoir", "goodbye", "bye"]
        },
        "responses": {
            "greeting": "Bonjour! Comment allez-vous?",
            "help": "Je peux vous aider. Qu'est-ce que vous voulez?",
            "thanks": "Merci beaucoup! De rien!",
            "bye": "Au revoir! Γ bientΓ΄t!",
            "default": "Oui, je comprends. Quoi d'autre?"
        }
    },
    "Oromo": {
        "emoji": "πͺπΉ",
        "speakers": "40M",
        "region": "East Africa",
        "tts_models": [
            ("facebook/mms-tts-orm", "JamboGPT Voice 1"),
        ],
        "default_model": "facebook/mms-tts-orm",
        "keywords": {
            "greeting": ["salaam", "hello", "hi"],
            "thanks": ["galataa", "thank"],
            "help": ["gargaarsa", "help"],
            "bye": ["nagaa", "goodbye", "bye"]
        },
        "responses": {
            "greeting": "Salaam! Akkam jirtaa?",
            "help": "Gargaarsa nan geedaru. Maal barbaadda?",
            "thanks": "Galataa! Nagaa!",
            "bye": "Nagaa! Haa jiraatin!",
            "default": "Eeyyee, hubadha. Maal biraa?"
        }
    },
    "Somali": {
        "emoji": "πΈπ΄",
        "speakers": "20M",
        "region": "East Africa",
        "tts_models": [
            ("facebook/mms-tts-som", "JamboGPT Voice 1"),
        ],
        "default_model": "facebook/mms-tts-som",
        "keywords": {
            "greeting": ["salaam", "hello", "hi"],
            "thanks": ["mahadsanid", "thank"],
            "help": ["caawi", "help"],
            "bye": ["nabad", "goodbye", "bye"]
        },
        "responses": {
            "greeting": "Salaam! Sidee tahay?",
            "help": "Waan kaa caawin karaa. Maxaa baahan?",
            "thanks": "Mahadsanid! Nabad!",
            "bye": "Nabad! Halkaa ku joog!",
            "default": "Hah, waan fahmay. Maxaa kale?"
        }
    },
    "Tigrinya": {
        "emoji": "πͺπ·",
        "speakers": "7M",
        "region": "Horn of Africa",
        "tts_models": [
            ("facebook/mms-tts-tir", "JamboGPT Voice 1"),
        ],
        "default_model": "facebook/mms-tts-tir",
        "keywords": {
            "greeting": ["α°αα", "hello", "hi"],
            "thanks": ["α αα°αααα", "thank"],
            "help": ["ααα", "help"],
            "bye": ["α°α α", "goodbye", "bye"]
        },
        "responses": {
            "greeting": "α°αα! α΄α α’α«?",
            "help": "ααα! α₯αα³α α΅α°αα©?",
            "thanks": "α αα°αααα! α°α α!",
            "bye": "α°αα! ααα!",
            "default": "ααα’ α₯αα³α α°αα³αΊ?"
        }
    },
    "English": {
        "emoji": "π",
        "speakers": "1.5B",
        "region": "Global",
        "tts_models": [
            ("facebook/mms-tts-eng", "JamboGPT Voice 1"),
        ],
        "default_model": "facebook/mms-tts-eng",
        "keywords": {
            "greeting": ["hello", "hi", "hey", "greetings"],
            "thanks": ["thank", "thanks", "appreciate"],
            "help": ["help", "assist"],
            "bye": ["bye", "goodbye", "farewell"]
        },
        "responses": {
            "greeting": "Hello! How can I help you today?",
            "help": "I can help you with English. What would you like to know?",
            "thanks": "Thank you! Happy to help!",
            "bye": "Goodbye! See you later!",
            "default": "I understand. What else can I help you with?"
        }
    }
}
# Every processed exchange, appended in arrival order (module-level state).
conversation_history = list()
# Successfully loaded TTS pipelines, keyed by model id, for reuse across calls.
model_cache = dict()
def load_tts_model(model_id):
    """Return the text-to-speech pipeline for ``model_id``, loading it on demand.

    A pipeline that loaded successfully is stored in ``model_cache`` and
    returned directly on later calls.

    Args:
        model_id: Model identifier handed to ``transformers.pipeline``.

    Returns:
        The pipeline object, or ``None`` when loading raised an exception
        (the error is printed and nothing is cached).
    """
    # Fast path: the pipeline has been built before.
    try:
        return model_cache[model_id]
    except KeyError:
        pass

    try:
        print(f"Loading TTS model: {model_id}")
        # The pipeline receives "cuda" for GPU runs and -1 otherwise.
        target_device = "cuda" if device == "cuda" else -1
        tts_pipeline = pipeline(
            "text-to-speech",
            model=model_id,
            device=target_device,
        )
    except Exception as e:
        print(f"Error loading model {model_id}: {e}")
        return None

    model_cache[model_id] = tts_pipeline
    return tts_pipeline
def detect_intent(text, language):
    """Detect the user's intent from keyword matches in ``text``.

    The keywords configured for ``language`` in ``LANGUAGES`` are checked
    intent by intent, in the order they are stored. The first keyword found
    decides the intent.

    A keyword only matches where a word starts. A short keyword such as
    "hi" therefore no longer fires inside unrelated words ("this",
    "nothing"), while stems still match their longer forms ("thank" matches
    "thanks").

    Args:
        text: The user's message. Empty or ``None`` yields ``"default"``.
        language: Key into ``LANGUAGES``. An unknown language has no keywords.

    Returns:
        The matching intent name, or ``"default"`` when nothing matches.
    """
    import re  # local import: ``re`` is not among the module-level imports

    if not text:
        return "default"

    text_lower = text.lower()
    keywords = LANGUAGES.get(language, {}).get("keywords", {})
    for intent, words in keywords.items():
        for word in words:
            if not word:
                # An empty keyword would match every message.
                continue
            # (?<!\w) requires the keyword to begin at a word boundary.
            if re.search(r"(?<!\w)" + re.escape(word.lower()), text_lower):
                return intent
    return "default"
def generate_response(text, language):
    """Pick the canned reply for ``text`` in the given ``language``.

    The intent comes from ``detect_intent``. The reply is looked up in the
    language's "responses" table, falling back to its "default" entry and
    then to "I understand.".

    Args:
        text: The user's message.
        language: Key into ``LANGUAGES``.

    Returns:
        The reply string. If anything raises, the error is printed and a
        fixed English fallback sentence is returned instead.
    """
    try:
        replies = LANGUAGES.get(language, {}).get("responses", {})
        intent = detect_intent(text, language)
        if intent in replies:
            return replies[intent]
        return replies.get("default", "I understand.")
    except Exception as e:
        print(f"Error generating response: {e}")
        return "I understand. Can you say more?"
def synthesize_speech(text, language, model_name):
    """Convert ``text`` to speech with the selected model and save it as WAV.

    Args:
        text: Text to speak. Empty or whitespace-only text yields ``None``.
        language: Language name; accepted for interface compatibility but
            not used by this function.
        model_name: Model id passed to ``load_tts_model``.

    Returns:
        Path of a 16-bit PCM ``.wav`` file, or ``None`` when there is no
        text, the model could not be loaded, or synthesis failed (the error
        is printed). The file is created with ``delete=False``, so this
        function never removes it.
    """
    if not text or not text.strip():
        return None
    try:
        synthesizer = load_tts_model(model_name)
        if synthesizer is None:
            return None
        print(f"Generating speech with {model_name}: {text[:50]}...")
        speech = synthesizer(text)
        audio_array = np.asarray(speech["audio"]).flatten()
        sample_rate = speech["sampling_rate"]
        # Clip before scaling: a sample outside [-1, 1] would otherwise wrap
        # around when cast to int16 and produce loud clicks.
        pcm = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)
        # Reserve a persistent path and close the handle before writing by
        # name, so only one handle touches the file.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            temp_path = f.name
        wavfile.write(temp_path, sample_rate, pcm)
        return temp_path
    except Exception as e:
        print(f"Error synthesizing: {e}")
        return None
def process_text_input(text, language, tts_model=None):
    """Process text input: generate a reply, synthesize it, update history.

    Args:
        text: The user's message. Empty or whitespace-only input is rejected.
        language: Key into ``LANGUAGES``.
        tts_model: Model id used for synthesis. When omitted or empty, the
            language's configured "default_model" is used, so callers that
            only know the text and the language can still call this.

    Returns:
        A ``(audio_path, response_text, history_text)`` tuple. ``audio_path``
        is ``None`` when synthesis failed or no model could be resolved.
        ``history_text`` lists the last five exchanges from the module-level
        ``conversation_history``. On error the tuple is
        ``(None, "Error: ...", "")``.
    """
    try:
        if not text or not text.strip():
            return None, "Please enter some text!", ""

        if not tts_model:
            tts_model = LANGUAGES.get(language, {}).get("default_model")

        response_text = generate_response(text, language)
        if response_text is None:
            return None, "Error generating response", ""

        # Without a model id the loader would be asked for model=None, so
        # synthesis is skipped instead.
        audio_output = None
        if tts_model:
            audio_output = synthesize_speech(response_text, language, tts_model)

        conversation_history.append({
            "user": text,
            "agent": response_text,
            "language": language,
            "model": tts_model,
            "timestamp": datetime.now().strftime("%H:%M:%S")
        })

        # Only the five most recent exchanges are shown.
        history_text = "".join(
            f"[{msg['timestamp']}] {msg['language']}\n"
            f"You: {msg['user']}\n"
            f"Agent: {msg['agent']}\n\n"
            for msg in conversation_history[-5:]
        )
        return audio_output, response_text, history_text
    except Exception as e:
        print(f"Error processing: {e}")
        return None, f"Error: {str(e)}", ""
def create_interface():
    """Create the voice agent interface.

    Builds a Gradio ``Blocks`` app with a language selector, a voice-model
    selector that follows the selected language, a text box, and outputs for
    the reply text, the synthesized audio and the recent conversation history.

    Dropdown choices are always given as ``(label, model_id)`` pairs, which
    is the ``(name, value)`` order ``gr.Dropdown`` expects. ``LANGUAGES``
    stores the pairs the other way round, so they are swapped here.

    Returns:
        The ``gr.Blocks`` instance, not yet launched.
    """
    initial_language = "Swahili"

    def language_summary(language):
        """Return the one-line markdown summary for a configured language."""
        lang_data = LANGUAGES[language]
        return f"{lang_data['emoji']} **{language}** β’ {lang_data['speakers']} speakers β’ {lang_data['region']}"

    def voice_choices(language):
        """Return the language's voices as (label, model_id) dropdown pairs."""
        return [
            (label, model_id)
            for model_id, label in LANGUAGES[language].get("tts_models", [])
        ]

    def default_voice(language):
        """Return the default model id, else the first voice, else None."""
        choices = voice_choices(language)
        first_model = choices[0][1] if choices else None
        return LANGUAGES[language].get("default_model", first_model)

    def update_language_info(language):
        """Refresh the info line and the voice dropdown for ``language``."""
        if language not in LANGUAGES:
            return "", gr.Dropdown(choices=[])
        return language_summary(language), gr.Dropdown(
            choices=voice_choices(language),
            value=default_voice(language),
        )

    def run_example(text, language):
        """Run an example row, which carries no voice, with the default voice."""
        default_model = LANGUAGES.get(language, {}).get("default_model")
        return process_text_input(text, language, default_model)

    # NOTE(review): the original indentation was lost, so the exact set of
    # widgets inside each gr.Group is assumed - confirm against the layout.
    with gr.Blocks(
        title="JamboGPT - African Language AI Voice Agent",
        theme=gr.themes.Soft(primary_hue="purple")
    ) as demo:
        gr.Markdown("""
# π JamboGPT - African Language AI Voice Agent
**Chat with AI in 10 African languages with multiple voice options**
Swahili β’ Kikuyu β’ Yoruba β’ Hausa β’ Amharic β’ Fon β’ Oromo β’ Somali β’ Tigrinya β’ English
""")
        with gr.Group():
            # Language selector
            language_choice = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value=initial_language,
                label="Select Language",
                interactive=True
            )
            # Language info
            language_info = gr.Markdown(language_summary(initial_language))
            # TTS model selector; its choices follow the selected language
            tts_model_choice = gr.Dropdown(
                choices=voice_choices(initial_language),
                value=default_voice(initial_language),
                label="Select Voice Model",
                interactive=True
            )
            language_choice.change(
                update_language_info,
                inputs=language_choice,
                outputs=[language_info, tts_model_choice]
            )
            # Text input
            text_input = gr.Textbox(
                label="Type your message",
                placeholder="Type in your selected language...",
                lines=3,
                interactive=True
            )
            # Process button
            process_btn = gr.Button(
                "π€ Generate Response",
                variant="primary",
                size="lg"
            )
        # Output section
        with gr.Group():
            agent_response = gr.Textbox(
                label="π€ Agent Response",
                interactive=False,
                placeholder="The agent's response will appear here"
            )
            audio_output = gr.Audio(
                label="π Agent Voice",
                type="filepath",
                interactive=False
            )
            history_display = gr.Textbox(
                label="π Conversation History",
                interactive=False,
                lines=4,
                placeholder="Your conversation history will appear here"
            )
        # Connect process button
        process_btn.click(
            fn=process_text_input,
            inputs=[text_input, language_choice, tts_model_choice],
            outputs=[audio_output, agent_response, history_display]
        )
        # Examples: each row supplies text and language only, so the wrapper
        # adds the default voice and all three outputs are listed.
        gr.Examples(
            examples=[
                ["Habari, karibu sana!", "Swahili"],
                ["WΔ© mwega, karibu!", "Kikuyu"],
                ["PαΊΉlαΊΉ o, bawo ni o se?", "Yoruba"],
                ["Hello, how are you?", "English"],
            ],
            inputs=[text_input, language_choice],
            outputs=[audio_output, agent_response, history_display],
            fn=run_example,
            cache_examples=False,
        )
        gr.Markdown("""
---
**JamboGPT** - Making AI Accessible to African Languages
π [GitHub](https://github.com/stano03/jambogpt) | π [Dataset](https://huggingface.co/datasets/stano03/jambogpt-real-dataset) | π€ [Models](https://huggingface.co/stano03)
""")
    return demo
if __name__ == "__main__":
    # Script entry point: build the UI, print a banner, then serve it.
    print("π Creating JamboGPT Voice Agent Interface...")
    demo = create_interface()
    separator = "=" * 50
    print(separator)
    print("β JamboGPT Voice Agent is ready!")
    print(separator)
    # Listen on all interfaces, port 7860, with no public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo.launch(**launch_options)