jambogpt / app.py
JamboGPT Bot
Rename all models to JamboGPT Voice 1-4 and fix Kikuyu default model to facebook/mms-tts-kin
eaa9481
#!/usr/bin/env python3
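"""
JamboGPT - African Language AI Voice Agent.

Gradio app that answers keyword-matched prompts with canned replies and speaks
them through a selectable text-to-speech model. Ten languages are configured;
Kiswahili and Kikuyu offer several TTS models, the others one each.
"""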
import os
import tempfile
from datetime import datetime

import gradio as gr
import numpy as np
import torch
from scipy.io import wavfile
from transformers import pipeline
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Language configurations with multiple TTS models.
#
# Maps the language name shown in the UI to its settings:
#   emoji / speakers / region - strings interpolated into the language info line
#   tts_models    - list of (Hugging Face model id, display label) tuples
#   default_model - model id preselected when the language is chosen
#   keywords      - intent name -> substrings that trigger the intent; intents are
#                   tried in the order listed, so an earlier intent wins on overlap
#   responses     - intent name -> canned reply; "default" is the fallback reply
#
# NOTE(review): tts_models tuples are (model_id, label), whereas Gradio dropdown
# choices are documented as (name, value) - confirm every consumer swaps them.
# NOTE(review): many non-ASCII literals below look mis-decoded (UTF-8 read through
# a legacy code page) - confirm the file encoding before trusting these strings.
LANGUAGES = {
"Swahili": {
"emoji": "πŸ‡°πŸ‡ͺ",
"speakers": "100M+",
"region": "East Africa",
"tts_models": [
("Benjamin-png/swahili-mms-tts-finetuned", "🌟 JamboGPT Voice 1 (Best Quality)"),
("facebook/mms-tts-swh", "JamboGPT Voice 2"),
("multilingual-tts/F5-TTS-OpenBible-Swahili", "JamboGPT Voice 3"),
("stano03/jambogpt-swahili-tts-v1", "JamboGPT Voice 4 (Custom)"),
],
"default_model": "Benjamin-png/swahili-mms-tts-finetuned",
"keywords": {
"greeting": ["habari", "jambo", "salaam", "hello", "hi"],
"thanks": ["asante", "thank", "shukran"],
"help": ["help", "msaada", "niweza"],
"bye": ["kwaheri", "goodbye", "bye", "ciao"]
},
"responses": {
"greeting": "Habari! Niko hapa kusaidia. Unajifunza nini leo?",
"help": "Niweza kusaidia kwa swahili. Tafadhali niambie unajifunza nini.",
"thanks": "Asante sana! Niko hapa kila wakati.",
"bye": "Kwaheri! Karibu tena mwingine wakati.",
"default": "Ndiyo, nimeelewa. Unaweza kusema zaidi?"
}
},
"Kikuyu": {
"emoji": "πŸ‡°πŸ‡ͺ",
"speakers": "7M",
"region": "Kenya",
# NOTE(review): "kin" is the ISO 639-3 code for Kinyarwanda; Kikuyu is "kik" -
# confirm that facebook/mms-tts-kin is really the intended Kikuyu voice.
"tts_models": [
("facebook/mms-tts-kin", "🌟 JamboGPT Voice 1 (Best Quality)"),
("multilingual-tts/F5-TTS-OpenBible-Kikuyu", "JamboGPT Voice 2"),
("multilingual-tts/VITS-OpenBible-Kikuyu", "JamboGPT Voice 3"),
],
"default_model": "facebook/mms-tts-kin",
# "mwega" is listed under both greeting and thanks; greeting comes first, so a
# message containing it is always classified as a greeting.
"keywords": {
"greeting": ["wΔ©", "mwega", "hello", "hi", "salaam"],
"thanks": ["mwega", "thank", "asante"],
"help": ["help", "msaada"],
"bye": ["rΔ©a", "goodbye", "bye"]
},
"responses": {
"greeting": "WΔ© mwega! NΔ© Ε©ndΕ© Ε©rΔ©kΕ©?",
"help": "NΔ© mwega! NΔ©kΔ©o kΔ©ndΕ© kΔ©rΔ©a Ε©rΔ© na kΔ©o?",
"thanks": "Mwega muno! NΔ© mwega.",
"bye": "RΔ©a rΔ©u! WΔ© mwega!",
"default": "NΔ©guo mwega! WΔ© Ε©rΔ©a mwega?"
}
},
"Yoruba": {
"emoji": "πŸ‡³πŸ‡¬",
"speakers": "45M",
"region": "West Africa",
"tts_models": [
("facebook/mms-tts-yor", "JamboGPT Voice 1"),
],
"default_model": "facebook/mms-tts-yor",
"keywords": {
"greeting": ["pele", "hello", "hi", "bawo"],
"thanks": ["e ku", "thank", "ope"],
"help": ["help", "lowo"],
"bye": ["daabo", "goodbye", "bye"]
},
"responses": {
"greeting": "PαΊΉlαΊΉ o! Bawo ni o se?",
"help": "Mo le lọwọ rẹ. Kini nkan ti o nilo?",
"thanks": "E ku ọpẹ! Ẹ kú àrọ!",
"bye": "Γ“ dÑàbΓ²! αΊΈ kΓΊ ọjọ́!",
"default": "Yoo, mo gbe e. Kini nkan ti o nilo?"
}
},
"Hausa": {
"emoji": "πŸ‡³πŸ‡¬",
"speakers": "90M",
"region": "West Africa",
"tts_models": [
("facebook/mms-tts-hau", "JamboGPT Voice 1"),
],
"default_model": "facebook/mms-tts-hau",
"keywords": {
"greeting": ["sannu", "hello", "hi", "ina"],
"thanks": ["nagode", "thank"],
"help": ["taimaka", "help"],
"bye": ["sai", "goodbye", "bye"]
},
"responses": {
"greeting": "Sannu! Ina kwana?",
"help": "Ina iya taimakawa ka. Me na gida!",
"thanks": "Nagode! Na gida!",
"bye": "Sai anjima! Jiya!",
"default": "I na gida. Me na gida?"
}
},
"Amharic": {
"emoji": "πŸ‡ͺπŸ‡Ή",
"speakers": "32M",
"region": "Horn of Africa",
"tts_models": [
("facebook/mms-tts-amh", "JamboGPT Voice 1"),
],
"default_model": "facebook/mms-tts-amh",
"keywords": {
"greeting": ["αˆ°αˆ‹αˆ", "hello", "hi", "αˆ³αˆ‹αˆ"],
"thanks": ["αŠ αˆ˜αˆ°αŒαŠ“αˆˆαˆ", "thank"],
"help": ["αˆšα‹›αŠ•", "help"],
"bye": ["α‹°αˆ…αŠ“", "goodbye", "bye"]
},
"responses": {
"greeting": "αˆ°αˆ‹αˆ! α‹°αˆ…αŠ“! αˆαŠ• α‹«αˆ΅αˆαˆαŒαˆƒαˆ?",
"help": "αˆšα‹›αŠ•! αŠ₯αŠ•α‰³α‹­ α‰΅α‹°αˆα‹©?",
"thanks": "αŠ αˆ˜αˆ°αŒαŠ“αˆˆαˆ! α‹°αˆ…αŠ“!",
"bye": "αˆ°αˆ‹αˆ! αˆšα‹›αŠ•!",
"default": "αˆ™αˆŠα’ αŠ₯αŠ•α‰³α‹­ α‰°α‹ˆαˆ³αŠΊ?"
}
},
"Fon": {
"emoji": "πŸ‡§πŸ‡―",
"speakers": "2M",
"region": "West Africa",
"tts_models": [
("facebook/mms-tts-fon", "JamboGPT Voice 1"),
],
"default_model": "facebook/mms-tts-fon",
# NOTE(review): the keywords and responses for this entry are French text, while
# the voice model is the Fon one - presumably placeholders; confirm.
"keywords": {
"greeting": ["bonjour", "hello", "hi"],
"thanks": ["merci", "thank"],
"help": ["aide", "help"],
"bye": ["au revoir", "goodbye", "bye"]
},
"responses": {
"greeting": "Bonjour! Comment allez-vous?",
"help": "Je peux vous aider. Qu'est-ce que vous voulez?",
"thanks": "Merci beaucoup! De rien!",
"bye": "Au revoir! Γ€ bientΓ΄t!",
"default": "Oui, je comprends. Quoi d'autre?"
}
},
"Oromo": {
"emoji": "πŸ‡ͺπŸ‡Ή",
"speakers": "40M",
"region": "East Africa",
"tts_models": [
("facebook/mms-tts-orm", "JamboGPT Voice 1"),
],
"default_model": "facebook/mms-tts-orm",
"keywords": {
"greeting": ["salaam", "hello", "hi"],
"thanks": ["galataa", "thank"],
"help": ["gargaarsa", "help"],
"bye": ["nagaa", "goodbye", "bye"]
},
"responses": {
"greeting": "Salaam! Akkam jirtaa?",
"help": "Gargaarsa nan geedaru. Maal barbaadda?",
"thanks": "Galataa! Nagaa!",
"bye": "Nagaa! Haa jiraatin!",
"default": "Eeyyee, hubadha. Maal biraa?"
}
},
"Somali": {
"emoji": "πŸ‡ΈπŸ‡΄",
"speakers": "20M",
"region": "East Africa",
"tts_models": [
("facebook/mms-tts-som", "JamboGPT Voice 1"),
],
"default_model": "facebook/mms-tts-som",
"keywords": {
"greeting": ["salaam", "hello", "hi"],
"thanks": ["mahadsanid", "thank"],
"help": ["caawi", "help"],
"bye": ["nabad", "goodbye", "bye"]
},
"responses": {
"greeting": "Salaam! Sidee tahay?",
"help": "Waan kaa caawin karaa. Maxaa baahan?",
"thanks": "Mahadsanid! Nabad!",
"bye": "Nabad! Halkaa ku joog!",
"default": "Hah, waan fahmay. Maxaa kale?"
}
},
"Tigrinya": {
"emoji": "πŸ‡ͺπŸ‡·",
"speakers": "7M",
"region": "Horn of Africa",
"tts_models": [
("facebook/mms-tts-tir", "JamboGPT Voice 1"),
],
"default_model": "facebook/mms-tts-tir",
"keywords": {
"greeting": ["αˆ°αˆ‹αˆ", "hello", "hi"],
"thanks": ["αŠ αˆ˜αˆ°αŒαŠ“αˆˆαˆ", "thank"],
"help": ["αˆšα‹›αŠ•", "help"],
"bye": ["α‹°αˆ…αŠ“", "goodbye", "bye"]
},
"responses": {
"greeting": "αˆ°αˆ‹αˆ! α‹΄αˆŒ ኒካ?",
"help": "αˆšα‹›αŠ•! αŠ₯αŠ•α‰³α‹­ α‰΅α‹°αˆα‹©?",
"thanks": "αŠ αˆ˜αˆ°αŒαŠ“αˆˆαˆ! α‹°αˆ…αŠ“!",
"bye": "αˆ°αˆ‹αˆ! αˆšα‹›αŠ•!",
"default": "αˆ™αˆŠα’ αŠ₯αŠ•α‰³α‹­ α‰°α‹ˆαˆ³αŠΊ?"
}
},
"English": {
"emoji": "🌍",
"speakers": "1.5B",
"region": "Global",
"tts_models": [
("facebook/mms-tts-eng", "JamboGPT Voice 1"),
],
"default_model": "facebook/mms-tts-eng",
"keywords": {
"greeting": ["hello", "hi", "hey", "greetings"],
"thanks": ["thank", "thanks", "appreciate"],
"help": ["help", "assist"],
"bye": ["bye", "goodbye", "farewell"]
},
"responses": {
"greeting": "Hello! How can I help you today?",
"help": "I can help you with English. What would you like to know?",
"thanks": "Thank you! Happy to help!",
"bye": "Goodbye! See you later!",
"default": "I understand. What else can I help you with?"
}
}
}
# Module-level log of exchanges: each processed message appends one entry, and
# the history panel renders the last five of them.
conversation_history: list = []
# Successfully built pipelines keyed by model id, so each model is built once.
model_cache = {}


def load_tts_model(model_id):
    """Return the text-to-speech pipeline for ``model_id``, building it on first use.

    Args:
        model_id: Model identifier handed to ``transformers.pipeline``.

    Returns:
        The cached or freshly built pipeline, or ``None`` when building it
        raised. Failures are not cached, so a later call tries again.
    """
    # Fast path: hand back a pipeline built by an earlier call.
    try:
        return model_cache[model_id]
    except KeyError:
        pass

    try:
        print(f"Loading TTS model: {model_id}")
        tts_pipeline = pipeline(
            "text-to-speech",
            model=model_id,
            # -1 is the pipeline's CPU device index.
            device=device if device == "cuda" else -1,
        )
    except Exception as e:
        print(f"Error loading model {model_id}: {e}")
        return None

    model_cache[model_id] = tts_pipeline
    return tts_pipeline
def detect_intent(text, language):
    """Classify ``text`` into one of the intents configured for ``language``.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside a longer word. Intents are tried in the order they are listed in
    the language's ``keywords`` table and the first one with a hit wins.

    Args:
        text: The user's message.
        language: Key into ``LANGUAGES``; unknown languages have no keywords.

    Returns:
        The matching intent name, or ``"default"`` when nothing matches.
    """
    haystack = text.lower()
    keyword_table = LANGUAGES.get(language, {}).get("keywords", {})
    matching_intents = (
        intent
        for intent, candidates in keyword_table.items()
        if any(candidate.lower() in haystack for candidate in candidates)
    )
    return next(matching_intents, "default")
def generate_response(text, language):
    """Pick the canned reply for ``text`` in ``language``.

    Args:
        text: The user's message.
        language: Key into ``LANGUAGES``.

    Returns:
        The reply configured for the detected intent, else the language's
        ``"default"`` reply, else ``"I understand."``. If anything raises
        along the way, a fixed English fallback sentence is returned.
    """
    try:
        reply_table = LANGUAGES.get(language, {}).get("responses", {})
        intent = detect_intent(text, language)
        if intent in reply_table:
            return reply_table[intent]
        return reply_table.get("default", "I understand.")
    except Exception as e:
        print(f"Error generating response: {e}")
        return "I understand. Can you say more?"
def synthesize_speech(text, language, model_name):
    """Convert ``text`` to speech with the selected model and save it as a WAV file.

    Args:
        text: Text to speak. Empty or whitespace-only text produces no audio.
        language: Unused here; kept so existing callers keep working.
        model_name: Model id passed to ``load_tts_model``.

    Returns:
        Path of a 16-bit PCM ``.wav`` temp file, or ``None`` when there is
        nothing to speak, the model cannot be loaded, or synthesis fails.
        The caller owns the returned file; nothing here deletes it.
    """
    if not text or not text.strip():
        return None

    temp_path = None
    try:
        synthesizer = load_tts_model(model_name)
        if synthesizer is None:
            return None

        print(f"Generating speech with {model_name}: {text[:50]}...")
        speech = synthesizer(text)
        audio_array = np.array(speech["audio"]).flatten()
        sample_rate = speech["sampling_rate"]

        # Clip before scaling: a sample outside [-1, 1] would otherwise wrap
        # around when cast to int16 and come out as a loud click.
        pcm = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)

        # Reserve a unique path, then close the handle before writing by name;
        # reopening a still-open NamedTemporaryFile is not portable.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            temp_path = f.name
        wavfile.write(temp_path, sample_rate, pcm)
        return temp_path
    except Exception as e:
        print(f"Error synthesizing: {e}")
        # Do not leave a half-written temp file behind.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass
        return None
def process_text_input(text, language, tts_model=None):
    """Handle one user message: pick a reply, speak it, and refresh the history.

    Args:
        text: The user's message. Empty or whitespace-only input is rejected.
        language: Key into ``LANGUAGES``.
        tts_model: Model id to speak with. When missing or empty, the
            language's ``default_model`` is used; if there is none either,
            the reply is returned without audio.

    Returns:
        ``(audio_path_or_None, reply_text, history_text)``. On bad input or
        an internal error the first item is ``None``, the second carries the
        message to show, and the third is empty.

    Note:
        ``conversation_history`` is a module-level list, so every caller in
        this process appends to, and sees, the same history.
    """
    try:
        if not text or not text.strip():
            return None, "Please enter some text!", ""

        # The dropdown can yield no selection (e.g. no choices available).
        if not tts_model:
            tts_model = LANGUAGES.get(language, {}).get("default_model")

        response_text = generate_response(text, language)
        if response_text is None:
            return None, "Error generating response", ""

        audio_output = (
            synthesize_speech(response_text, language, tts_model)
            if tts_model
            else None
        )

        conversation_history.append({
            "user": text,
            "agent": response_text,
            "language": language,
            "model": tts_model,
            "timestamp": datetime.now().strftime("%H:%M:%S"),
        })

        # Only the five most recent exchanges are rendered.
        history_text = "".join(
            f"[{msg['timestamp']}] {msg['language']}\n"
            f"You: {msg['user']}\n"
            f"Agent: {msg['agent']}\n\n"
            for msg in conversation_history[-5:]
        )
        return audio_output, response_text, history_text
    except Exception as e:
        print(f"Error processing: {e}")
        return None, f"Error: {str(e)}", ""
def create_interface():
    """Create the voice agent interface.

    Builds a ``gr.Blocks`` app with a language selector, a voice-model
    selector that follows the chosen language, a message box, and outputs
    for the reply text, the spoken reply and the recent history.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is expected to launch it.
    """

    def describe_language(language):
        """Return (info markdown, dropdown choices, default model id) for a language."""
        lang_data = LANGUAGES[language]
        models = lang_data.get("tts_models", [])
        info_text = f"{lang_data['emoji']} **{language}** β€’ {lang_data['speakers']} speakers β€’ {lang_data['region']}"
        # LANGUAGES stores (model_id, label); Gradio wants (name, value), so
        # swap them - otherwise the label would be sent as the model id.
        choices = [(label, model_id) for model_id, label in models]
        default_model = lang_data.get("default_model")
        if default_model is None and models:
            default_model = models[0][0]
        return info_text, choices, default_model

    # Initial state comes from the same table as later updates, so the two
    # cannot drift apart.
    initial_language = "Swahili"
    initial_info, initial_choices, initial_model = describe_language(initial_language)

    with gr.Blocks(
        title="JamboGPT - African Language AI Voice Agent",
        theme=gr.themes.Soft(primary_hue="purple"),
    ) as demo:
        gr.Markdown("""
# 🌍 JamboGPT - African Language AI Voice Agent
**Chat with AI in 10 African languages with multiple voice options**
Swahili β€’ Kikuyu β€’ Yoruba β€’ Hausa β€’ Amharic β€’ Fon β€’ Oromo β€’ Somali β€’ Tigrinya β€’ English
""")

        # NOTE(review): which components sit inside each gr.Group is assumed
        # (inputs in the first, outputs in the second) - confirm the layout.
        with gr.Group():
            # Language selector
            language_choice = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value=initial_language,
                label="Select Language",
                interactive=True,
            )

            # Language info
            language_info = gr.Markdown(initial_info)

            # TTS Model selector (dynamic based on language)
            tts_model_choice = gr.Dropdown(
                choices=initial_choices,
                value=initial_model,
                label="Select Voice Model",
                interactive=True,
            )

            def update_language_info(language):
                """Refresh the info line and the voice choices for ``language``."""
                if language not in LANGUAGES:
                    return "", gr.Dropdown(choices=[])
                info_text, choices, default_model = describe_language(language)
                return info_text, gr.Dropdown(choices=choices, value=default_model)

            language_choice.change(
                update_language_info,
                inputs=language_choice,
                outputs=[language_info, tts_model_choice],
            )

            # Text input
            text_input = gr.Textbox(
                label="Type your message",
                placeholder="Type in your selected language...",
                lines=3,
                interactive=True,
            )

            # Process button
            process_btn = gr.Button(
                "🎀 Generate Response",
                variant="primary",
                size="lg",
            )

        # Output section
        with gr.Group():
            agent_response = gr.Textbox(
                label="πŸ€– Agent Response",
                interactive=False,
                placeholder="The agent's response will appear here",
            )
            audio_output = gr.Audio(
                label="πŸ”Š Agent Voice",
                type="filepath",
                interactive=False,
            )
            history_display = gr.Textbox(
                label="πŸ“ Conversation History",
                interactive=False,
                lines=4,
                placeholder="Your conversation history will appear here",
            )

        # Connect process button
        process_btn.click(
            fn=process_text_input,
            inputs=[text_input, language_choice, tts_model_choice],
            outputs=[audio_output, agent_response, history_display],
        )

        # Examples only pre-fill the inputs. No fn/outputs are passed: with
        # caching off they are never invoked, and the handler takes three
        # inputs and returns three outputs, which a two-column example
        # cannot supply.
        gr.Examples(
            examples=[
                ["Habari, karibu sana!", "Swahili"],
                ["WΔ© mwega, karibu!", "Kikuyu"],
                ["PαΊΉlαΊΉ o, bawo ni o se?", "Yoruba"],
                ["Hello, how are you?", "English"],
            ],
            inputs=[text_input, language_choice],
            cache_examples=False,
        )

        gr.Markdown("""
---
**JamboGPT** - Making AI Accessible to African Languages
πŸ”— [GitHub](https://github.com/stano03/jambogpt) | πŸ“Š [Dataset](https://huggingface.co/datasets/stano03/jambogpt-real-dataset) | πŸ€– [Models](https://huggingface.co/stano03)
""")

    return demo
if __name__ == "__main__":
    # Script entry point: build the UI, print a banner, then serve it.
    banner = "=" * 50
    launch_options = {
        "server_name": "0.0.0.0",  # listen on all interfaces
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }

    print("πŸš€ Creating JamboGPT Voice Agent Interface...")
    demo = create_interface()
    print(banner)
    print("βœ… JamboGPT Voice Agent is ready!")
    print(banner)
    demo.launch(**launch_options)