Spaces:

stano03
/

jambogpt

Sleeping

JamboGPT Bot

Rename all models to JamboGPT Voice 1-4 and fix Kikuyu default model to facebook/mms-tts-kin

eaa9481 4 days ago

17.4 kB

	#!/usr/bin/env python3
	"""
	JamboGPT - African Language AI Voice Agent
	Multiple TTS Models for Kiswahili & Kikuyu
	"""

	import gradio as gr
	from datetime import datetime
	import torch
	from transformers import pipeline
	import numpy as np
	from scipy.io import wavfile
	import tempfile

	# Run inference on the GPU when PyTorch reports one, otherwise on the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# Per-language configuration, keyed by the display name shown in the UI.
	# Every entry carries the same fields:
	#   emoji / speakers / region - strings shown in the language info line
	#   tts_models    - list of (model id, display label) tuples
	#   default_model - model id selected when the language is chosen
	#   keywords      - intent name -> trigger words used by intent detection
	#   responses     - intent name -> canned reply (plus a "default" reply)
	LANGUAGES = {
	    "Swahili": {
	        "emoji": "🇰🇪",
	        "speakers": "100M+",
	        "region": "East Africa",
	        "tts_models": [
	            ("Benjamin-png/swahili-mms-tts-finetuned", "🌟 JamboGPT Voice 1 (Best Quality)"),
	            ("facebook/mms-tts-swh", "JamboGPT Voice 2"),
	            ("multilingual-tts/F5-TTS-OpenBible-Swahili", "JamboGPT Voice 3"),
	            ("stano03/jambogpt-swahili-tts-v1", "JamboGPT Voice 4 (Custom)"),
	        ],
	        "default_model": "Benjamin-png/swahili-mms-tts-finetuned",
	        "keywords": {
	            "greeting": ["habari", "jambo", "salaam", "hello", "hi"],
	            "thanks": ["asante", "thank", "shukran"],
	            "help": ["help", "msaada", "niweza"],
	            "bye": ["kwaheri", "goodbye", "bye", "ciao"],
	        },
	        "responses": {
	            "greeting": "Habari! Niko hapa kusaidia. Unajifunza nini leo?",
	            "help": "Niweza kusaidia kwa swahili. Tafadhali niambie unajifunza nini.",
	            "thanks": "Asante sana! Niko hapa kila wakati.",
	            "bye": "Kwaheri! Karibu tena mwingine wakati.",
	            "default": "Ndiyo, nimeelewa. Unaweza kusema zaidi?",
	        },
	    },
	    "Kikuyu": {
	        "emoji": "🇰🇪",
	        "speakers": "7M",
	        "region": "Kenya",
	        # NOTE(review): "kin" is the ISO 639-3 code for Kinyarwanda, while
	        # Kikuyu is "kik" - confirm this checkpoint is the intended voice.
	        "tts_models": [
	            ("facebook/mms-tts-kin", "🌟 JamboGPT Voice 1 (Best Quality)"),
	            ("multilingual-tts/F5-TTS-OpenBible-Kikuyu", "JamboGPT Voice 2"),
	            ("multilingual-tts/VITS-OpenBible-Kikuyu", "JamboGPT Voice 3"),
	        ],
	        "default_model": "facebook/mms-tts-kin",
	        "keywords": {
	            "greeting": ["wĩ", "mwega", "hello", "hi", "salaam"],
	            "thanks": ["mwega", "thank", "asante"],
	            "help": ["help", "msaada"],
	            "bye": ["rĩa", "goodbye", "bye"],
	        },
	        "responses": {
	            "greeting": "Wĩ mwega! Nĩ ũndũ ũrĩkũ?",
	            "help": "Nĩ mwega! Nĩkĩo kĩndũ kĩrĩa ũrĩ na kĩo?",
	            "thanks": "Mwega muno! Nĩ mwega.",
	            "bye": "Rĩa rĩu! Wĩ mwega!",
	            "default": "Nĩguo mwega! Wĩ ũrĩa mwega?",
	        },
	    },
	    "Yoruba": {
	        "emoji": "🇳🇬",
	        "speakers": "45M",
	        "region": "West Africa",
	        "tts_models": [
	            ("facebook/mms-tts-yor", "JamboGPT Voice 1"),
	        ],
	        "default_model": "facebook/mms-tts-yor",
	        "keywords": {
	            "greeting": ["pele", "hello", "hi", "bawo"],
	            "thanks": ["e ku", "thank", "ope"],
	            "help": ["help", "lowo"],
	            "bye": ["daabo", "goodbye", "bye"],
	        },
	        "responses": {
	            "greeting": "Pẹlẹ o! Bawo ni o se?",
	            "help": "Mo le lọwọ rẹ. Kini nkan ti o nilo?",
	            "thanks": "E ku ọpẹ! Ẹ kú àrọ!",
	            "bye": "Ó dáàbò! Ẹ kú ọjọ́!",
	            "default": "Yoo, mo gbe e. Kini nkan ti o nilo?",
	        },
	    },
	    "Hausa": {
	        "emoji": "🇳🇬",
	        "speakers": "90M",
	        "region": "West Africa",
	        "tts_models": [
	            ("facebook/mms-tts-hau", "JamboGPT Voice 1"),
	        ],
	        "default_model": "facebook/mms-tts-hau",
	        "keywords": {
	            "greeting": ["sannu", "hello", "hi", "ina"],
	            "thanks": ["nagode", "thank"],
	            "help": ["taimaka", "help"],
	            "bye": ["sai", "goodbye", "bye"],
	        },
	        "responses": {
	            "greeting": "Sannu! Ina kwana?",
	            "help": "Ina iya taimakawa ka. Me na gida!",
	            "thanks": "Nagode! Na gida!",
	            "bye": "Sai anjima! Jiya!",
	            "default": "I na gida. Me na gida?",
	        },
	    },
	    "Amharic": {
	        "emoji": "🇪🇹",
	        "speakers": "32M",
	        "region": "Horn of Africa",
	        "tts_models": [
	            ("facebook/mms-tts-amh", "JamboGPT Voice 1"),
	        ],
	        "default_model": "facebook/mms-tts-amh",
	        "keywords": {
	            "greeting": ["ሰላም", "hello", "hi", "ሳላም"],
	            "thanks": ["አመሰግናለሁ", "thank"],
	            "help": ["ሚዛን", "help"],
	            "bye": ["ደህና", "goodbye", "bye"],
	        },
	        "responses": {
	            "greeting": "ሰላም! ደህና! ምን ያስፈልግሃል?",
	            "help": "ሚዛን! እንታይ ትደልዩ?",
	            "thanks": "አመሰግናለሁ! ደህና!",
	            "bye": "ሰላም! ሚዛን!",
	            "default": "ሙሊ። እንታይ ተወሳኺ?",
	        },
	    },
	    "Fon": {
	        "emoji": "🇧🇯",
	        "speakers": "2M",
	        "region": "West Africa",
	        "tts_models": [
	            ("facebook/mms-tts-fon", "JamboGPT Voice 1"),
	        ],
	        "default_model": "facebook/mms-tts-fon",
	        # NOTE(review): the keywords and responses below look French
	        # rather than Fon - confirm this is intentional.
	        "keywords": {
	            "greeting": ["bonjour", "hello", "hi"],
	            "thanks": ["merci", "thank"],
	            "help": ["aide", "help"],
	            "bye": ["au revoir", "goodbye", "bye"],
	        },
	        "responses": {
	            "greeting": "Bonjour! Comment allez-vous?",
	            "help": "Je peux vous aider. Qu'est-ce que vous voulez?",
	            "thanks": "Merci beaucoup! De rien!",
	            "bye": "Au revoir! À bientôt!",
	            "default": "Oui, je comprends. Quoi d'autre?",
	        },
	    },
	    "Oromo": {
	        "emoji": "🇪🇹",
	        "speakers": "40M",
	        "region": "East Africa",
	        "tts_models": [
	            ("facebook/mms-tts-orm", "JamboGPT Voice 1"),
	        ],
	        "default_model": "facebook/mms-tts-orm",
	        "keywords": {
	            "greeting": ["salaam", "hello", "hi"],
	            "thanks": ["galataa", "thank"],
	            "help": ["gargaarsa", "help"],
	            "bye": ["nagaa", "goodbye", "bye"],
	        },
	        "responses": {
	            "greeting": "Salaam! Akkam jirtaa?",
	            "help": "Gargaarsa nan geedaru. Maal barbaadda?",
	            "thanks": "Galataa! Nagaa!",
	            "bye": "Nagaa! Haa jiraatin!",
	            "default": "Eeyyee, hubadha. Maal biraa?",
	        },
	    },
	    "Somali": {
	        "emoji": "🇸🇴",
	        "speakers": "20M",
	        "region": "East Africa",
	        "tts_models": [
	            ("facebook/mms-tts-som", "JamboGPT Voice 1"),
	        ],
	        "default_model": "facebook/mms-tts-som",
	        "keywords": {
	            "greeting": ["salaam", "hello", "hi"],
	            "thanks": ["mahadsanid", "thank"],
	            "help": ["caawi", "help"],
	            "bye": ["nabad", "goodbye", "bye"],
	        },
	        "responses": {
	            "greeting": "Salaam! Sidee tahay?",
	            "help": "Waan kaa caawin karaa. Maxaa baahan?",
	            "thanks": "Mahadsanid! Nabad!",
	            "bye": "Nabad! Halkaa ku joog!",
	            "default": "Hah, waan fahmay. Maxaa kale?",
	        },
	    },
	    "Tigrinya": {
	        "emoji": "🇪🇷",
	        "speakers": "7M",
	        "region": "Horn of Africa",
	        "tts_models": [
	            ("facebook/mms-tts-tir", "JamboGPT Voice 1"),
	        ],
	        "default_model": "facebook/mms-tts-tir",
	        # NOTE(review): the thanks/help/bye keywords are identical to the
	        # Amharic entry - confirm they are valid Tigrinya.
	        "keywords": {
	            "greeting": ["ሰላም", "hello", "hi"],
	            "thanks": ["አመሰግናለሁ", "thank"],
	            "help": ["ሚዛን", "help"],
	            "bye": ["ደህና", "goodbye", "bye"],
	        },
	        "responses": {
	            "greeting": "ሰላም! ዴሌ ኢካ?",
	            "help": "ሚዛን! እንታይ ትደልዩ?",
	            "thanks": "አመሰግናለሁ! ደህና!",
	            "bye": "ሰላም! ሚዛን!",
	            "default": "ሙሊ። እንታይ ተወሳኺ?",
	        },
	    },
	    "English": {
	        "emoji": "🌍",
	        "speakers": "1.5B",
	        "region": "Global",
	        "tts_models": [
	            ("facebook/mms-tts-eng", "JamboGPT Voice 1"),
	        ],
	        "default_model": "facebook/mms-tts-eng",
	        "keywords": {
	            "greeting": ["hello", "hi", "hey", "greetings"],
	            "thanks": ["thank", "thanks", "appreciate"],
	            "help": ["help", "assist"],
	            "bye": ["bye", "goodbye", "farewell"],
	        },
	        "responses": {
	            "greeting": "Hello! How can I help you today?",
	            "help": "I can help you with English. What would you like to know?",
	            "thanks": "Thank you! Happy to help!",
	            "bye": "Goodbye! See you later!",
	            "default": "I understand. What else can I help you with?",
	        },
	    },
	}

	# Loaded TTS pipelines keyed by model id, so each model is built only once.
	model_cache = {}
	# Every processed exchange, appended in order; module-level, so one list is
	# shared by the whole process.
	conversation_history = []

	def load_tts_model(model_id):
	    """Return the text-to-speech pipeline for ``model_id``.

	    Pipelines are kept in the module-level ``model_cache`` so a model is
	    only built once per process. Any error raised while building the
	    pipeline is printed and reported to the caller as ``None``; failures
	    are not cached, so a later call tries again.
	    """
	    try:
	        return model_cache[model_id]
	    except KeyError:
	        pass  # not loaded yet - build it below

	    try:
	        print(f"Loading TTS model: {model_id}")
	        # The pipeline receives "cuda" on GPU and -1 for anything else.
	        target_device = device if device == "cuda" else -1
	        tts_pipeline = pipeline(
	            "text-to-speech",
	            model=model_id,
	            device=target_device,
	        )
	        model_cache[model_id] = tts_pipeline
	        return tts_pipeline
	    except Exception as e:
	        print(f"Error loading model {model_id}: {e}")
	        return None

	def detect_intent(text, language):
	    """Classify ``text`` into one of the intents configured for ``language``.

	    The keyword lists come from ``LANGUAGES[language]["keywords"]`` and are
	    tried in their configured order; the first keyword found wins.
	    Matching is case-insensitive and anchored to the start of a word, so a
	    keyword still matches inflected forms ("thank" matches "thanks") but no
	    longer fires on fragments inside unrelated words ("hi" in "this").
	    Keywords of one or two characters must match a whole word, because a
	    prefix that short is too ambiguous ("hi" vs "his").

	    Args:
	        text: The user's message.
	        language: Key into ``LANGUAGES``; unknown languages have no keywords.

	    Returns:
	        The matching intent name, or ``"default"`` when nothing matches.
	    """
	    import re  # local import keeps this block self-contained

	    keywords = LANGUAGES.get(language, {}).get("keywords", {})
	    haystack = text.lower()

	    for intent, words in keywords.items():
	        for word in words:
	            needle = word.lower()
	            # (?<!\w) requires the keyword to start a word; short keywords
	            # additionally get (?!\w) so they must also end one.
	            tail = r"(?!\w)" if len(needle) <= 2 else ""
	            pattern = r"(?<!\w)" + re.escape(needle) + tail
	            if re.search(pattern, haystack):
	                return intent

	    return "default"

	def generate_response(text, language):
	    """Pick the canned reply for ``text`` in ``language``.

	    The intent returned by ``detect_intent`` selects a reply from
	    ``LANGUAGES[language]["responses"]``. A missing intent falls back to
	    the language's "default" reply, and then to "I understand.". Any
	    exception is printed and answered with a fixed English sentence.
	    """
	    try:
	        canned = LANGUAGES.get(language, {}).get("responses", {})
	        intent = detect_intent(text, language)

	        if intent in canned:
	            return canned[intent]
	        return canned.get("default", "I understand.")
	    except Exception as e:
	        print(f"Error generating response: {e}")
	        return "I understand. Can you say more?"

	def synthesize_speech(text, language, model_name):
	    """Synthesize ``text`` with the TTS model ``model_name`` into a WAV file.

	    Args:
	        text: Text to speak. Empty or whitespace-only text is rejected.
	        language: Language name; accepted for the caller's convenience but
	            not used by this function.
	        model_name: Model id handed to ``load_tts_model``.

	    Returns:
	        Path of a newly created ``.wav`` file containing 16-bit PCM audio,
	        or ``None`` when ``text`` is blank, the model cannot be loaded, or
	        synthesis fails (the error is printed). The file is created with
	        ``delete=False`` and is never removed here, so the caller owns it.
	    """
	    import os  # local import: only needed for cleanup on a failed write

	    if not text or not text.strip():
	        return None

	    try:
	        synthesizer = load_tts_model(model_name)
	        if synthesizer is None:
	            return None

	        print(f"Generating speech with {model_name}: {text[:50]}...")
	        speech = synthesizer(text)

	        audio_array = np.array(speech["audio"]).flatten()
	        sample_rate = speech["sampling_rate"]

	        # The 32767 scale assumes float samples nominally in [-1, 1].
	        # Clip first: a sample even slightly outside that range would
	        # otherwise wrap around when cast to int16 and produce a loud click.
	        pcm = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)

	        # Reserve a unique path and close the handle before writing, so the
	        # file is not opened twice at once.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
	            temp_path = f.name

	        try:
	            wavfile.write(temp_path, sample_rate, pcm)
	        except Exception:
	            # Do not leave an empty or partial file behind.
	            os.remove(temp_path)
	            raise

	        return temp_path
	    except Exception as e:
	        print(f"Error synthesizing: {e}")
	        return None

	def process_text_input(text, language, tts_model):
	    """Answer one user message: pick a reply, speak it, update the history.

	    Args:
	        text: The user's message. Empty or whitespace-only input is
	            rejected, matching the check in ``synthesize_speech``.
	        language: Language name forwarded to the response and speech steps.
	        tts_model: Model id forwarded to ``synthesize_speech``.

	    Returns:
	        A 3-tuple ``(audio_path, response_text, history_text)``.
	        ``audio_path`` is whatever ``synthesize_speech`` returned (possibly
	        ``None``); ``history_text`` renders the five most recent exchanges.
	        On rejected input or on any exception the tuple is
	        ``(None, <message>, "")``.
	    """
	    try:
	        if not text or not text.strip():
	            return None, "Please enter some text!", ""

	        response_text = generate_response(text, language)
	        if response_text is None:
	            return None, "Error generating response", ""

	        audio_output = synthesize_speech(response_text, language, tts_model)

	        # NOTE: conversation_history is module-level, so it is shared by
	        # every caller in this process and is only ever appended to.
	        conversation_history.append({
	            "user": text,
	            "agent": response_text,
	            "language": language,
	            "model": tts_model,
	            "timestamp": datetime.now().strftime("%H:%M:%S"),
	        })

	        history_text = "".join(
	            f"[{msg['timestamp']}] {msg['language']}\n"
	            f"You: {msg['user']}\n"
	            f"Agent: {msg['agent']}\n\n"
	            for msg in conversation_history[-5:]
	        )

	        return audio_output, response_text, history_text
	    except Exception as e:
	        print(f"Error processing: {e}")
	        return None, f"Error: {str(e)}", ""

	def create_interface():
	    """Build the JamboGPT Gradio interface and return it without launching.

	    The page holds a language dropdown, a language info line, a voice-model
	    dropdown that follows the selected language, a text box with a submit
	    button, and three outputs (reply text, reply audio, recent history).

	    Returns:
	        The ``gr.Blocks`` instance.
	    """
	    initial_language = "Swahili"

	    def voice_choices(language):
	        """Return dropdown choices as (label, model id) pairs."""
	        models = LANGUAGES.get(language, {}).get("tts_models", [])
	        # LANGUAGES stores (model id, label), but a Dropdown choice tuple is
	        # (label shown, value returned). Flip the pairs so the handlers
	        # receive the model id rather than the display label.
	        return [(label, model_id) for model_id, label in models]

	    def language_summary(language):
	        """Return the one-line emoji / speakers / region summary."""
	        lang_data = LANGUAGES[language]
	        return (
	            f"{lang_data['emoji']} {language} • "
	            f"{lang_data['speakers']} speakers • {lang_data['region']}"
	        )

	    def update_language_info(language):
	        """Refresh the info line and voice dropdown for ``language``."""
	        if language not in LANGUAGES:
	            return "", gr.Dropdown(choices=[])

	        choices = voice_choices(language)
	        # Only fall back to the first voice when one exists; a language
	        # with no models must not raise here.
	        first_model = choices[0][1] if choices else None
	        default_model = LANGUAGES[language].get("default_model", first_model)

	        return (
	            language_summary(language),
	            gr.Dropdown(choices=choices, value=default_model),
	        )

	    with gr.Blocks(
	        title="JamboGPT - African Language AI Voice Agent",
	        theme=gr.themes.Soft(primary_hue="purple"),
	    ) as demo:

	        gr.Markdown("""
	        # 🌍 JamboGPT - African Language AI Voice Agent

	        Chat with AI in 10 African languages with multiple voice options

	        Swahili • Kikuyu • Yoruba • Hausa • Amharic • Fon • Oromo • Somali • Tigrinya • English
	        """)

	        with gr.Group():
	            # Language selector
	            language_choice = gr.Dropdown(
	                choices=list(LANGUAGES.keys()),
	                value=initial_language,
	                label="Select Language",
	                interactive=True,
	            )

	            # Language info, derived from LANGUAGES so it cannot drift
	            language_info = gr.Markdown(language_summary(initial_language))

	            # TTS model selector (follows the selected language)
	            tts_model_choice = gr.Dropdown(
	                choices=voice_choices(initial_language),
	                value=LANGUAGES[initial_language]["default_model"],
	                label="Select Voice Model",
	                interactive=True,
	            )

	            language_choice.change(
	                update_language_info,
	                inputs=language_choice,
	                outputs=[language_info, tts_model_choice],
	            )

	            # Text input
	            text_input = gr.Textbox(
	                label="Type your message",
	                placeholder="Type in your selected language...",
	                lines=3,
	                interactive=True,
	            )

	            # Process button
	            process_btn = gr.Button(
	                "🎤 Generate Response",
	                variant="primary",
	                size="lg",
	            )

	        # Output section
	        with gr.Group():
	            agent_response = gr.Textbox(
	                label="🤖 Agent Response",
	                interactive=False,
	                placeholder="The agent's response will appear here",
	            )

	            audio_output = gr.Audio(
	                label="🔊 Agent Voice",
	                type="filepath",
	                interactive=False,
	            )

	            history_display = gr.Textbox(
	                label="📝 Conversation History",
	                interactive=False,
	                lines=4,
	                placeholder="Your conversation history will appear here",
	            )

	        # Connect process button
	        process_btn.click(
	            fn=process_text_input,
	            inputs=[text_input, language_choice, tts_model_choice],
	            outputs=[audio_output, agent_response, history_display],
	        )

	        # Examples
	        # NOTE(review): process_text_input takes three arguments and returns
	        # three values, but only two inputs and two outputs are wired here.
	        # That looks harmless only while the examples are neither cached nor
	        # run on click - confirm before enabling either.
	        gr.Examples(
	            examples=[
	                ["Habari, karibu sana!", "Swahili"],
	                ["Wĩ mwega, karibu!", "Kikuyu"],
	                ["Pẹlẹ o, bawo ni o se?", "Yoruba"],
	                ["Hello, how are you?", "English"],
	            ],
	            inputs=[text_input, language_choice],
	            outputs=[audio_output, agent_response],
	            fn=process_text_input,
	            cache_examples=False,
	        )

	        gr.Markdown("""
	        ---
	        JamboGPT - Making AI Accessible to African Languages

	        🔗 [GitHub](https://github.com/stano03/jambogpt) | 📊 [Dataset](https://huggingface.co/datasets/stano03/jambogpt-real-dataset) | 🤖 [Models](https://huggingface.co/stano03)
	        """)

	    return demo

	# Script entry point: build the interface, print a banner, start the server.
	if __name__ == "__main__":
	    print("🚀 Creating JamboGPT Voice Agent Interface...")
	    demo = create_interface()

	    divider = "=" * 50
	    print(divider)
	    print("✅ JamboGPT Voice Agent is ready!")
	    print(divider)

	    # Listen on all interfaces at port 7860, with no public share link.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	        "show_error": True,
	    }
	    demo.launch(**launch_options)