JamboGPT Bot committed on
Commit
6ddb91c
·
1 Parent(s): 504b4e1

Add multiple TTS models for Kiswahili and Kikuyu with voice selection

Browse files
Files changed (1) hide show
  1. app.py +87 -40
app.py CHANGED
@@ -1,7 +1,7 @@
1
  #!/usr/bin/env python3
2
  """
3
  JamboGPT - African Language AI Voice Agent
4
- Using Free Hugging Face TTS Models
5
  """
6
 
7
  import gradio as gr
@@ -15,13 +15,19 @@ import tempfile
15
  # Set device
16
  device = "cuda" if torch.cuda.is_available() else "cpu"
17
 
18
- # Language configurations with free HF TTS models
19
  LANGUAGES = {
20
  "Swahili": {
21
  "emoji": "πŸ‡°πŸ‡ͺ",
22
  "speakers": "100M+",
23
  "region": "East Africa",
24
- "tts_model": "facebook/mms-tts-swh",
 
 
 
 
 
 
25
  "keywords": {
26
  "greeting": ["habari", "jambo", "salaam", "hello", "hi"],
27
  "thanks": ["asante", "thank", "shukran"],
@@ -40,7 +46,12 @@ LANGUAGES = {
40
  "emoji": "πŸ‡°πŸ‡ͺ",
41
  "speakers": "7M",
42
  "region": "Kenya",
43
- "tts_model": "facebook/mms-tts-kin",
 
 
 
 
 
44
  "keywords": {
45
  "greeting": ["wΔ©", "mwega", "hello", "hi", "salaam"],
46
  "thanks": ["mwega", "thank", "asante"],
@@ -59,7 +70,10 @@ LANGUAGES = {
59
  "emoji": "πŸ‡³πŸ‡¬",
60
  "speakers": "45M",
61
  "region": "West Africa",
62
- "tts_model": "facebook/mms-tts-yor",
 
 
 
63
  "keywords": {
64
  "greeting": ["pele", "hello", "hi", "bawo"],
65
  "thanks": ["e ku", "thank", "ope"],
@@ -78,7 +92,10 @@ LANGUAGES = {
78
  "emoji": "πŸ‡³πŸ‡¬",
79
  "speakers": "90M",
80
  "region": "West Africa",
81
- "tts_model": "facebook/mms-tts-hau",
 
 
 
82
  "keywords": {
83
  "greeting": ["sannu", "hello", "hi", "ina"],
84
  "thanks": ["nagode", "thank"],
@@ -97,7 +114,10 @@ LANGUAGES = {
97
  "emoji": "πŸ‡ͺπŸ‡Ή",
98
  "speakers": "32M",
99
  "region": "Horn of Africa",
100
- "tts_model": "facebook/mms-tts-amh",
 
 
 
101
  "keywords": {
102
  "greeting": ["αˆ°αˆ‹αˆ", "hello", "hi", "αˆ³αˆ‹αˆ"],
103
  "thanks": ["αŠ αˆ˜αˆ°αŒαŠ“αˆˆαˆ", "thank"],
@@ -116,7 +136,10 @@ LANGUAGES = {
116
  "emoji": "πŸ‡§πŸ‡―",
117
  "speakers": "2M",
118
  "region": "West Africa",
119
- "tts_model": "facebook/mms-tts-fon",
 
 
 
120
  "keywords": {
121
  "greeting": ["bonjour", "hello", "hi"],
122
  "thanks": ["merci", "thank"],
@@ -135,7 +158,10 @@ LANGUAGES = {
135
  "emoji": "πŸ‡ͺπŸ‡Ή",
136
  "speakers": "40M",
137
  "region": "East Africa",
138
- "tts_model": "facebook/mms-tts-orm",
 
 
 
139
  "keywords": {
140
  "greeting": ["salaam", "hello", "hi"],
141
  "thanks": ["galataa", "thank"],
@@ -154,7 +180,10 @@ LANGUAGES = {
154
  "emoji": "πŸ‡ΈπŸ‡΄",
155
  "speakers": "20M",
156
  "region": "East Africa",
157
- "tts_model": "facebook/mms-tts-som",
 
 
 
158
  "keywords": {
159
  "greeting": ["salaam", "hello", "hi"],
160
  "thanks": ["mahadsanid", "thank"],
@@ -173,7 +202,10 @@ LANGUAGES = {
173
  "emoji": "πŸ‡ͺπŸ‡·",
174
  "speakers": "7M",
175
  "region": "Horn of Africa",
176
- "tts_model": "facebook/mms-tts-tir",
 
 
 
177
  "keywords": {
178
  "greeting": ["αˆ°αˆ‹αˆ", "hello", "hi"],
179
  "thanks": ["αŠ αˆ˜αˆ°αŒαŠ“αˆˆαˆ", "thank"],
@@ -192,7 +224,10 @@ LANGUAGES = {
192
  "emoji": "🌍",
193
  "speakers": "1.5B",
194
  "region": "Global",
195
- "tts_model": "facebook/mms-tts-eng",
 
 
 
196
  "keywords": {
197
  "greeting": ["hello", "hi", "hey", "greetings"],
198
  "thanks": ["thank", "thanks", "appreciate"],
@@ -212,19 +247,13 @@ LANGUAGES = {
212
  conversation_history = []
213
  model_cache = {}
214
 
215
- def load_tts_model(language_name):
216
- """Load TTS model for the specified language."""
217
- if language_name not in LANGUAGES:
218
- return None
219
-
220
- lang_config = LANGUAGES[language_name]
221
- model_id = lang_config["tts_model"]
222
-
223
  if model_id in model_cache:
224
  return model_cache[model_id]
225
 
226
  try:
227
- print(f"Loading TTS model for {language_name}: {model_id}")
228
  synthesizer = pipeline(
229
  "text-to-speech",
230
  model=model_id,
@@ -255,7 +284,6 @@ def generate_response(text, language):
255
  lang_config = LANGUAGES.get(language, {})
256
  responses = lang_config.get("responses", {})
257
 
258
- # Detect intent
259
  intent = detect_intent(text, language)
260
  response = responses.get(intent, responses.get("default", "I understand."))
261
 
@@ -264,17 +292,17 @@ def generate_response(text, language):
264
  print(f"Error generating response: {e}")
265
  return "I understand. Can you say more?"
266
 
267
- def synthesize_speech(text, language):
268
- """Convert text to speech using HF models."""
269
  if not text or not text.strip():
270
  return None
271
 
272
  try:
273
- synthesizer = load_tts_model(language)
274
  if synthesizer is None:
275
  return None
276
 
277
- print(f"Generating speech for: {text[:50]}...")
278
  speech = synthesizer(text)
279
 
280
  audio_array = np.array(speech["audio"]).flatten()
@@ -289,36 +317,32 @@ def synthesize_speech(text, language):
289
  print(f"Error synthesizing: {e}")
290
  return None
291
 
292
- def process_text_input(text, language):
293
  """Process text input: generate response -> synthesize."""
294
  try:
295
  if not text:
296
  return None, "Please enter some text!", ""
297
 
298
- # Generate response
299
  response_text = generate_response(text, language)
300
  if response_text is None:
301
  return None, "Error generating response", ""
302
 
303
- # Synthesize response
304
- audio_output = synthesize_speech(response_text, language)
305
 
306
- # Add to history
307
  conversation_history.append({
308
  "user": text,
309
  "agent": response_text,
310
  "language": language,
 
311
  "timestamp": datetime.now().strftime("%H:%M:%S")
312
  })
313
 
314
- # Format history
315
  history_text = ""
316
  for msg in conversation_history[-5:]:
317
  history_text += f"[{msg['timestamp']}] {msg['language']}\n"
318
  history_text += f"You: {msg['user']}\n"
319
  history_text += f"Agent: {msg['agent']}\n\n"
320
 
321
- status = "βœ… Speech generated!" if audio_output else "⚠️ Text response only"
322
  return audio_output, response_text, history_text
323
  except Exception as e:
324
  print(f"Error processing: {e}")
@@ -335,7 +359,7 @@ def create_interface():
335
  gr.Markdown("""
336
  # 🌍 JamboGPT - African Language AI Voice Agent
337
 
338
- **Chat with AI in 10 African languages with voice responses**
339
 
340
  Swahili β€’ Kikuyu β€’ Yoruba β€’ Hausa β€’ Amharic β€’ Fon β€’ Oromo β€’ Somali β€’ Tigrinya β€’ English
341
  """)
@@ -354,13 +378,37 @@ def create_interface():
354
  f"πŸ‡°πŸ‡ͺ **Swahili** β€’ 100M+ speakers β€’ East Africa"
355
  )
356
 
 
 
 
 
 
 
 
 
 
 
 
357
  def update_language_info(language):
358
  if language in LANGUAGES:
359
  lang_data = LANGUAGES[language]
360
- return f"{lang_data['emoji']} **{language}** β€’ {lang_data['speakers']} speakers β€’ {lang_data['region']}"
361
- return ""
 
 
 
 
 
 
 
 
 
362
 
363
- language_choice.change(update_language_info, inputs=language_choice, outputs=language_info)
 
 
 
 
364
 
365
  # Text input
366
  text_input = gr.Textbox(
@@ -391,7 +439,6 @@ def create_interface():
391
  interactive=False
392
  )
393
 
394
- # Conversation history
395
  history_display = gr.Textbox(
396
  label="πŸ“ Conversation History",
397
  interactive=False,
@@ -402,7 +449,7 @@ def create_interface():
402
  # Connect process button
403
  process_btn.click(
404
  fn=process_text_input,
405
- inputs=[text_input, language_choice],
406
  outputs=[audio_output, agent_response, history_display]
407
  )
408
 
@@ -424,7 +471,7 @@ def create_interface():
424
  ---
425
  **JamboGPT** - Making AI Accessible to African Languages
426
 
427
- πŸ”— [GitHub](https://github.com/stano03/jambogpt) | πŸ“Š [Dataset](https://huggingface.co/datasets/stano03/jambogpt-real-dataset) | πŸ€– [Model](https://huggingface.co/stano03/jambogpt-swahili-tts-v1)
428
  """)
429
 
430
  return demo
 
1
  #!/usr/bin/env python3
2
  """
3
  JamboGPT - African Language AI Voice Agent
4
+ Multiple TTS Models for Kiswahili & Kikuyu
5
  """
6
 
7
  import gradio as gr
 
15
  # Set device
16
  device = "cuda" if torch.cuda.is_available() else "cpu"
17
 
18
+ # Language configurations with multiple TTS models
19
  LANGUAGES = {
20
  "Swahili": {
21
  "emoji": "πŸ‡°πŸ‡ͺ",
22
  "speakers": "100M+",
23
  "region": "East Africa",
24
+ "tts_models": [
25
+ ("Benjamin-png/swahili-mms-tts-finetuned", "🌟 Benjamin TTS (Best Quality)"),
26
+ ("facebook/mms-tts-swh", "Meta MMS Swahili"),
27
+ ("multilingual-tts/F5-TTS-OpenBible-Swahili", "F5 TTS OpenBible"),
28
+ ("stano03/jambogpt-swahili-tts-v1", "JamboGPT Custom Model"),
29
+ ],
30
+ "default_model": "Benjamin-png/swahili-mms-tts-finetuned",
31
  "keywords": {
32
  "greeting": ["habari", "jambo", "salaam", "hello", "hi"],
33
  "thanks": ["asante", "thank", "shukran"],
 
46
  "emoji": "πŸ‡°πŸ‡ͺ",
47
  "speakers": "7M",
48
  "region": "Kenya",
49
+ "tts_models": [
50
+ ("multilingual-tts/F5-TTS-OpenBible-Kikuyu", "🌟 F5 TTS OpenBible (Best)"),
51
+ ("facebook/mms-tts-kin", "Meta MMS Kikuyu"),
52
+ ("multilingual-tts/VITS-OpenBible-Kikuyu", "VITS OpenBible"),
53
+ ],
54
+ "default_model": "multilingual-tts/F5-TTS-OpenBible-Kikuyu",
55
  "keywords": {
56
  "greeting": ["wΔ©", "mwega", "hello", "hi", "salaam"],
57
  "thanks": ["mwega", "thank", "asante"],
 
70
  "emoji": "πŸ‡³πŸ‡¬",
71
  "speakers": "45M",
72
  "region": "West Africa",
73
+ "tts_models": [
74
+ ("facebook/mms-tts-yor", "Meta MMS Yoruba"),
75
+ ],
76
+ "default_model": "facebook/mms-tts-yor",
77
  "keywords": {
78
  "greeting": ["pele", "hello", "hi", "bawo"],
79
  "thanks": ["e ku", "thank", "ope"],
 
92
  "emoji": "πŸ‡³πŸ‡¬",
93
  "speakers": "90M",
94
  "region": "West Africa",
95
+ "tts_models": [
96
+ ("facebook/mms-tts-hau", "Meta MMS Hausa"),
97
+ ],
98
+ "default_model": "facebook/mms-tts-hau",
99
  "keywords": {
100
  "greeting": ["sannu", "hello", "hi", "ina"],
101
  "thanks": ["nagode", "thank"],
 
114
  "emoji": "πŸ‡ͺπŸ‡Ή",
115
  "speakers": "32M",
116
  "region": "Horn of Africa",
117
+ "tts_models": [
118
+ ("facebook/mms-tts-amh", "Meta MMS Amharic"),
119
+ ],
120
+ "default_model": "facebook/mms-tts-amh",
121
  "keywords": {
122
  "greeting": ["αˆ°αˆ‹αˆ", "hello", "hi", "αˆ³αˆ‹αˆ"],
123
  "thanks": ["αŠ αˆ˜αˆ°αŒαŠ“αˆˆαˆ", "thank"],
 
136
  "emoji": "πŸ‡§πŸ‡―",
137
  "speakers": "2M",
138
  "region": "West Africa",
139
+ "tts_models": [
140
+ ("facebook/mms-tts-fon", "Meta MMS Fon"),
141
+ ],
142
+ "default_model": "facebook/mms-tts-fon",
143
  "keywords": {
144
  "greeting": ["bonjour", "hello", "hi"],
145
  "thanks": ["merci", "thank"],
 
158
  "emoji": "πŸ‡ͺπŸ‡Ή",
159
  "speakers": "40M",
160
  "region": "East Africa",
161
+ "tts_models": [
162
+ ("facebook/mms-tts-orm", "Meta MMS Oromo"),
163
+ ],
164
+ "default_model": "facebook/mms-tts-orm",
165
  "keywords": {
166
  "greeting": ["salaam", "hello", "hi"],
167
  "thanks": ["galataa", "thank"],
 
180
  "emoji": "πŸ‡ΈπŸ‡΄",
181
  "speakers": "20M",
182
  "region": "East Africa",
183
+ "tts_models": [
184
+ ("facebook/mms-tts-som", "Meta MMS Somali"),
185
+ ],
186
+ "default_model": "facebook/mms-tts-som",
187
  "keywords": {
188
  "greeting": ["salaam", "hello", "hi"],
189
  "thanks": ["mahadsanid", "thank"],
 
202
  "emoji": "πŸ‡ͺπŸ‡·",
203
  "speakers": "7M",
204
  "region": "Horn of Africa",
205
+ "tts_models": [
206
+ ("facebook/mms-tts-tir", "Meta MMS Tigrinya"),
207
+ ],
208
+ "default_model": "facebook/mms-tts-tir",
209
  "keywords": {
210
  "greeting": ["αˆ°αˆ‹αˆ", "hello", "hi"],
211
  "thanks": ["αŠ αˆ˜αˆ°αŒαŠ“αˆˆαˆ", "thank"],
 
224
  "emoji": "🌍",
225
  "speakers": "1.5B",
226
  "region": "Global",
227
+ "tts_models": [
228
+ ("facebook/mms-tts-eng", "Meta MMS English"),
229
+ ],
230
+ "default_model": "facebook/mms-tts-eng",
231
  "keywords": {
232
  "greeting": ["hello", "hi", "hey", "greetings"],
233
  "thanks": ["thank", "thanks", "appreciate"],
 
247
  conversation_history = []
248
  model_cache = {}
249
 
250
+ def load_tts_model(model_id):
251
+ """Load TTS model."""
 
 
 
 
 
 
252
  if model_id in model_cache:
253
  return model_cache[model_id]
254
 
255
  try:
256
+ print(f"Loading TTS model: {model_id}")
257
  synthesizer = pipeline(
258
  "text-to-speech",
259
  model=model_id,
 
284
  lang_config = LANGUAGES.get(language, {})
285
  responses = lang_config.get("responses", {})
286
 
 
287
  intent = detect_intent(text, language)
288
  response = responses.get(intent, responses.get("default", "I understand."))
289
 
 
292
  print(f"Error generating response: {e}")
293
  return "I understand. Can you say more?"
294
 
295
+ def synthesize_speech(text, language, model_name):
296
+ """Convert text to speech using selected model."""
297
  if not text or not text.strip():
298
  return None
299
 
300
  try:
301
+ synthesizer = load_tts_model(model_name)
302
  if synthesizer is None:
303
  return None
304
 
305
+ print(f"Generating speech with {model_name}: {text[:50]}...")
306
  speech = synthesizer(text)
307
 
308
  audio_array = np.array(speech["audio"]).flatten()
 
317
  print(f"Error synthesizing: {e}")
318
  return None
319
 
320
+ def process_text_input(text, language, tts_model):
321
  """Process text input: generate response -> synthesize."""
322
  try:
323
  if not text:
324
  return None, "Please enter some text!", ""
325
 
 
326
  response_text = generate_response(text, language)
327
  if response_text is None:
328
  return None, "Error generating response", ""
329
 
330
+ audio_output = synthesize_speech(response_text, language, tts_model)
 
331
 
 
332
  conversation_history.append({
333
  "user": text,
334
  "agent": response_text,
335
  "language": language,
336
+ "model": tts_model,
337
  "timestamp": datetime.now().strftime("%H:%M:%S")
338
  })
339
 
 
340
  history_text = ""
341
  for msg in conversation_history[-5:]:
342
  history_text += f"[{msg['timestamp']}] {msg['language']}\n"
343
  history_text += f"You: {msg['user']}\n"
344
  history_text += f"Agent: {msg['agent']}\n\n"
345
 
 
346
  return audio_output, response_text, history_text
347
  except Exception as e:
348
  print(f"Error processing: {e}")
 
359
  gr.Markdown("""
360
  # 🌍 JamboGPT - African Language AI Voice Agent
361
 
362
+ **Chat with AI in 10 African languages with multiple voice options**
363
 
364
  Swahili β€’ Kikuyu β€’ Yoruba β€’ Hausa β€’ Amharic β€’ Fon β€’ Oromo β€’ Somali β€’ Tigrinya β€’ English
365
  """)
 
378
  f"πŸ‡°πŸ‡ͺ **Swahili** β€’ 100M+ speakers β€’ East Africa"
379
  )
380
 
381
+ # TTS Model selector (dynamic based on language)
382
+ tts_model_choice = gr.Dropdown(
383
+ choices=[("🌟 Benjamin TTS (Best Quality)", "Benjamin-png/swahili-mms-tts-finetuned"),
384
+ ("Meta MMS Swahili", "facebook/mms-tts-swh"),
385
+ ("F5 TTS OpenBible", "multilingual-tts/F5-TTS-OpenBible-Swahili"),
386
+ ("JamboGPT Custom Model", "stano03/jambogpt-swahili-tts-v1")],
387
+ value="Benjamin-png/swahili-mms-tts-finetuned",
388
+ label="Select Voice Model",
389
+ interactive=True
390
+ )
391
+
392
  def update_language_info(language):
393
  if language in LANGUAGES:
394
  lang_data = LANGUAGES[language]
395
+ models = lang_data.get("tts_models", [])
396
+
397
+ # Update language info
398
+ info_text = f"{lang_data['emoji']} **{language}** β€’ {lang_data['speakers']} speakers β€’ {lang_data['region']}"
399
+
400
+ # Update model choices
401
+ model_choices = models
402
+ default_model = lang_data.get("default_model", models[0][0])
403
+
404
+ return info_text, gr.Dropdown(choices=model_choices, value=default_model)
405
+ return "", gr.Dropdown(choices=[])
406
 
407
+ language_choice.change(
408
+ update_language_info,
409
+ inputs=language_choice,
410
+ outputs=[language_info, tts_model_choice]
411
+ )
412
 
413
  # Text input
414
  text_input = gr.Textbox(
 
439
  interactive=False
440
  )
441
 
 
442
  history_display = gr.Textbox(
443
  label="πŸ“ Conversation History",
444
  interactive=False,
 
449
  # Connect process button
450
  process_btn.click(
451
  fn=process_text_input,
452
+ inputs=[text_input, language_choice, tts_model_choice],
453
  outputs=[audio_output, agent_response, history_display]
454
  )
455
 
 
471
  ---
472
  **JamboGPT** - Making AI Accessible to African Languages
473
 
474
+ πŸ”— [GitHub](https://github.com/stano03/jambogpt) | πŸ“Š [Dataset](https://huggingface.co/datasets/stano03/jambogpt-real-dataset) | πŸ€– [Models](https://huggingface.co/stano03)
475
  """)
476
 
477
  return demo