JamboGPT Bot committed on
Commit
504b4e1
·
1 Parent(s): dd6ced7

Add free Hugging Face TTS models for voice generation

Browse files
Files changed (1) hide show
  1. app.py +117 -31
app.py CHANGED
@@ -1,18 +1,27 @@
1
  #!/usr/bin/env python3
2
  """
3
  JamboGPT - African Language AI Voice Agent
4
- Simple, Lightweight Version for Hugging Face Spaces
5
  """
6
 
7
  import gradio as gr
8
  from datetime import datetime
 
 
 
 
 
9
 
10
- # Language configurations
 
 
 
11
  LANGUAGES = {
12
  "Swahili": {
13
  "emoji": "πŸ‡°πŸ‡ͺ",
14
  "speakers": "100M+",
15
  "region": "East Africa",
 
16
  "keywords": {
17
  "greeting": ["habari", "jambo", "salaam", "hello", "hi"],
18
  "thanks": ["asante", "thank", "shukran"],
@@ -31,6 +40,7 @@ LANGUAGES = {
31
  "emoji": "πŸ‡°πŸ‡ͺ",
32
  "speakers": "7M",
33
  "region": "Kenya",
 
34
  "keywords": {
35
  "greeting": ["wΔ©", "mwega", "hello", "hi", "salaam"],
36
  "thanks": ["mwega", "thank", "asante"],
@@ -49,6 +59,7 @@ LANGUAGES = {
49
  "emoji": "πŸ‡³πŸ‡¬",
50
  "speakers": "45M",
51
  "region": "West Africa",
 
52
  "keywords": {
53
  "greeting": ["pele", "hello", "hi", "bawo"],
54
  "thanks": ["e ku", "thank", "ope"],
@@ -67,6 +78,7 @@ LANGUAGES = {
67
  "emoji": "πŸ‡³πŸ‡¬",
68
  "speakers": "90M",
69
  "region": "West Africa",
 
70
  "keywords": {
71
  "greeting": ["sannu", "hello", "hi", "ina"],
72
  "thanks": ["nagode", "thank"],
@@ -85,6 +97,7 @@ LANGUAGES = {
85
  "emoji": "πŸ‡ͺπŸ‡Ή",
86
  "speakers": "32M",
87
  "region": "Horn of Africa",
 
88
  "keywords": {
89
  "greeting": ["αˆ°αˆ‹αˆ", "hello", "hi", "αˆ³αˆ‹αˆ"],
90
  "thanks": ["αŠ αˆ˜αˆ°αŒαŠ“αˆˆαˆ", "thank"],
@@ -103,6 +116,7 @@ LANGUAGES = {
103
  "emoji": "πŸ‡§πŸ‡―",
104
  "speakers": "2M",
105
  "region": "West Africa",
 
106
  "keywords": {
107
  "greeting": ["bonjour", "hello", "hi"],
108
  "thanks": ["merci", "thank"],
@@ -121,6 +135,7 @@ LANGUAGES = {
121
  "emoji": "πŸ‡ͺπŸ‡Ή",
122
  "speakers": "40M",
123
  "region": "East Africa",
 
124
  "keywords": {
125
  "greeting": ["salaam", "hello", "hi"],
126
  "thanks": ["galataa", "thank"],
@@ -139,6 +154,7 @@ LANGUAGES = {
139
  "emoji": "πŸ‡ΈπŸ‡΄",
140
  "speakers": "20M",
141
  "region": "East Africa",
 
142
  "keywords": {
143
  "greeting": ["salaam", "hello", "hi"],
144
  "thanks": ["mahadsanid", "thank"],
@@ -157,6 +173,7 @@ LANGUAGES = {
157
  "emoji": "πŸ‡ͺπŸ‡·",
158
  "speakers": "7M",
159
  "region": "Horn of Africa",
 
160
  "keywords": {
161
  "greeting": ["αˆ°αˆ‹αˆ", "hello", "hi"],
162
  "thanks": ["αŠ αˆ˜αˆ°αŒαŠ“αˆˆαˆ", "thank"],
@@ -175,6 +192,7 @@ LANGUAGES = {
175
  "emoji": "🌍",
176
  "speakers": "1.5B",
177
  "region": "Global",
 
178
  "keywords": {
179
  "greeting": ["hello", "hi", "hey", "greetings"],
180
  "thanks": ["thank", "thanks", "appreciate"],
@@ -192,6 +210,31 @@ LANGUAGES = {
192
  }
193
 
194
  conversation_history = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
 
196
  def detect_intent(text, language):
197
  """Detect user intent from text."""
@@ -216,31 +259,83 @@ def generate_response(text, language):
216
  intent = detect_intent(text, language)
217
  response = responses.get(intent, responses.get("default", "I understand."))
218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  # Add to history
220
  conversation_history.append({
221
  "user": text,
222
- "agent": response,
223
  "language": language,
224
  "timestamp": datetime.now().strftime("%H:%M:%S")
225
  })
226
 
227
- return response
 
 
 
 
 
 
 
 
228
  except Exception as e:
229
- print(f"Error generating response: {e}")
230
- return "I understand. Can you say more?"
231
 
232
  def create_interface():
233
  """Create the voice agent interface."""
234
 
235
  with gr.Blocks(
236
- title="JamboGPT - African Language AI",
237
  theme=gr.themes.Soft(primary_hue="purple")
238
  ) as demo:
239
 
240
  gr.Markdown("""
241
- # 🌍 JamboGPT - African Language AI
242
 
243
- **Chat with AI in 10 African languages**
244
 
245
  Swahili β€’ Kikuyu β€’ Yoruba β€’ Hausa β€’ Amharic β€’ Fon β€’ Oromo β€’ Somali β€’ Tigrinya β€’ English
246
  """)
@@ -290,6 +385,12 @@ def create_interface():
290
  placeholder="The agent's response will appear here"
291
  )
292
 
 
 
 
 
 
 
293
  # Conversation history
294
  history_display = gr.Textbox(
295
  label="πŸ“ Conversation History",
@@ -299,25 +400,10 @@ def create_interface():
299
  )
300
 
301
  # Connect process button
302
- def process_input(text, language):
303
- if not text:
304
- return "Please enter some text!", ""
305
-
306
- response = generate_response(text, language)
307
-
308
- # Format history
309
- history_text = ""
310
- for msg in conversation_history[-5:]: # Show last 5 messages
311
- history_text += f"[{msg['timestamp']}] {msg['language']}\n"
312
- history_text += f"You: {msg['user']}\n"
313
- history_text += f"Agent: {msg['agent']}\n\n"
314
-
315
- return response, history_text
316
-
317
  process_btn.click(
318
- fn=process_input,
319
  inputs=[text_input, language_choice],
320
- outputs=[agent_response, history_display]
321
  )
322
 
323
  # Examples
@@ -329,8 +415,8 @@ def create_interface():
329
  ["Hello, how are you?", "English"],
330
  ],
331
  inputs=[text_input, language_choice],
332
- outputs=[agent_response],
333
- fn=process_input,
334
  cache_examples=False,
335
  )
336
 
@@ -338,17 +424,17 @@ def create_interface():
338
  ---
339
  **JamboGPT** - Making AI Accessible to African Languages
340
 
341
- πŸ”— [GitHub](https://github.com/stano03/jambogpt) | πŸ“Š [Dataset](https://huggingface.co/datasets/stano03/jambogpt-real-dataset)
342
  """)
343
 
344
  return demo
345
 
346
  if __name__ == "__main__":
347
- print("πŸš€ Creating JamboGPT Interface...")
348
  demo = create_interface()
349
 
350
  print("=" * 50)
351
- print("βœ… JamboGPT is ready!")
352
  print("=" * 50)
353
 
354
  demo.launch(
 
1
  #!/usr/bin/env python3
2
  """
3
  JamboGPT - African Language AI Voice Agent
4
+ Using Free Hugging Face TTS Models
5
  """
6
 
7
  import gradio as gr
8
  from datetime import datetime
9
+ import torch
10
+ from transformers import pipeline
11
+ import numpy as np
12
+ from scipy.io import wavfile
13
+ import tempfile
14
 
15
+ # Set device
16
+ device = "cuda" if torch.cuda.is_available() else "cpu"
17
+
18
+ # Language configurations with free HF TTS models
19
  LANGUAGES = {
20
  "Swahili": {
21
  "emoji": "πŸ‡°πŸ‡ͺ",
22
  "speakers": "100M+",
23
  "region": "East Africa",
24
+ "tts_model": "facebook/mms-tts-swh",
25
  "keywords": {
26
  "greeting": ["habari", "jambo", "salaam", "hello", "hi"],
27
  "thanks": ["asante", "thank", "shukran"],
 
40
  "emoji": "πŸ‡°πŸ‡ͺ",
41
  "speakers": "7M",
42
  "region": "Kenya",
43
+ "tts_model": "facebook/mms-tts-kin",
44
  "keywords": {
45
  "greeting": ["wΔ©", "mwega", "hello", "hi", "salaam"],
46
  "thanks": ["mwega", "thank", "asante"],
 
59
  "emoji": "πŸ‡³πŸ‡¬",
60
  "speakers": "45M",
61
  "region": "West Africa",
62
+ "tts_model": "facebook/mms-tts-yor",
63
  "keywords": {
64
  "greeting": ["pele", "hello", "hi", "bawo"],
65
  "thanks": ["e ku", "thank", "ope"],
 
78
  "emoji": "πŸ‡³πŸ‡¬",
79
  "speakers": "90M",
80
  "region": "West Africa",
81
+ "tts_model": "facebook/mms-tts-hau",
82
  "keywords": {
83
  "greeting": ["sannu", "hello", "hi", "ina"],
84
  "thanks": ["nagode", "thank"],
 
97
  "emoji": "πŸ‡ͺπŸ‡Ή",
98
  "speakers": "32M",
99
  "region": "Horn of Africa",
100
+ "tts_model": "facebook/mms-tts-amh",
101
  "keywords": {
102
  "greeting": ["αˆ°αˆ‹αˆ", "hello", "hi", "αˆ³αˆ‹αˆ"],
103
  "thanks": ["αŠ αˆ˜αˆ°αŒαŠ“αˆˆαˆ", "thank"],
 
116
  "emoji": "πŸ‡§πŸ‡―",
117
  "speakers": "2M",
118
  "region": "West Africa",
119
+ "tts_model": "facebook/mms-tts-fon",
120
  "keywords": {
121
  "greeting": ["bonjour", "hello", "hi"],
122
  "thanks": ["merci", "thank"],
 
135
  "emoji": "πŸ‡ͺπŸ‡Ή",
136
  "speakers": "40M",
137
  "region": "East Africa",
138
+ "tts_model": "facebook/mms-tts-orm",
139
  "keywords": {
140
  "greeting": ["salaam", "hello", "hi"],
141
  "thanks": ["galataa", "thank"],
 
154
  "emoji": "πŸ‡ΈπŸ‡΄",
155
  "speakers": "20M",
156
  "region": "East Africa",
157
+ "tts_model": "facebook/mms-tts-som",
158
  "keywords": {
159
  "greeting": ["salaam", "hello", "hi"],
160
  "thanks": ["mahadsanid", "thank"],
 
173
  "emoji": "πŸ‡ͺπŸ‡·",
174
  "speakers": "7M",
175
  "region": "Horn of Africa",
176
+ "tts_model": "facebook/mms-tts-tir",
177
  "keywords": {
178
  "greeting": ["αˆ°αˆ‹αˆ", "hello", "hi"],
179
  "thanks": ["αŠ αˆ˜αˆ°αŒαŠ“αˆˆαˆ", "thank"],
 
192
  "emoji": "🌍",
193
  "speakers": "1.5B",
194
  "region": "Global",
195
+ "tts_model": "facebook/mms-tts-eng",
196
  "keywords": {
197
  "greeting": ["hello", "hi", "hey", "greetings"],
198
  "thanks": ["thank", "thanks", "appreciate"],
 
210
  }
211
 
212
  conversation_history = []
213
+ model_cache = {}
214
+
215
def load_tts_model(language_name):
    """Return the text-to-speech pipeline configured for ``language_name``.

    The model id is read from ``LANGUAGES[language_name]["tts_model"]``.
    Pipelines are memoised in the module-level ``model_cache`` dict, keyed
    by model id, so each model is only constructed once per process.

    Returns ``None`` when the language is not in ``LANGUAGES`` or when the
    pipeline cannot be created. Failures are not cached, so a later call
    will try to load the model again.
    """
    if language_name not in LANGUAGES:
        return None

    model_id = LANGUAGES[language_name]["tts_model"]

    # Serve from the cache when this model has already been loaded.
    try:
        return model_cache[model_id]
    except KeyError:
        pass

    try:
        print(f"Loading TTS model for {language_name}: {model_id}")
        # Hand the pipeline "cuda" when the module-level device is CUDA,
        # otherwise -1 (the pipeline's CPU setting).
        device_arg = "cuda" if device == "cuda" else -1
        tts_pipeline = pipeline(
            "text-to-speech",
            model=model_id,
            device=device_arg,
        )
        model_cache[model_id] = tts_pipeline
        return tts_pipeline
    except Exception as e:
        print(f"Error loading model {model_id}: {e}")
        return None
238
 
239
  def detect_intent(text, language):
240
  """Detect user intent from text."""
 
259
  intent = detect_intent(text, language)
260
  response = responses.get(intent, responses.get("default", "I understand."))
261
 
262
+ return response
263
+ except Exception as e:
264
+ print(f"Error generating response: {e}")
265
+ return "I understand. Can you say more?"
266
+
267
def synthesize_speech(text, language):
    """Convert ``text`` to speech and return the path of a WAV file.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only text
            yields ``None`` without loading any model.
        language: A language name understood by ``load_tts_model``.

    Returns:
        The path of a newly created temporary ``.wav`` file holding 16-bit
        PCM audio, or ``None`` when there is nothing to say, no model is
        available, or synthesis fails. The file is created with
        ``delete=False``, so it is not removed automatically.
    """
    if not text or not text.strip():
        return None

    try:
        synthesizer = load_tts_model(language)
        if synthesizer is None:
            return None

        print(f"Generating speech for: {text[:50]}...")
        speech = synthesizer(text)

        audio_array = np.array(speech["audio"]).flatten()
        sample_rate = speech["sampling_rate"]

        # Clip before scaling: a sample outside [-1, 1] would otherwise wrap
        # around when cast to int16 and produce loud clicks.
        pcm = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)

        # Reserve a unique file name, then close the handle before writing.
        # Writing by name while the handle is still open fails on Windows.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            temp_path = f.name
        wavfile.write(temp_path, sample_rate, pcm)

        return temp_path
    except Exception as e:
        # Best-effort: the caller falls back to a text-only response.
        print(f"Error synthesizing: {e}")
        return None
291
+
292
def process_text_input(text, language):
    """Handle one user turn: generate a reply, voice it, and update history.

    Args:
        text: The user's message. ``None``, empty, or whitespace-only input
            is rejected with a prompt to enter text.
        language: The selected language name; passed to
            ``generate_response`` and ``synthesize_speech`` and recorded in
            the history entry.

    Returns:
        A 3-tuple ``(audio_path, response_text, history_text)``.
        ``audio_path`` is whatever ``synthesize_speech`` returned (``None``
        when no audio was produced). ``history_text`` is the formatted last
        five turns of ``conversation_history``. On rejected input or on any
        error, ``audio_path`` is ``None`` and ``history_text`` is ``""``.
    """
    try:
        # Treat whitespace-only input like empty input, matching the check
        # synthesize_speech applies to the text it is given.
        if not text or not text.strip():
            return None, "Please enter some text!", ""

        # Generate response
        response_text = generate_response(text, language)
        if response_text is None:
            return None, "Error generating response", ""

        # Synthesize response (None means a text-only reply)
        audio_output = synthesize_speech(response_text, language)

        # Add to history
        conversation_history.append({
            "user": text,
            "agent": response_text,
            "language": language,
            "timestamp": datetime.now().strftime("%H:%M:%S"),
        })

        # Format the five most recent turns for display
        history_text = "".join(
            f"[{msg['timestamp']}] {msg['language']}\n"
            f"You: {msg['user']}\n"
            f"Agent: {msg['agent']}\n\n"
            for msg in conversation_history[-5:]
        )

        return audio_output, response_text, history_text
    except Exception as e:
        print(f"Error processing: {e}")
        return None, f"Error: {str(e)}", ""
326
 
327
  def create_interface():
328
  """Create the voice agent interface."""
329
 
330
  with gr.Blocks(
331
+ title="JamboGPT - African Language AI Voice Agent",
332
  theme=gr.themes.Soft(primary_hue="purple")
333
  ) as demo:
334
 
335
  gr.Markdown("""
336
+ # 🌍 JamboGPT - African Language AI Voice Agent
337
 
338
+ **Chat with AI in 10 African languages with voice responses**
339
 
340
  Swahili β€’ Kikuyu β€’ Yoruba β€’ Hausa β€’ Amharic β€’ Fon β€’ Oromo β€’ Somali β€’ Tigrinya β€’ English
341
  """)
 
385
  placeholder="The agent's response will appear here"
386
  )
387
 
388
+ audio_output = gr.Audio(
389
+ label="πŸ”Š Agent Voice",
390
+ type="filepath",
391
+ interactive=False
392
+ )
393
+
394
  # Conversation history
395
  history_display = gr.Textbox(
396
  label="πŸ“ Conversation History",
 
400
  )
401
 
402
  # Connect process button
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  process_btn.click(
404
+ fn=process_text_input,
405
  inputs=[text_input, language_choice],
406
+ outputs=[audio_output, agent_response, history_display]
407
  )
408
 
409
  # Examples
 
415
  ["Hello, how are you?", "English"],
416
  ],
417
  inputs=[text_input, language_choice],
418
+ outputs=[audio_output, agent_response],
419
+ fn=process_text_input,
420
  cache_examples=False,
421
  )
422
 
 
424
  ---
425
  **JamboGPT** - Making AI Accessible to African Languages
426
 
427
+ πŸ”— [GitHub](https://github.com/stano03/jambogpt) | πŸ“Š [Dataset](https://huggingface.co/datasets/stano03/jambogpt-real-dataset) | πŸ€– [Model](https://huggingface.co/stano03/jambogpt-swahili-tts-v1)
428
  """)
429
 
430
  return demo
431
 
432
  if __name__ == "__main__":
433
+ print("πŸš€ Creating JamboGPT Voice Agent Interface...")
434
  demo = create_interface()
435
 
436
  print("=" * 50)
437
+ print("βœ… JamboGPT Voice Agent is ready!")
438
  print("=" * 50)
439
 
440
  demo.launch(