JamboGPT Bot committed on
Commit ·
2e235f6
1
Parent(s): eab1e34
π€ Feature: AI Voice Agent for Kiswahili and Kikuyu with speech recognition and synthesis
Browse files
app.py
CHANGED
|
@@ -1,105 +1,72 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
JamboGPT - African Language AI
|
|
|
|
| 4 |
Inspired by Yarn GPT's clean, professional design.
|
| 5 |
-
A Gradio-based application for Text-to-Speech in African languages.
|
| 6 |
"""
|
| 7 |
|
| 8 |
import os
|
| 9 |
import gradio as gr
|
| 10 |
import torch
|
| 11 |
-
|
|
|
|
| 12 |
import numpy as np
|
| 13 |
from scipy.io import wavfile
|
| 14 |
import tempfile
|
| 15 |
-
import json
|
| 16 |
from datetime import datetime
|
|
|
|
| 17 |
|
| 18 |
# Set device
|
| 19 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 20 |
-
print(f"π Starting JamboGPT - African Language AI")
|
| 21 |
print(f"Using device: {device}")
|
| 22 |
print("=" * 50)
|
| 23 |
|
| 24 |
# Language configurations
|
| 25 |
LANGUAGES = {
|
| 26 |
-
"
|
| 27 |
"code": "swh",
|
| 28 |
"tts_model": "facebook/mms-tts-swh",
|
| 29 |
"emoji": "🇰🇪",
|
| 30 |
"speakers": "100M+",
|
| 31 |
-
"region": "East Africa"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
},
|
| 33 |
"Kikuyu": {
|
| 34 |
"code": "ki",
|
| 35 |
"tts_model": "BrianMwangi/African-Kikuyu-TTS",
|
| 36 |
"emoji": "🇰🇪",
|
| 37 |
"speakers": "7M",
|
| 38 |
-
"region": "Kenya"
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
"
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
"region": "West Africa"
|
| 53 |
-
},
|
| 54 |
-
"Amharic": {
|
| 55 |
-
"code": "amh",
|
| 56 |
-
"tts_model": "facebook/mms-tts-amh",
|
| 57 |
-
"emoji": "🇪🇹",
|
| 58 |
-
"speakers": "32M",
|
| 59 |
-
"region": "Ethiopia"
|
| 60 |
-
},
|
| 61 |
-
"Fon": {
|
| 62 |
-
"code": "fon",
|
| 63 |
-
"tts_model": "facebook/mms-tts-fon",
|
| 64 |
-
"emoji": "🇧🇯",
|
| 65 |
-
"speakers": "2M",
|
| 66 |
-
"region": "Benin, Togo"
|
| 67 |
-
},
|
| 68 |
-
"Oromo": {
|
| 69 |
-
"code": "orm",
|
| 70 |
-
"tts_model": "facebook/mms-tts-orm",
|
| 71 |
-
"emoji": "🇪🇹",
|
| 72 |
-
"speakers": "40M",
|
| 73 |
-
"region": "Ethiopia, Kenya"
|
| 74 |
-
},
|
| 75 |
-
"Somali": {
|
| 76 |
-
"code": "som",
|
| 77 |
-
"tts_model": "facebook/mms-tts-som",
|
| 78 |
-
"emoji": "🇸🇴",
|
| 79 |
-
"speakers": "20M",
|
| 80 |
-
"region": "East Africa"
|
| 81 |
-
},
|
| 82 |
-
"Tigrinya": {
|
| 83 |
-
"code": "tir",
|
| 84 |
-
"tts_model": "facebook/mms-tts-tir",
|
| 85 |
-
"emoji": "🇪🇷",
|
| 86 |
-
"speakers": "7M",
|
| 87 |
-
"region": "Horn of Africa"
|
| 88 |
-
},
|
| 89 |
-
"English": {
|
| 90 |
-
"code": "eng",
|
| 91 |
-
"tts_model": "facebook/mms-tts-eng",
|
| 92 |
-
"emoji": "π",
|
| 93 |
-
"speakers": "1.5B",
|
| 94 |
-
"region": "Global"
|
| 95 |
-
},
|
| 96 |
}
|
| 97 |
|
| 98 |
# Cache for loaded models
|
| 99 |
model_cache = {}
|
| 100 |
-
|
| 101 |
-
# History storage
|
| 102 |
-
history = []
|
| 103 |
|
| 104 |
# CSS inspired by Yarn GPT
|
| 105 |
CUSTOM_CSS = """
|
|
@@ -271,7 +238,7 @@ body {
|
|
| 271 |
|
| 272 |
textarea {
|
| 273 |
width: 100% !important;
|
| 274 |
-
min-height:
|
| 275 |
padding: 16px !important;
|
| 276 |
border: 1px solid #d0d0d0 !important;
|
| 277 |
border-radius: 6px !important;
|
|
@@ -295,6 +262,7 @@ textarea:focus {
|
|
| 295 |
display: flex;
|
| 296 |
gap: 12px;
|
| 297 |
margin-top: 16px;
|
|
|
|
| 298 |
}
|
| 299 |
|
| 300 |
.generate-btn {
|
|
@@ -318,6 +286,22 @@ textarea:focus {
|
|
| 318 |
transform: scale(0.98) !important;
|
| 319 |
}
|
| 320 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
.output-section {
|
| 322 |
background: #f8f9fa;
|
| 323 |
border-radius: 8px;
|
|
@@ -352,6 +336,12 @@ textarea:focus {
|
|
| 352 |
border: 1px solid #f5c6cb;
|
| 353 |
}
|
| 354 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
.audio-player {
|
| 356 |
width: 100%;
|
| 357 |
}
|
|
@@ -363,6 +353,51 @@ textarea:focus {
|
|
| 363 |
padding: 8px 0;
|
| 364 |
}
|
| 365 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
@media (max-width: 768px) {
|
| 367 |
.main-container {
|
| 368 |
grid-template-columns: 1fr;
|
|
@@ -394,7 +429,6 @@ def load_tts_model(language_name):
|
|
| 394 |
lang_config = LANGUAGES[language_name]
|
| 395 |
model_id = lang_config["tts_model"]
|
| 396 |
|
| 397 |
-
# Check cache
|
| 398 |
if model_id in model_cache:
|
| 399 |
return model_cache[model_id]
|
| 400 |
|
|
@@ -411,13 +445,75 @@ def load_tts_model(language_name):
|
|
| 411 |
print(f"Error loading model {model_id}: {e}")
|
| 412 |
return None
|
| 413 |
|
| 414 |
-
def
|
| 415 |
-
"""
|
| 416 |
-
if
|
| 417 |
-
return
|
| 418 |
|
| 419 |
-
|
| 420 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
|
| 422 |
try:
|
| 423 |
synthesizer = load_tts_model(language)
|
|
@@ -425,58 +521,73 @@ def generate_speech(text, language):
|
|
| 425 |
return None, f"β Failed to load TTS model for {language}."
|
| 426 |
|
| 427 |
print(f"Generating speech for: {text[:50]}...")
|
| 428 |
-
|
| 429 |
-
# Generate speech
|
| 430 |
speech = synthesizer(text)
|
| 431 |
|
| 432 |
-
# Extract audio
|
| 433 |
audio_array = np.array(speech["audio"]).flatten()
|
| 434 |
sample_rate = speech["sampling_rate"]
|
| 435 |
|
| 436 |
-
# Save to temporary file
|
| 437 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
| 438 |
wavfile.write(f.name, sample_rate, (audio_array * 32767).astype(np.int16))
|
| 439 |
temp_path = f.name
|
| 440 |
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
"timestamp": datetime.now().isoformat()
|
| 446 |
})
|
| 447 |
|
| 448 |
-
return
|
| 449 |
-
|
| 450 |
except Exception as e:
|
| 451 |
-
print(f"Error
|
| 452 |
-
return None, f"β Error
|
| 453 |
|
| 454 |
def create_interface():
|
| 455 |
-
"""Create the
|
| 456 |
|
| 457 |
with gr.Blocks(
|
| 458 |
-
title="JamboGPT - African Language AI",
|
| 459 |
css=CUSTOM_CSS
|
| 460 |
) as demo:
|
| 461 |
|
| 462 |
-
# Main container
|
| 463 |
with gr.Row(equal_height=True):
|
| 464 |
# Sidebar
|
| 465 |
with gr.Column(scale=0, min_width=350):
|
| 466 |
gr.Markdown(
|
| 467 |
"""
|
| 468 |
<div class="sidebar">
|
| 469 |
-
<div class="sidebar-title">
|
| 470 |
</div>
|
| 471 |
"""
|
| 472 |
)
|
| 473 |
|
| 474 |
-
# History display
|
| 475 |
history_display = gr.Markdown(
|
| 476 |
"""
|
| 477 |
<div class="sidebar">
|
| 478 |
<div style="text-align: center; color: #999; padding: 20px; font-size: 13px;">
|
| 479 |
-
No
|
| 480 |
</div>
|
| 481 |
</div>
|
| 482 |
"""
|
|
@@ -489,25 +600,22 @@ def create_interface():
|
|
| 489 |
"""
|
| 490 |
<div class="header">
|
| 491 |
<div class="logo">π JamboGPT</div>
|
| 492 |
-
<div class="headline">African Language AI
|
| 493 |
-
<div class="subheadline">
|
| 494 |
</div>
|
| 495 |
"""
|
| 496 |
)
|
| 497 |
|
| 498 |
# Tabs
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
</div>
|
| 509 |
-
"""
|
| 510 |
-
)
|
| 511 |
|
| 512 |
# Input section
|
| 513 |
with gr.Group():
|
|
@@ -516,14 +624,14 @@ def create_interface():
|
|
| 516 |
# Language selector
|
| 517 |
language_choice = gr.Dropdown(
|
| 518 |
choices=list(LANGUAGES.keys()),
|
| 519 |
-
value="
|
| 520 |
label="Select Language",
|
| 521 |
interactive=True
|
| 522 |
)
|
| 523 |
|
| 524 |
# Language info
|
| 525 |
language_info = gr.Markdown(
|
| 526 |
-
f"π°πͺ **
|
| 527 |
)
|
| 528 |
|
| 529 |
def update_language_info(language):
|
|
@@ -534,17 +642,18 @@ def create_interface():
|
|
| 534 |
|
| 535 |
language_choice.change(update_language_info, inputs=language_choice, outputs=language_info)
|
| 536 |
|
| 537 |
-
#
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
|
|
|
| 542 |
interactive=True
|
| 543 |
)
|
| 544 |
|
| 545 |
-
#
|
| 546 |
-
|
| 547 |
-
"
|
| 548 |
variant="primary",
|
| 549 |
size="lg"
|
| 550 |
)
|
|
@@ -555,18 +664,35 @@ def create_interface():
|
|
| 555 |
with gr.Group():
|
| 556 |
gr.Markdown('<div class="output-section">')
|
| 557 |
|
| 558 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 559 |
|
|
|
|
|
|
|
| 560 |
audio_output = gr.Audio(
|
| 561 |
label="",
|
| 562 |
type="filepath",
|
| 563 |
interactive=False
|
| 564 |
)
|
| 565 |
|
|
|
|
| 566 |
status_message = gr.Textbox(
|
| 567 |
label="Status",
|
| 568 |
interactive=False,
|
| 569 |
-
value="Ready to
|
| 570 |
)
|
| 571 |
|
| 572 |
gr.Markdown('</div>')
|
|
@@ -575,27 +701,27 @@ def create_interface():
|
|
| 575 |
gr.Markdown(
|
| 576 |
"""
|
| 577 |
<div style="margin-top: 40px; padding-top: 20px; border-top: 1px solid #e0e0e0; font-size: 13px; color: #999;">
|
| 578 |
-
<p>π <strong>JamboGPT</strong> -
|
| 579 |
-
<p>Powered by
|
| 580 |
</div>
|
| 581 |
"""
|
| 582 |
)
|
| 583 |
|
| 584 |
-
# Connect
|
| 585 |
-
|
| 586 |
-
fn=
|
| 587 |
-
inputs=[
|
| 588 |
-
outputs=[audio_output, status_message]
|
| 589 |
)
|
| 590 |
|
| 591 |
return demo
|
| 592 |
|
| 593 |
if __name__ == "__main__":
|
| 594 |
-
print("π Creating JamboGPT Interface...")
|
| 595 |
demo = create_interface()
|
| 596 |
|
| 597 |
print("=" * 50)
|
| 598 |
-
print("✅ JamboGPT is ready!")
|
| 599 |
print("=" * 50)
|
| 600 |
|
| 601 |
demo.launch(
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
JamboGPT - African Language AI Voice Agent
|
| 4 |
+
Specialized for Kiswahili and Kikuyu with voice input/output.
|
| 5 |
Inspired by Yarn GPT's clean, professional design.
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
import os
|
| 9 |
import gradio as gr
|
| 10 |
import torch
|
| 11 |
+
import torchaudio
|
| 12 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
|
| 13 |
import numpy as np
|
| 14 |
from scipy.io import wavfile
|
| 15 |
import tempfile
|
|
|
|
| 16 |
from datetime import datetime
|
| 17 |
+
import json
|
| 18 |
|
| 19 |
# Set device
|
| 20 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 21 |
+
print(f"π Starting JamboGPT - African Language AI Voice Agent")
|
| 22 |
print(f"Using device: {device}")
|
| 23 |
print("=" * 50)
|
| 24 |
|
| 25 |
# Language configurations
|
| 26 |
LANGUAGES = {
|
| 27 |
+
"Kiswahili": {
|
| 28 |
"code": "swh",
|
| 29 |
"tts_model": "facebook/mms-tts-swh",
|
| 30 |
"emoji": "🇰🇪",
|
| 31 |
"speakers": "100M+",
|
| 32 |
+
"region": "East Africa",
|
| 33 |
+
"greetings": [
|
| 34 |
+
"Habari! Jina lako nani?",
|
| 35 |
+
"Karibu! Unajifunza nini leo?",
|
| 36 |
+
"Habari yako? Niweza kusaidia?",
|
| 37 |
+
"Asante kwa kukamatia! Unajifunza nini?"
|
| 38 |
+
],
|
| 39 |
+
"responses": {
|
| 40 |
+
"greeting": "Habari! Niko hapa kusaidia. Unajifunza nini leo?",
|
| 41 |
+
"help": "Niweza kusaidia kwa swahili. Tafadhali niambie unajifunza nini.",
|
| 42 |
+
"thanks": "Asante sana! Niko hapa kila wakati.",
|
| 43 |
+
"bye": "Kwaheri! Karibu tena mwingine wakati."
|
| 44 |
+
}
|
| 45 |
},
|
| 46 |
"Kikuyu": {
|
| 47 |
"code": "ki",
|
| 48 |
"tts_model": "BrianMwangi/African-Kikuyu-TTS",
|
| 49 |
"emoji": "🇰🇪",
|
| 50 |
"speakers": "7M",
|
| 51 |
+
"region": "Kenya",
|
| 52 |
+
"greetings": [
|
| 53 |
+
"Wĩ mwega! Wĩ ũrĩa mwega?",
|
| 54 |
+
"Karibu! Nĩguo mwega!",
|
| 55 |
+
"Mwega! Nĩ ũndũ ũrĩkũ?",
|
| 56 |
+
"Wĩ mwega! Nĩkĩo kĩndũ?"
|
| 57 |
+
],
|
| 58 |
+
"responses": {
|
| 59 |
+
"greeting": "Wĩ mwega! Nĩ ũndũ ũrĩkũ?",
|
| 60 |
+
"help": "Nĩ mwega! Nĩkĩo kĩndũ kĩrĩa ũrĩ na kĩo?",
|
| 61 |
+
"thanks": "Mwega muno! Nĩ mwega.",
|
| 62 |
+
"bye": "Rĩa rĩu! Wĩ mwega!"
|
| 63 |
+
}
|
| 64 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
}
|
| 66 |
|
| 67 |
# Cache for loaded models
|
| 68 |
model_cache = {}
|
| 69 |
+
conversation_history = []
|
|
|
|
|
|
|
| 70 |
|
| 71 |
# CSS inspired by Yarn GPT
|
| 72 |
CUSTOM_CSS = """
|
|
|
|
| 238 |
|
| 239 |
textarea {
|
| 240 |
width: 100% !important;
|
| 241 |
+
min-height: 100px !important;
|
| 242 |
padding: 16px !important;
|
| 243 |
border: 1px solid #d0d0d0 !important;
|
| 244 |
border-radius: 6px !important;
|
|
|
|
| 262 |
display: flex;
|
| 263 |
gap: 12px;
|
| 264 |
margin-top: 16px;
|
| 265 |
+
flex-wrap: wrap;
|
| 266 |
}
|
| 267 |
|
| 268 |
.generate-btn {
|
|
|
|
| 286 |
transform: scale(0.98) !important;
|
| 287 |
}
|
| 288 |
|
| 289 |
+
.secondary-btn {
|
| 290 |
+
background: #f0f0f0 !important;
|
| 291 |
+
color: #333 !important;
|
| 292 |
+
border: 1px solid #d0d0d0 !important;
|
| 293 |
+
border-radius: 6px !important;
|
| 294 |
+
padding: 12px 24px !important;
|
| 295 |
+
font-weight: 600 !important;
|
| 296 |
+
font-size: 14px !important;
|
| 297 |
+
cursor: pointer !important;
|
| 298 |
+
transition: all 0.2s ease !important;
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
.secondary-btn:hover {
|
| 302 |
+
background: #e0e0e0 !important;
|
| 303 |
+
}
|
| 304 |
+
|
| 305 |
.output-section {
|
| 306 |
background: #f8f9fa;
|
| 307 |
border-radius: 8px;
|
|
|
|
| 336 |
border: 1px solid #f5c6cb;
|
| 337 |
}
|
| 338 |
|
| 339 |
+
.status-info {
|
| 340 |
+
background: #d1ecf1;
|
| 341 |
+
color: #0c5460;
|
| 342 |
+
border: 1px solid #bee5eb;
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
.audio-player {
|
| 346 |
width: 100%;
|
| 347 |
}
|
|
|
|
| 353 |
padding: 8px 0;
|
| 354 |
}
|
| 355 |
|
| 356 |
+
.conversation-display {
|
| 357 |
+
background: white;
|
| 358 |
+
border: 1px solid #e0e0e0;
|
| 359 |
+
border-radius: 6px;
|
| 360 |
+
padding: 16px;
|
| 361 |
+
margin-bottom: 16px;
|
| 362 |
+
max-height: 400px;
|
| 363 |
+
overflow-y: auto;
|
| 364 |
+
font-size: 13px;
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
.message {
|
| 368 |
+
margin-bottom: 12px;
|
| 369 |
+
padding: 8px;
|
| 370 |
+
border-radius: 4px;
|
| 371 |
+
}
|
| 372 |
+
|
| 373 |
+
.user-message {
|
| 374 |
+
background: #e3f2fd;
|
| 375 |
+
color: #1565c0;
|
| 376 |
+
margin-left: 20px;
|
| 377 |
+
text-align: right;
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
.agent-message {
|
| 381 |
+
background: #f5f5f5;
|
| 382 |
+
color: #333;
|
| 383 |
+
margin-right: 20px;
|
| 384 |
+
}
|
| 385 |
+
|
| 386 |
+
.recording-indicator {
|
| 387 |
+
display: inline-block;
|
| 388 |
+
width: 12px;
|
| 389 |
+
height: 12px;
|
| 390 |
+
background: #ff4444;
|
| 391 |
+
border-radius: 50%;
|
| 392 |
+
margin-right: 8px;
|
| 393 |
+
animation: pulse 1s infinite;
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
@keyframes pulse {
|
| 397 |
+
0%, 100% { opacity: 1; }
|
| 398 |
+
50% { opacity: 0.5; }
|
| 399 |
+
}
|
| 400 |
+
|
| 401 |
@media (max-width: 768px) {
|
| 402 |
.main-container {
|
| 403 |
grid-template-columns: 1fr;
|
|
|
|
| 429 |
lang_config = LANGUAGES[language_name]
|
| 430 |
model_id = lang_config["tts_model"]
|
| 431 |
|
|
|
|
| 432 |
if model_id in model_cache:
|
| 433 |
return model_cache[model_id]
|
| 434 |
|
|
|
|
| 445 |
print(f"Error loading model {model_id}: {e}")
|
| 446 |
return None
|
| 447 |
|
| 448 |
+
def load_asr_model():
|
| 449 |
+
"""Load Automatic Speech Recognition model (Whisper)."""
|
| 450 |
+
if "asr" in model_cache:
|
| 451 |
+
return model_cache["asr"]
|
| 452 |
|
| 453 |
+
try:
|
| 454 |
+
print("Loading Whisper ASR model...")
|
| 455 |
+
asr = pipeline(
|
| 456 |
+
"automatic-speech-recognition",
|
| 457 |
+
model="openai/whisper-base",
|
| 458 |
+
device=device if device == "cuda" else -1
|
| 459 |
+
)
|
| 460 |
+
model_cache["asr"] = asr
|
| 461 |
+
return asr
|
| 462 |
+
except Exception as e:
|
| 463 |
+
print(f"Error loading ASR model: {e}")
|
| 464 |
+
return None
|
| 465 |
+
|
| 466 |
+
def transcribe_audio(audio_file):
|
| 467 |
+
"""Transcribe audio to text using Whisper."""
|
| 468 |
+
try:
|
| 469 |
+
asr = load_asr_model()
|
| 470 |
+
if asr is None:
|
| 471 |
+
return None, "β Failed to load ASR model."
|
| 472 |
+
|
| 473 |
+
print(f"Transcribing audio...")
|
| 474 |
+
result = asr(audio_file)
|
| 475 |
+
text = result.get("text", "").strip()
|
| 476 |
+
|
| 477 |
+
if not text:
|
| 478 |
+
return None, "β Could not transcribe audio. Please try again."
|
| 479 |
+
|
| 480 |
+
return text, f"✅ Transcribed: {text}"
|
| 481 |
+
except Exception as e:
|
| 482 |
+
print(f"Error transcribing: {e}")
|
| 483 |
+
return None, f"β Error transcribing: {str(e)}"
|
| 484 |
+
|
| 485 |
+
def generate_response(user_text, language):
|
| 486 |
+
"""Generate a response based on user input."""
|
| 487 |
+
try:
|
| 488 |
+
# Simple response generation based on keywords
|
| 489 |
+
user_text_lower = user_text.lower()
|
| 490 |
+
|
| 491 |
+
lang_config = LANGUAGES.get(language, {})
|
| 492 |
+
responses = lang_config.get("responses", {})
|
| 493 |
+
|
| 494 |
+
# Detect intent
|
| 495 |
+
if any(word in user_text_lower for word in ["habari", "wĩ", "how", "hello", "hi"]):
|
| 496 |
+
response = responses.get("greeting", "Habari!")
|
| 497 |
+
elif any(word in user_text_lower for word in ["asante", "thank", "mwega"]):
|
| 498 |
+
response = responses.get("thanks", "Asante!")
|
| 499 |
+
elif any(word in user_text_lower for word in ["bye", "goodbye", "kwaheri", "rĩa"]):
|
| 500 |
+
response = responses.get("bye", "Kwaheri!")
|
| 501 |
+
else:
|
| 502 |
+
# Default response
|
| 503 |
+
if language == "Kiswahili":
|
| 504 |
+
response = f"Ninataka kusikia zaidi kuhusu: {user_text}. Unaweza kuandika zaidi?"
|
| 505 |
+
else: # Kikuyu
|
| 506 |
+
response = f"Nĩ mwega! Wĩ ũrĩa mwega? Nĩkĩo kĩndũ?"
|
| 507 |
+
|
| 508 |
+
return response, "✅ Response generated!"
|
| 509 |
+
except Exception as e:
|
| 510 |
+
print(f"Error generating response: {e}")
|
| 511 |
+
return None, f"β Error: {str(e)}"
|
| 512 |
+
|
| 513 |
+
def synthesize_speech(text, language):
|
| 514 |
+
"""Convert text to speech."""
|
| 515 |
+
if not text or not text.strip():
|
| 516 |
+
return None, "β No text to synthesize."
|
| 517 |
|
| 518 |
try:
|
| 519 |
synthesizer = load_tts_model(language)
|
|
|
|
| 521 |
return None, f"β Failed to load TTS model for {language}."
|
| 522 |
|
| 523 |
print(f"Generating speech for: {text[:50]}...")
|
|
|
|
|
|
|
| 524 |
speech = synthesizer(text)
|
| 525 |
|
|
|
|
| 526 |
audio_array = np.array(speech["audio"]).flatten()
|
| 527 |
sample_rate = speech["sampling_rate"]
|
| 528 |
|
|
|
|
| 529 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
| 530 |
wavfile.write(f.name, sample_rate, (audio_array * 32767).astype(np.int16))
|
| 531 |
temp_path = f.name
|
| 532 |
|
| 533 |
+
return temp_path, "✅ Speech generated!"
|
| 534 |
+
except Exception as e:
|
| 535 |
+
print(f"Error synthesizing: {e}")
|
| 536 |
+
return None, f"β Error: {str(e)}"
|
| 537 |
+
|
| 538 |
+
def process_voice_input(audio_input, language):
|
| 539 |
+
"""Process voice input: transcribe -> generate response -> synthesize."""
|
| 540 |
+
try:
|
| 541 |
+
# Step 1: Transcribe
|
| 542 |
+
user_text, transcribe_status = transcribe_audio(audio_input)
|
| 543 |
+
if user_text is None:
|
| 544 |
+
return None, None, transcribe_status, ""
|
| 545 |
+
|
| 546 |
+
# Step 2: Generate response
|
| 547 |
+
response_text, response_status = generate_response(user_text, language)
|
| 548 |
+
if response_text is None:
|
| 549 |
+
return None, None, response_status, ""
|
| 550 |
+
|
| 551 |
+
# Step 3: Synthesize response
|
| 552 |
+
audio_output, synth_status = synthesize_speech(response_text, language)
|
| 553 |
+
|
| 554 |
+
# Add to conversation history
|
| 555 |
+
conversation_history.append({
|
| 556 |
+
"user": user_text,
|
| 557 |
+
"agent": response_text,
|
| 558 |
"timestamp": datetime.now().isoformat()
|
| 559 |
})
|
| 560 |
|
| 561 |
+
return audio_output, response_text, synth_status, user_text
|
|
|
|
| 562 |
except Exception as e:
|
| 563 |
+
print(f"Error processing voice: {e}")
|
| 564 |
+
return None, None, f"β Error: {str(e)}", ""
|
| 565 |
|
| 566 |
def create_interface():
|
| 567 |
+
"""Create the voice agent interface."""
|
| 568 |
|
| 569 |
with gr.Blocks(
|
| 570 |
+
title="JamboGPT - African Language AI Voice Agent",
|
| 571 |
css=CUSTOM_CSS
|
| 572 |
) as demo:
|
| 573 |
|
| 574 |
+
# Main container
|
| 575 |
with gr.Row(equal_height=True):
|
| 576 |
# Sidebar
|
| 577 |
with gr.Column(scale=0, min_width=350):
|
| 578 |
gr.Markdown(
|
| 579 |
"""
|
| 580 |
<div class="sidebar">
|
| 581 |
+
<div class="sidebar-title">π£οΈ Conversation History</div>
|
| 582 |
</div>
|
| 583 |
"""
|
| 584 |
)
|
| 585 |
|
|
|
|
| 586 |
history_display = gr.Markdown(
|
| 587 |
"""
|
| 588 |
<div class="sidebar">
|
| 589 |
<div style="text-align: center; color: #999; padding: 20px; font-size: 13px;">
|
| 590 |
+
No conversations yet
|
| 591 |
</div>
|
| 592 |
</div>
|
| 593 |
"""
|
|
|
|
| 600 |
"""
|
| 601 |
<div class="header">
|
| 602 |
<div class="logo">π JamboGPT</div>
|
| 603 |
+
<div class="headline">African Language AI Voice Agent</div>
|
| 604 |
+
<div class="subheadline">Speak in Kiswahili or Kikuyu and have a natural conversation with AI. Your voice is understood, processed, and responded to in your language.</div>
|
| 605 |
</div>
|
| 606 |
"""
|
| 607 |
)
|
| 608 |
|
| 609 |
# Tabs
|
| 610 |
+
gr.Markdown(
|
| 611 |
+
"""
|
| 612 |
+
<div class="tabs-container">
|
| 613 |
+
<button class="tab-button active">ποΈ Voice Agent</button>
|
| 614 |
+
<button class="tab-button">π Text Mode</button>
|
| 615 |
+
<button class="tab-button">βοΈ Settings</button>
|
| 616 |
+
</div>
|
| 617 |
+
"""
|
| 618 |
+
)
|
|
|
|
|
|
|
|
|
|
| 619 |
|
| 620 |
# Input section
|
| 621 |
with gr.Group():
|
|
|
|
| 624 |
# Language selector
|
| 625 |
language_choice = gr.Dropdown(
|
| 626 |
choices=list(LANGUAGES.keys()),
|
| 627 |
+
value="Kiswahili",
|
| 628 |
label="Select Language",
|
| 629 |
interactive=True
|
| 630 |
)
|
| 631 |
|
| 632 |
# Language info
|
| 633 |
language_info = gr.Markdown(
|
| 634 |
+
f"🇰🇪 **Kiswahili** • 100M+ speakers • East Africa"
|
| 635 |
)
|
| 636 |
|
| 637 |
def update_language_info(language):
|
|
|
|
| 642 |
|
| 643 |
language_choice.change(update_language_info, inputs=language_choice, outputs=language_info)
|
| 644 |
|
| 645 |
+
# Voice input
|
| 646 |
+
gr.Markdown("**π€ Speak in your language:**")
|
| 647 |
+
audio_input = gr.Audio(
|
| 648 |
+
label="Record your voice",
|
| 649 |
+
type="filepath",
|
| 650 |
+
sources=["microphone"],
|
| 651 |
interactive=True
|
| 652 |
)
|
| 653 |
|
| 654 |
+
# Process button
|
| 655 |
+
process_btn = gr.Button(
|
| 656 |
+
"ποΈ Process Voice",
|
| 657 |
variant="primary",
|
| 658 |
size="lg"
|
| 659 |
)
|
|
|
|
| 664 |
with gr.Group():
|
| 665 |
gr.Markdown('<div class="output-section">')
|
| 666 |
|
| 667 |
+
# Transcription
|
| 668 |
+
gr.Markdown('<div class="output-label">π What You Said</div>')
|
| 669 |
+
transcription = gr.Textbox(
|
| 670 |
+
label="",
|
| 671 |
+
interactive=False,
|
| 672 |
+
placeholder="Your transcribed text will appear here"
|
| 673 |
+
)
|
| 674 |
+
|
| 675 |
+
# Agent response
|
| 676 |
+
gr.Markdown('<div class="output-label">π€ Agent Response</div>')
|
| 677 |
+
agent_response = gr.Textbox(
|
| 678 |
+
label="",
|
| 679 |
+
interactive=False,
|
| 680 |
+
placeholder="The agent's response will appear here"
|
| 681 |
+
)
|
| 682 |
|
| 683 |
+
# Audio output
|
| 684 |
+
gr.Markdown('<div class="output-label">π Agent Voice</div>')
|
| 685 |
audio_output = gr.Audio(
|
| 686 |
label="",
|
| 687 |
type="filepath",
|
| 688 |
interactive=False
|
| 689 |
)
|
| 690 |
|
| 691 |
+
# Status
|
| 692 |
status_message = gr.Textbox(
|
| 693 |
label="Status",
|
| 694 |
interactive=False,
|
| 695 |
+
value="Ready to listen!"
|
| 696 |
)
|
| 697 |
|
| 698 |
gr.Markdown('</div>')
|
|
|
|
| 701 |
gr.Markdown(
|
| 702 |
"""
|
| 703 |
<div style="margin-top: 40px; padding-top: 20px; border-top: 1px solid #e0e0e0; font-size: 13px; color: #999;">
|
| 704 |
+
<p>π <strong>JamboGPT</strong> - African Language AI Voice Agent</p>
|
| 705 |
+
<p>Speak naturally in Kiswahili or Kikuyu • Powered by Whisper + Hugging Face • <a href="https://huggingface.co/spaces/stano03/jambogpt" style="color: #666;">View on Hugging Face</a></p>
|
| 706 |
</div>
|
| 707 |
"""
|
| 708 |
)
|
| 709 |
|
| 710 |
+
# Connect process button
|
| 711 |
+
process_btn.click(
|
| 712 |
+
fn=process_voice_input,
|
| 713 |
+
inputs=[audio_input, language_choice],
|
| 714 |
+
outputs=[audio_output, agent_response, status_message, transcription]
|
| 715 |
)
|
| 716 |
|
| 717 |
return demo
|
| 718 |
|
| 719 |
if __name__ == "__main__":
|
| 720 |
+
print("π Creating JamboGPT Voice Agent Interface...")
|
| 721 |
demo = create_interface()
|
| 722 |
|
| 723 |
print("=" * 50)
|
| 724 |
+
print("✅ JamboGPT Voice Agent is ready!")
|
| 725 |
print("=" * 50)
|
| 726 |
|
| 727 |
demo.launch(
|