Spaces:

stano03
/

jambogpt

Sleeping

App Files Files Community

JamboGPT Bot committed 4 days ago

Commit

2e235f6

1 Parent(s): eab1e34

🤖 Feature: AI Voice Agent for Kiswahili and Kikuyu with speech recognition and synthesis

Browse files

Files changed (1) hide show

app.py +255 -129

app.py CHANGED Viewed

@@ -1,105 +1,72 @@
 #!/usr/bin/env python3
 """
-JamboGPT - African Language AI
 Inspired by Yarn GPT's clean, professional design.
-A Gradio-based application for Text-to-Speech in African languages.
 """
 import os
 import gradio as gr
 import torch
-from transformers import pipeline
 import numpy as np
 from scipy.io import wavfile
 import tempfile
-import json
 from datetime import datetime
 # Set device
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"🌍 Starting JamboGPT - African Language AI")
 print(f"Using device: {device}")
 print("=" * 50)
 # Language configurations
 LANGUAGES = {
-    "Swahili": {
         "code": "swh",
         "tts_model": "facebook/mms-tts-swh",
         "emoji": "🇰🇪",
         "speakers": "100M+",
-        "region": "East Africa"
     },
     "Kikuyu": {
         "code": "ki",
         "tts_model": "BrianMwangi/African-Kikuyu-TTS",
         "emoji": "🇰🇪",
         "speakers": "7M",
-        "region": "Kenya"
-    },
-    "Yoruba": {
-        "code": "yor",
-        "tts_model": "facebook/mms-tts-yor",
-        "emoji": "🇳🇬",
-        "speakers": "45M",
-        "region": "West Africa"
-    },
-    "Hausa": {
-        "code": "hau",
-        "tts_model": "facebook/mms-tts-hau",
-        "emoji": "🇳🇬",
-        "speakers": "90M",
-        "region": "West Africa"
-    },
-    "Amharic": {
-        "code": "amh",
-        "tts_model": "facebook/mms-tts-amh",
-        "emoji": "🇪🇹",
-        "speakers": "32M",
-        "region": "Ethiopia"
-    },
-    "Fon": {
-        "code": "fon",
-        "tts_model": "facebook/mms-tts-fon",
-        "emoji": "🇧🇯",
-        "speakers": "2M",
-        "region": "Benin, Togo"
-    },
-    "Oromo": {
-        "code": "orm",
-        "tts_model": "facebook/mms-tts-orm",
-        "emoji": "🇪🇹",
-        "speakers": "40M",
-        "region": "Ethiopia, Kenya"
-    },
-    "Somali": {
-        "code": "som",
-        "tts_model": "facebook/mms-tts-som",
-        "emoji": "🇸🇴",
-        "speakers": "20M",
-        "region": "East Africa"
-    },
-    "Tigrinya": {
-        "code": "tir",
-        "tts_model": "facebook/mms-tts-tir",
-        "emoji": "🇪🇷",
-        "speakers": "7M",
-        "region": "Horn of Africa"
-    },
-    "English": {
-        "code": "eng",
-        "tts_model": "facebook/mms-tts-eng",
-        "emoji": "🌍",
-        "speakers": "1.5B",
-        "region": "Global"
-    },
 }
 # Cache for loaded models
 model_cache = {}
-# History storage
-history = []
 # CSS inspired by Yarn GPT
 CUSTOM_CSS = """
@@ -271,7 +238,7 @@ body {
 textarea {
     width: 100% !important;
-    min-height: 120px !important;
     padding: 16px !important;
     border: 1px solid #d0d0d0 !important;
     border-radius: 6px !important;
@@ -295,6 +262,7 @@ textarea:focus {
     display: flex;
     gap: 12px;
     margin-top: 16px;
 }
 .generate-btn {
@@ -318,6 +286,22 @@ textarea:focus {
     transform: scale(0.98) !important;
 }
 .output-section {
     background: #f8f9fa;
     border-radius: 8px;
@@ -352,6 +336,12 @@ textarea:focus {
     border: 1px solid #f5c6cb;
 }
 .audio-player {
     width: 100%;
 }
@@ -363,6 +353,51 @@ textarea:focus {
     padding: 8px 0;
 }
 @media (max-width: 768px) {
     .main-container {
         grid-template-columns: 1fr;
@@ -394,7 +429,6 @@ def load_tts_model(language_name):
     lang_config = LANGUAGES[language_name]
     model_id = lang_config["tts_model"]
-    # Check cache
     if model_id in model_cache:
         return model_cache[model_id]
@@ -411,13 +445,75 @@ def load_tts_model(language_name):
         print(f"Error loading model {model_id}: {e}")
         return None
-def generate_speech(text, language):
-    """Generate speech from text in the specified language."""
-    if not text or not text.strip():
-        return None, "❌ Please enter some text to generate speech."
-    if len(text) > 1000:
-        return None, "❌ Text is too long. Maximum 1000 characters allowed."
     try:
         synthesizer = load_tts_model(language)
@@ -425,58 +521,73 @@ def generate_speech(text, language):
             return None, f"❌ Failed to load TTS model for {language}."
         print(f"Generating speech for: {text[:50]}...")
-        # Generate speech
         speech = synthesizer(text)
-        # Extract audio
         audio_array = np.array(speech["audio"]).flatten()
         sample_rate = speech["sampling_rate"]
-        # Save to temporary file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
             wavfile.write(f.name, sample_rate, (audio_array * 32767).astype(np.int16))
             temp_path = f.name
-        # Add to history
-        history.append({
-            "text": text[:50] + "..." if len(text) > 50 else text,
-            "language": language,
             "timestamp": datetime.now().isoformat()
         })
-        return temp_path, f"✅ Speech generated successfully in {language}!"
     except Exception as e:
-        print(f"Error generating speech: {e}")
-        return None, f"❌ Error generating speech: {str(e)}"
 def create_interface():
-    """Create the Yarn GPT-inspired interface."""
     with gr.Blocks(
-        title="JamboGPT - African Language AI",
         css=CUSTOM_CSS
     ) as demo:
-        # Main container with sidebar
         with gr.Row(equal_height=True):
             # Sidebar
             with gr.Column(scale=0, min_width=350):
                 gr.Markdown(
                     """
                     <div class="sidebar">
-                        <div class="sidebar-title">Recent Generations</div>
                     </div>
                     """
                 )
-                # History display
                 history_display = gr.Markdown(
                     """
                     <div class="sidebar">
                         <div style="text-align: center; color: #999; padding: 20px; font-size: 13px;">
-                        No recent generations yet
                         </div>
                     </div>
                     """
@@ -489,25 +600,22 @@ def create_interface():
                     """
                     <div class="header">
                         <div class="logo">🌍 JamboGPT</div>
-                        <div class="headline">African Language AI: The No 1 Multilingual Text-to-Speech Engine</div>
-                        <div class="subheadline">Generate natural-sounding speech in 9 African languages. Integrate with a simple API or use our web interface.</div>
                     </div>
                     """
                 )
                 # Tabs
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        gr.Markdown(
-                            """
-                            <div class="tabs-container">
-                                <button class="tab-button active">🎤 Text Input</button>
-                                <button class="tab-button">📚 Document</button>
-                                <button class="tab-button">💬 Conversation</button>
-                                <button class="tab-button">📊 Batch</button>
-                            </div>
-                            """
-                        )
                 # Input section
                 with gr.Group():
@@ -516,14 +624,14 @@ def create_interface():
                     # Language selector
                     language_choice = gr.Dropdown(
                         choices=list(LANGUAGES.keys()),
-                        value="Swahili",
                         label="Select Language",
                         interactive=True
                     )
                     # Language info
                     language_info = gr.Markdown(
-                        f"🇰🇪 **Swahili** • 100M+ speakers • East Africa"
                     )
                     def update_language_info(language):
@@ -534,17 +642,18 @@ def create_interface():
                     language_choice.change(update_language_info, inputs=language_choice, outputs=language_info)
-                    # Text input
-                    text_input = gr.Textbox(
-                        label="Enter your text here to generate a single audio file.",
-                        placeholder="Type your text in the selected language...",
-                        lines=4,
                         interactive=True
                     )
-                    # Generate button
-                    generate_btn = gr.Button(
-                        "🎵 Generate Speech",
                         variant="primary",
                         size="lg"
                     )
@@ -555,18 +664,35 @@ def create_interface():
                 with gr.Group():
                     gr.Markdown('<div class="output-section">')
-                    gr.Markdown('<div class="output-label">Generated Audio</div>')
                     audio_output = gr.Audio(
                         label="",
                         type="filepath",
                         interactive=False
                     )
                     status_message = gr.Textbox(
                         label="Status",
                         interactive=False,
-                        value="Ready to generate speech!"
                     )
                     gr.Markdown('</div>')
@@ -575,27 +701,27 @@ def create_interface():
                 gr.Markdown(
                     """
                     <div style="margin-top: 40px; padding-top: 20px; border-top: 1px solid #e0e0e0; font-size: 13px; color: #999;">
-                    <p>🌍 <strong>JamboGPT</strong> - Making AI accessible in African languages</p>
-                    <p>Powered by Meta's Massively Multilingual Speech (MMS) • <a href="https://huggingface.co/spaces/stano03/jambogpt" style="color: #666;">View on Hugging Face</a></p>
                     </div>
                     """
                 )
-        # Connect generate button
-        generate_btn.click(
-            fn=generate_speech,
-            inputs=[text_input, language_choice],
-            outputs=[audio_output, status_message]
         )
     return demo
 if __name__ == "__main__":
-    print("🚀 Creating JamboGPT Interface...")
     demo = create_interface()
     print("=" * 50)
-    print("✅ JamboGPT is ready!")
     print("=" * 50)
     demo.launch(

 #!/usr/bin/env python3
 """
+JamboGPT - African Language AI Voice Agent
+Specialized for Kiswahili and Kikuyu with voice input/output.
 Inspired by Yarn GPT's clean, professional design.
 """
 import os
 import gradio as gr
 import torch
+import torchaudio
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import numpy as np
 from scipy.io import wavfile
 import tempfile
 from datetime import datetime
+import json
 # Set device
 device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"🌍 Starting JamboGPT - African Language AI Voice Agent")
 print(f"Using device: {device}")
 print("=" * 50)
 # Language configurations
+# Keys are the display names offered in the language dropdown
+# (choices=list(LANGUAGES.keys())). Per-language fields:
+#   code       - language code (NOTE(review): "swh" is three-letter, "ki" is
+#                two-letter; confirm which standard consumers expect)
+#   tts_model  - model identifier read by load_tts_model()
+#   emoji / speakers / region - descriptive metadata; presumably rendered in
+#                the language info line under the dropdown
+#   greetings  - canned greeting phrases; not referenced by any code visible
+#                in this diff
+#   responses  - canned replies keyed by intent; generate_response() reads
+#                "greeting", "thanks" and "bye" ("help" is not read by any
+#                code visible in this diff)
 LANGUAGES = {
+    "Kiswahili": {
         "code": "swh",
         "tts_model": "facebook/mms-tts-swh",
         "emoji": "🇰🇪",
         "speakers": "100M+",
+        "region": "East Africa",
+        "greetings": [
+            "Habari! Jina lako nani?",
+            "Karibu! Unajifunza nini leo?",
+            "Habari yako? Niweza kusaidia?",
+            "Asante kwa kukamatia! Unajifunza nini?"
+        ],
+        "responses": {
+            "greeting": "Habari! Niko hapa kusaidia. Unajifunza nini leo?",
+            "help": "Niweza kusaidia kwa swahili. Tafadhali niambie unajifunza nini.",
+            "thanks": "Asante sana! Niko hapa kila wakati.",
+            "bye": "Kwaheri! Karibu tena mwingine wakati."
+        }
     },
     "Kikuyu": {
         "code": "ki",
         "tts_model": "BrianMwangi/African-Kikuyu-TTS",
         "emoji": "🇰🇪",
         "speakers": "7M",
+        "region": "Kenya",
+        "greetings": [
+            "Wĩ mwega! Wĩ ũrĩa mwega?",
+            "Karibu! Nĩguo mwega!",
+            "Mwega! Nĩ ũndũ ũrĩkũ?",
+            "Wĩ mwega! Nĩkĩo kĩndũ?"
+        ],
+        "responses": {
+            "greeting": "Wĩ mwega! Nĩ ũndũ ũrĩkũ?",
+            "help": "Nĩ mwega! Nĩkĩo kĩndũ kĩrĩa ũrĩ na kĩo?",
+            "thanks": "Mwega muno! Nĩ mwega.",
+            "bye": "Rĩa rĩu! Wĩ mwega!"
+        }
+    }
 }
 # Cache for loaded models
+# Keyed by TTS model id (load_tts_model) and by the literal "asr"
+# (load_asr_model), so each model is loaded at most once per process.
 model_cache = {}
+# Append-only log of {"user", "agent", "timestamp"} dicts written by
+# process_voice_input(). NOTE(review): this is module-level state, so it is
+# likely shared by every visitor of the app, and nothing visible here trims it.
+conversation_history = []
 # CSS inspired by Yarn GPT
 CUSTOM_CSS = """
 textarea {
     width: 100% !important;
+    min-height: 100px !important;
     padding: 16px !important;
     border: 1px solid #d0d0d0 !important;
     border-radius: 6px !important;
     display: flex;
     gap: 12px;
     margin-top: 16px;
+    flex-wrap: wrap;
 }
 .generate-btn {
     transform: scale(0.98) !important;
 }
+.secondary-btn {
+    background: #f0f0f0 !important;
+    color: #333 !important;
+    border: 1px solid #d0d0d0 !important;
+    border-radius: 6px !important;
+    padding: 12px 24px !important;
+    font-weight: 600 !important;
+    font-size: 14px !important;
+    cursor: pointer !important;
+    transition: all 0.2s ease !important;
+}
+.secondary-btn:hover {
+    background: #e0e0e0 !important;
+}
 .output-section {
     background: #f8f9fa;
     border-radius: 8px;
     border: 1px solid #f5c6cb;
 }
+.status-info {
+    background: #d1ecf1;
+    color: #0c5460;
+    border: 1px solid #bee5eb;
+}
 .audio-player {
     width: 100%;
 }
     padding: 8px 0;
 }
+.conversation-display {
+    background: white;
+    border: 1px solid #e0e0e0;
+    border-radius: 6px;
+    padding: 16px;
+    margin-bottom: 16px;
+    max-height: 400px;
+    overflow-y: auto;
+    font-size: 13px;
+}
+.message {
+    margin-bottom: 12px;
+    padding: 8px;
+    border-radius: 4px;
+}
+.user-message {
+    background: #e3f2fd;
+    color: #1565c0;
+    margin-left: 20px;
+    text-align: right;
+}
+.agent-message {
+    background: #f5f5f5;
+    color: #333;
+    margin-right: 20px;
+}
+.recording-indicator {
+    display: inline-block;
+    width: 12px;
+    height: 12px;
+    background: #ff4444;
+    border-radius: 50%;
+    margin-right: 8px;
+    animation: pulse 1s infinite;
+}
+@keyframes pulse {
+    0%, 100% { opacity: 1; }
+    50% { opacity: 0.5; }
+}
 @media (max-width: 768px) {
     .main-container {
         grid-template-columns: 1fr;
     lang_config = LANGUAGES[language_name]
     model_id = lang_config["tts_model"]
     if model_id in model_cache:
         return model_cache[model_id]
         print(f"Error loading model {model_id}: {e}")
         return None
+def load_asr_model():
+    """Return the Whisper speech-recognition pipeline, building it on first use.
+
+    The pipeline is stored in ``model_cache`` under the key ``"asr"`` so later
+    calls reuse it. Returns ``None`` (after printing the error) when the
+    pipeline cannot be created.
+    """
+    try:
+        return model_cache["asr"]
+    except KeyError:
+        pass  # not loaded yet; fall through and build it
+    try:
+        print("Loading Whisper ASR model...")
+        # The pipeline receives the "cuda" device string as-is, otherwise -1.
+        target_device = -1 if device != "cuda" else device
+        recognizer = pipeline(
+            "automatic-speech-recognition",
+            model="openai/whisper-base",
+            device=target_device,
+        )
+    except Exception as err:
+        print(f"Error loading ASR model: {err}")
+        return None
+    else:
+        model_cache["asr"] = recognizer
+        return recognizer
+def transcribe_audio(audio_file):
+    """Transcribe a recorded audio file to text with the Whisper ASR pipeline.
+
+    Args:
+        audio_file: Path of the recording to transcribe. ``None`` or an empty
+            value means nothing was recorded.
+
+    Returns:
+        ``(text, status)`` on success, or ``(None, error_message)`` when no
+        audio was supplied, the ASR model could not be loaded, the transcript
+        is empty, or the pipeline raised.
+    """
+    # Without this guard a missing recording reaches the pipeline and comes
+    # back as a raw exception message instead of an actionable hint.
+    if not audio_file:
+        return None, "❌ No audio received. Please record your voice first."
+    try:
+        asr = load_asr_model()
+        if asr is None:
+            return None, "❌ Failed to load ASR model."
+        print("Transcribing audio...")
+        result = asr(audio_file)
+        # Tolerate a missing or None "text" entry rather than raising on .strip().
+        text = (result.get("text") or "").strip()
+        if not text:
+            return None, "❌ Could not transcribe audio. Please try again."
+        return text, f"✅ Transcribed: {text}"
+    except Exception as e:
+        print(f"Error transcribing: {e}")
+        return None, f"❌ Error transcribing: {str(e)}"
+def generate_response(user_text, language):
+    """Choose a canned reply for ``user_text`` in ``language``.
+
+    Intent is detected from trigger words, checked in priority order
+    greeting -> thanks -> bye. Triggers are compared against whole words of
+    the input (lower-cased, NFC-normalized). Substring matching would fire
+    "hi" inside "nchi" or "this", and "rĩa" inside "ũrĩa".
+
+    Args:
+        user_text: The user's utterance as text.
+        language: Key into ``LANGUAGES``; its ``"responses"`` table supplies
+            the reply for a detected intent.
+
+    Returns:
+        ``(response, status)`` on success, or ``(None, error_message)`` if
+        anything raises (for example ``user_text`` is not a string).
+    """
+    import re
+    import unicodedata
+
+    def _norm(text):
+        # NFC so that "ĩ"/"ũ" compare equal whether typed precomposed or as a
+        # base letter plus combining tilde.
+        return unicodedata.normalize("NFC", text).lower()
+
+    # (intent key, trigger words, fallback reply) in priority order.
+    # "thanks" is listed explicitly because whole-word matching no longer
+    # catches it through the "thank" prefix.
+    intents = (
+        ("greeting", ("habari", "wĩ", "how", "hello", "hi"), "Habari!"),
+        ("thanks", ("asante", "thank", "thanks", "mwega"), "Asante!"),
+        ("bye", ("bye", "goodbye", "kwaheri", "rĩa"), "Kwaheri!"),
+    )
+    try:
+        responses = LANGUAGES.get(language, {}).get("responses", {})
+        words = set(re.findall(r"\w+", _norm(user_text)))
+        for intent, triggers, fallback in intents:
+            if any(_norm(trigger) in words for trigger in triggers):
+                return responses.get(intent, fallback), "✅ Response generated!"
+        # No intent recognised: fall back to a generic prompt.
+        if language == "Kiswahili":
+            response = f"Ninataka kusikia zaidi kuhusu: {user_text}. Unaweza kuandika zaidi?"
+        else:  # Kikuyu
+            response = "Nĩ mwega! Wĩ ũrĩa mwega? Nĩkĩo kĩndũ?"
+        return response, "✅ Response generated!"
+    except Exception as e:
+        print(f"Error generating response: {e}")
+        return None, f"❌ Error: {str(e)}"
+def synthesize_speech(text, language):
+    """Convert text to speech."""
+    if not text or not text.strip():
+        return None, "❌ No text to synthesize."
     try:
         synthesizer = load_tts_model(language)
             return None, f"❌ Failed to load TTS model for {language}."
         print(f"Generating speech for: {text[:50]}...")
         speech = synthesizer(text)
         audio_array = np.array(speech["audio"]).flatten()
         sample_rate = speech["sampling_rate"]
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
             wavfile.write(f.name, sample_rate, (audio_array * 32767).astype(np.int16))
             temp_path = f.name
+        return temp_path, "✅ Speech generated!"
+    except Exception as e:
+        print(f"Error synthesizing: {e}")
+        return None, f"❌ Error: {str(e)}"
+def process_voice_input(audio_input, language):
+    """Run one voice turn: speech -> text -> reply text -> reply speech.
+
+    Returns a 4-tuple ``(reply_audio, reply_text, status, transcript)``.
+    When transcription or reply generation fails, the first two items are
+    ``None``, the status carries that stage's message and the transcript is
+    ``""``. Completed turns are appended to ``conversation_history``.
+    """
+    try:
+        # Stage 1: speech to text.
+        heard, heard_status = transcribe_audio(audio_input)
+        if heard is None:
+            return None, None, heard_status, ""
+
+        # Stage 2: choose the agent's reply.
+        reply, reply_status = generate_response(heard, language)
+        if reply is None:
+            return None, None, reply_status, ""
+
+        # Stage 3: text to speech. Its status is what the caller sees.
+        reply_audio, speech_status = synthesize_speech(reply, language)
+
+        turn = {
+            "user": heard,
+            "agent": reply,
+            "timestamp": datetime.now().isoformat(),
+        }
+        conversation_history.append(turn)
+        return reply_audio, reply, speech_status, heard
+    except Exception as err:
+        print(f"Error processing voice: {err}")
+        return None, None, "❌ Error: " + str(err), ""
 def create_interface():
+    """Create the voice agent interface."""
     with gr.Blocks(
+        title="JamboGPT - African Language AI Voice Agent",
         css=CUSTOM_CSS
     ) as demo:
+        # Main container
         with gr.Row(equal_height=True):
             # Sidebar
             with gr.Column(scale=0, min_width=350):
                 gr.Markdown(
                     """
                     <div class="sidebar">
+                        <div class="sidebar-title">🗣️ Conversation History</div>
                     </div>
                     """
                 )
                 history_display = gr.Markdown(
                     """
                     <div class="sidebar">
                         <div style="text-align: center; color: #999; padding: 20px; font-size: 13px;">
+                        No conversations yet
                         </div>
                     </div>
                     """
                     """
                     <div class="header">
                         <div class="logo">🌍 JamboGPT</div>
+                        <div class="headline">African Language AI Voice Agent</div>
+                        <div class="subheadline">Speak in Kiswahili or Kikuyu and have a natural conversation with AI. Your voice is understood, processed, and responded to in your language.</div>
                     </div>
                     """
                 )
                 # Tabs
+                gr.Markdown(
+                    """
+                    <div class="tabs-container">
+                        <button class="tab-button active">🎙️ Voice Agent</button>
+                        <button class="tab-button">📝 Text Mode</button>
+                        <button class="tab-button">⚙️ Settings</button>
+                    </div>
+                    """
+                )
                 # Input section
                 with gr.Group():
                     # Language selector
                     language_choice = gr.Dropdown(
                         choices=list(LANGUAGES.keys()),
+                        value="Kiswahili",
                         label="Select Language",
                         interactive=True
                     )
                     # Language info
                     language_info = gr.Markdown(
+                        f"🇰🇪 **Kiswahili** • 100M+ speakers • East Africa"
                     )
                     def update_language_info(language):
                     language_choice.change(update_language_info, inputs=language_choice, outputs=language_info)
+                    # Voice input
+                    gr.Markdown("**🎤 Speak in your language:**")
+                    audio_input = gr.Audio(
+                        label="Record your voice",
+                        type="filepath",
+                        sources=["microphone"],
                         interactive=True
                     )
+                    # Process button
+                    process_btn = gr.Button(
+                        "🎙️ Process Voice",
                         variant="primary",
                         size="lg"
                     )
                 with gr.Group():
                     gr.Markdown('<div class="output-section">')
+                    # Transcription
+                    gr.Markdown('<div class="output-label">📝 What You Said</div>')
+                    transcription = gr.Textbox(
+                        label="",
+                        interactive=False,
+                        placeholder="Your transcribed text will appear here"
+                    )
+                    # Agent response
+                    gr.Markdown('<div class="output-label">🤖 Agent Response</div>')
+                    agent_response = gr.Textbox(
+                        label="",
+                        interactive=False,
+                        placeholder="The agent's response will appear here"
+                    )
+                    # Audio output
+                    gr.Markdown('<div class="output-label">🔊 Agent Voice</div>')
                     audio_output = gr.Audio(
                         label="",
                         type="filepath",
                         interactive=False
                     )
+                    # Status
                     status_message = gr.Textbox(
                         label="Status",
                         interactive=False,
+                        value="Ready to listen!"
                     )
                     gr.Markdown('</div>')
                 gr.Markdown(
                     """
                     <div style="margin-top: 40px; padding-top: 20px; border-top: 1px solid #e0e0e0; font-size: 13px; color: #999;">
+                    <p>🌍 <strong>JamboGPT</strong> - African Language AI Voice Agent</p>
+                    <p>Speak naturally in Kiswahili or Kikuyu • Powered by Whisper + Hugging Face • <a href="https://huggingface.co/spaces/stano03/jambogpt" style="color: #666;">View on Hugging Face</a></p>
                     </div>
                     """
                 )
+        # Connect process button
+        process_btn.click(
+            fn=process_voice_input,
+            inputs=[audio_input, language_choice],
+            outputs=[audio_output, agent_response, status_message, transcription]
         )
     return demo
 if __name__ == "__main__":
+    print("🚀 Creating JamboGPT Voice Agent Interface...")
     demo = create_interface()
     print("=" * 50)
+    print("✅ JamboGPT Voice Agent is ready!")
     print("=" * 50)
     demo.launch(