Update app.py
app.py CHANGED

@@ -3,75 +3,109 @@
import os
import subprocess
import sys
-import pkg_resources
import time
import tempfile
-import numpy as np
import warnings
-from pathlib import Path
warnings.filterwarnings("ignore")

-    "librosa": None,
-    "scipy": None,
-    "matplotlib": None,
-    "pydub": None
-}
-if package not in installed_packages:
-    install_package(package, version)

+def run_pip(*args):
+    """Run a pip install command and raise on failure."""
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-cache-dir"] + list(args))
+
+# ── Phase 1: Install packages ─────────────────────────────────────────────────
+# FIX 1: Use importlib-based checks instead of deprecated pkg_resources.
+# FIX 2: torch → CPU-only wheel (~190 MB vs ~900 MB CUDA) to avoid disk quota.
+# FIX 3: transformers pinned to 4.46.3 (last v4); v5 dropped audio-classification
+#        pipeline support for many models AND is much larger on disk.
+# FIX 4: torchaudio pulled without the CUDA index so it stays CPU-only too.
+
+print("=== Installing gradio (if needed) ===")
+try:
+    import gradio  # noqa: F401
+    print("gradio already installed.")
+except ImportError:
+    run_pip("gradio")
+
+print("=== Installing torch CPU-only (if needed) ===")
+try:
+    import torch  # noqa: F401
+    print("torch already installed.")
+except ImportError:
+    run_pip("torch", "torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu")
+
+print("=== Installing torchaudio (if needed) ===")
+try:
+    import torchaudio  # noqa: F401
+    print("torchaudio already installed.")
+except ImportError:
+    run_pip("torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu")
+
+print("=== Installing transformers 4.46.3 (if needed) ===")
+try:
+    import transformers as _tf
+    if _tf.__version__ != "4.46.3":
+        raise ImportError("wrong version")
+    print("transformers 4.46.3 already installed.")
+except (ImportError, AttributeError):
+    run_pip("transformers==4.46.3")
+
+print("=== Installing remaining packages (if needed) ===")
+for pkg in ["librosa", "scipy", "matplotlib", "pydub"]:
    try:
+        __import__(pkg)
+        print(f"{pkg} already installed.")
+    except ImportError:
+        run_pip(pkg)

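FIX 1 replaces the pkg_resources scan with plain import probes. An equivalent probe that avoids importing the package at all is importlib.util.find_spec; a minimal sketch (the `ensure` helper is illustrative, not part of this commit):

```python
import importlib.util
import subprocess
import sys

def ensure(module, pip_spec=None):
    """Install via pip only when importlib cannot locate the module.
    Note: unlike the transformers check above, this only tests presence,
    not the installed version."""
    if importlib.util.find_spec(module) is None:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "--no-cache-dir", pip_spec or module]
        )

ensure("librosa")                                # module name == pip name here
ensure("transformers", "transformers==4.46.3")   # pinned, matching the commit
```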
+
# ββ Phase 2: Patch transformers get_session β requests.Session βββββββββββββββ
|
| 63 |
+
# transformers 4.46.3 calls get_session().head(..., allow_redirects=, proxies=, ...)
|
| 64 |
+
# In this environment get_session() returns an httpx.Client (gradio depends on
|
| 65 |
+
# httpx), which rejects every requests-style kwarg.
|
| 66 |
+
# Fix: replace get_session in the already-imported module namespace so it always
|
| 67 |
+
# returns a plain requests.Session, which accepts all those kwargs natively.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
+
import transformers.utils.hub as _t_hub # noqa: E402
|
| 70 |
+
import requests as _requests # noqa: E402
|
|
|
|
|
|
|
| 71 |
|
| 72 |
+
_t_hub.get_session = lambda: _requests.Session()
|
| 73 |
+
print("Patched transformers.utils.hub.get_session β requests.Session()")
|
| 74 |
+
|
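Because this patch reaches into another module's namespace, a slightly more defensive variant (a sketch under the same assumptions, not part of the commit) verifies the attribute exists first, so a future transformers release that moves get_session fails loudly instead of silently adding an attribute nothing calls:

```python
import requests
import transformers.utils.hub as hub

if not hasattr(hub, "get_session"):
    raise RuntimeError("transformers.utils.hub.get_session moved; update this patch")

# requests.Session natively accepts the requests-style kwargs
# (allow_redirects=, proxies=, timeout=) that an httpx.Client rejects.
hub.get_session = lambda: requests.Session()
```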
+# ── Phase 3: Safe imports ──────────────────────────────────────────────────────
+
+import numpy as np
import gradio as gr
import torch
import torchaudio
import librosa
+import matplotlib
+matplotlib.use('Agg')
import matplotlib.pyplot as plt
-from matplotlib.colors import LinearSegmentedColormap
from pydub import AudioSegment
import scipy
import io
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
from pathlib import Path
-import matplotlib
-matplotlib.use('Agg')  # Use non-interactive backend

+# ── Emotion metadata ───────────────────────────────────────────────────────────
+
EMOTION_DESCRIPTIONS = {
-    "angry":
+    "angry": "Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.",
    "disgust": "Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.",
-    "fear":
+    "fear": "Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.",
-    "happy":
+    "happy": "Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.",
    "neutral": "Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.",
-    "sad":
+    "sad": "Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.",
-    "surprise":
+    "surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.",
}

-# Here we map emotion to a generalized tone (for example, negative or positive)
TONE_MAPPING = {
    "positive": ["happy", "surprise"],
-    "neutral":
+    "neutral": ["neutral"],
-    "negative": ["angry", "sad", "fear", "disgust"]
+    "negative": ["angry", "sad", "fear", "disgust"],
}

-# Some Hugging Face models return short labels (e.g., "hap", "ang", etc.).
-# This mapping will ensure they're translated into our full canonical labels.
MODEL_TO_EMOTION_MAP = {
    "hap": "happy",
    "ang": "angry",

@@ -79,19 +113,18 @@
    "dis": "disgust",
    "fea": "fear",
    "neu": "neutral",
-    "sur": "surprise"
+    "sur": "surprise",
}

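For reference, an audio-classification pipeline returns a list of {'label', 'score'} dicts per input. A worked example of the canonicalization using the map above (the scores below are hypothetical):

```python
# Hypothetical classifier output for one chunk:
results = [{"label": "hap", "score": 0.71}, {"label": "neu", "score": 0.18},
           {"label": "ang", "score": 0.08}, {"label": "sad", "score": 0.03}]

top = max(results, key=lambda x: x["score"])
raw = top["label"].lower().strip()                 # "hap"
canonical = MODEL_TO_EMOTION_MAP.get(raw, raw)     # "happy"
```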
+# ── Model loading ──────────────────────────────────────────────────────────────
+
audio_emotion_classifier = None

def load_emotion_model():
-    """Load the emotion classification model once and cache it."""
    global audio_emotion_classifier
    if audio_emotion_classifier is None:
        try:
            print("Loading emotion classification model...")
-            # Using the Hugging Face pipeline with the new model that classifies speech emotion
            model_name = "superb/hubert-large-superb-er"
            audio_emotion_classifier = pipeline("audio-classification", model=model_name)
            print("Emotion classification model loaded successfully")

@@ -101,359 +134,255 @@
            return False
    return True

+# ── Audio helpers ──────────────────────────────────────────────────────────────
+
def convert_audio_to_wav(audio_file):
-    """Convert the uploaded audio to WAV format."""
    try:
        audio = AudioSegment.from_file(audio_file)
-        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as
-        return wav_path
+        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
+            audio.export(tmp.name, format="wav")
+            return tmp.name
    except Exception as e:
        print(f"Error converting audio: {e}")
        return None

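A usage sketch for the helper ("clip.mp3" is a placeholder path; pydub needs ffmpeg on PATH to decode mp3/m4a). The helper returns a temp-file path the caller is responsible for deleting:

```python
wav_path = convert_audio_to_wav("clip.mp3")   # placeholder input
if wav_path:
    print("Converted to:", wav_path)          # e.g. a temporary .wav path
```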
def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=5):
-    """
-    Analyze emotions in an audio file by processing it in chunks.
-    Returns a visualization, processed audio path, summary, and detailed results.
-    """
    if not load_emotion_model():
-        return None, "Failed to load emotion classification model.
+        return None, None, "Failed to load emotion classification model.", None

-    if
-    audio_path = convert_audio_to_wav(audio_file)
-    if not audio_path:
-        return None, "Failed to process audio file. Unsupported format or corrupted file."
+    audio_path = audio_file if audio_file.endswith('.wav') else convert_audio_to_wav(audio_file)
+    if not audio_path:
+        return None, None, "Failed to process audio file. Unsupported format or corrupted file.", None

    try:
-        # Load the audio using librosa
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
-        duration = len(audio_data) / sample_rate
-        # Process in chunks for long files
        chunk_samples = int(chunk_duration * sample_rate)
        num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))

-        all_emotions = []
+        all_emotions, time_points = [], []

        for i in range(num_chunks):
            progress((i + 1) / num_chunks, "Analyzing audio emotions...")
            start_idx = i * chunk_samples
            end_idx = min(start_idx + chunk_samples, len(audio_data))
            chunk = audio_data[start_idx:end_idx]

-            # Skip too-short chunks (<0.5 seconds)
            if len(chunk) < 0.5 * sample_rate:
                continue

-            chunk_path = temp_chunk.name
+            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
+                chunk_path = tmp.name
            scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))

-            # Get emotion classification results on this chunk
            results = audio_emotion_classifier(chunk_path)
-            os.unlink(chunk_path)
+            os.unlink(chunk_path)
            all_emotions.append(results)
            time_points.append((start_idx / sample_rate, end_idx / sample_rate))

-            img_path = temp_img.name
+        fig, detailed_results = generate_emotion_timeline(all_emotions, time_points, len(audio_data) / sample_rate)
+        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
+            img_path = tmp.name
        fig.savefig(img_path, dpi=100, bbox_inches='tight')
        plt.close(fig)

        summary = generate_emotion_summary(all_emotions, time_points)
        return img_path, audio_path, summary, detailed_results

    except Exception as e:
-        print(f"Error analyzing audio: {e}")
        import traceback
        traceback.print_exc()
        return None, None, f"Error analyzing audio: {str(e)}", None

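A worked example of the chunking arithmetic above: at sr=16000 with chunk_duration=5, a 12-second clip has 192,000 samples, chunk_samples is 80,000, and num_chunks = ceil(192000 / 80000) = 3; the trailing 2-second chunk clears the 0.5 s minimum and is classified.

```python
import numpy as np

sample_rate, chunk_duration = 16000, 5
n_samples = 12 * sample_rate                        # 192000 samples (12 s clip)
chunk_samples = chunk_duration * sample_rate        # 80000
num_chunks = int(np.ceil(n_samples / chunk_samples))
print(num_chunks)                                   # 3 -> chunks of 5 s, 5 s, 2 s
```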
+# ── Visualisation & summary ────────────────────────────────────────────────────
+
def generate_emotion_timeline(all_emotions, time_points, duration):
-    """
-    Generate a bar chart visualization of emotion percentages with tone analysis.
-    Returns the matplotlib figure and a list of detailed results.
-    """
-    # All possible emotion labels from our dictionary
    emotion_labels = list(EMOTION_DESCRIPTIONS.keys())
-    # We'll accumulate counts based on our canonical labels (e.g., "happy", "angry").
    emotion_counts = {}

    for emotions in all_emotions:
        if not emotions:
            continue
-        # But typically, it should be one of "happy", "angry", "disgust", "fear", "sad", "neutral", "surprise".
-        # Count how many times each canonical label appears
-        emotion_counts[canonical_label] = emotion_counts.get(canonical_label, 0) + 1
+        top = max(emotions, key=lambda x: x['score'])
+        raw = top['label'].lower().strip()
+        canonical = MODEL_TO_EMOTION_MAP.get(raw, raw)
+        emotion_counts[canonical] = emotion_counts.get(canonical, 0) + 1

-    total_chunks = len(all_emotions)
-    emotion_percentages = {
-        e: (count / total_chunks * 100) for e, count in emotion_counts.items()
-    }
-    # Create empty percentages for emotions that didn't appear
-    for label in emotion_labels:
-        if label not in emotion_percentages:
-            emotion_percentages[label] = 0.0
+    total = len(all_emotions)
+    emotion_percentages = {e: (emotion_counts.get(e, 0) / total * 100) for e in emotion_labels}

-    # Sort emotions by percentage
    sorted_emotions = sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True)

+    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), height_ratios=[3, 1],
+                                   gridspec_kw={'hspace': 0.3})

-    emotions = [item[0].capitalize() for item in sorted_emotions]
+    emotions_labels_disp = [item[0].capitalize() for item in sorted_emotions]
    percentages = [item[1] for item in sorted_emotions]

-    # Custom colors for emotions (enough for 7 emotions)
    colors = ['red', 'brown', 'purple', 'green', 'gray', 'blue', 'orange']
-    # fallback if there's more emotions than colors
-    bar_colors = colors + ['#666666'] * (len(emotions) - len(colors))
+    bar_colors = (colors + ['#666666'] * max(0, len(emotions_labels_disp) - len(colors)))[:len(emotions_labels_disp)]

-    # Plot emotion bars
-    bars = ax1.bar(emotions, percentages, color=bar_colors)
+    bars = ax1.bar(emotions_labels_disp, percentages, color=bar_colors)

-    # Add percentage labels on top of each bar
    for bar in bars:
-        ax1.annotate(f'{
-            xy=(bar.get_x() + bar.get_width() / 2,
-            xytext=(0, 3),
-            textcoords="offset points",
+        h = bar.get_height()
+        ax1.annotate(f'{h:.1f}%',
+                     xy=(bar.get_x() + bar.get_width() / 2, h),
+                     xytext=(0, 3), textcoords="offset points",
                     ha='center', va='bottom')
-    ax1.set_ylim(0, 100)  # Fixed 100% scale
+    ax1.set_ylim(0, 100)
    ax1.set_ylabel('Percentage (%)')
    ax1.set_title('Emotion Distribution')
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

-    # Calculate tone percentages based on the canonical labels we found
    tone_percentages = {"positive": 0, "neutral": 0, "negative": 0}
+    for emotion, pct in emotion_percentages.items():
+        for tone, elist in TONE_MAPPING.items():
+            if emotion in elist:
+                tone_percentages[tone] += pct

-    # Plot tone bars
-    tones = list(tone_percentages.keys())
-    tone_values = list(tone_percentages.values())
    tone_colors = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}
-    tone_bars = ax2.bar(
+    tone_bars = ax2.bar(list(tone_percentages.keys()),
+                        list(tone_percentages.values()),
+                        color=[tone_colors[t] for t in tone_percentages])
    for bar in tone_bars:
-        if
-            ax2.annotate(f'{
-                xy=(bar.get_x() + bar.get_width() / 2,
-                xytext=(0, 3),
-                textcoords="offset points",
+        h = bar.get_height()
+        if h > 0:
+            ax2.annotate(f'{h:.1f}%',
+                         xy=(bar.get_x() + bar.get_width() / 2, h),
+                         xytext=(0, 3), textcoords="offset points",
                         ha='center', va='bottom')
    ax2.set_ylim(0, 100)
    ax2.set_ylabel('Percentage (%)')
    ax2.set_title('Tone Analysis')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()

-    # Generate a more detailed time-segmented result
    detailed_results = []
-    for
+    for emotions, (start_time, end_time) in zip(all_emotions, time_points):
        if not emotions:
            continue
-        # Determine the tone for this emotion
-        # (based on canonical_label rather than the raw model label)
-        tone = next((t for t, e_list in TONE_MAPPING.items() if canonical_label in e_list), "unknown")
+        top = max(emotions, key=lambda x: x['score'])
+        raw = top['label'].lower().strip()
+        canonical = MODEL_TO_EMOTION_MAP.get(raw, raw)
+        tone = next((t for t, el in TONE_MAPPING.items() if canonical in el), "unknown")
        detailed_results.append({
-            'Time Range':
-            'Emotion':
-            'Tone':
-            'Confidence':
-            'Description':
+            'Time Range': f"{start_time:.1f}s - {end_time:.1f}s",
+            'Emotion': canonical,
+            'Tone': tone.capitalize(),
+            'Confidence': f"{top['score']:.2f}",
+            'Description': EMOTION_DESCRIPTIONS.get(canonical, ""),
        })

    return fig, detailed_results

def generate_emotion_summary(all_emotions, time_points):
-    """
-    Create a summary text from the emotion analysis.
-    Counts occurrences and computes percentages of the dominant emotion.
-    """
    if not all_emotions:
        return "No emotional content detected."

    emotion_counts = {}
+    total = len(all_emotions)
    for emotions in all_emotions:
        if not emotions:
            continue
-        emotion_counts[canonical_label] = emotion_counts.get(canonical_label, 0) + 1
+        top = max(emotions, key=lambda x: x['score'])
+        raw = top['label'].lower().strip()
+        canonical = MODEL_TO_EMOTION_MAP.get(raw, raw)
+        emotion_counts[canonical] = emotion_counts.get(canonical, 0) + 1

-    emotion_percentages = {
-        e: (count / total_chunks * 100)
-        for e, count in emotion_counts.items()
-    }
-    if not emotion_percentages:
+    if not emotion_counts:
        return "No emotional content detected."

-    summary =
-    summary += f"**Dominant emotion:** {
-    summary += f"**Description:** {EMOTION_DESCRIPTIONS.get(
+    emotion_percentages = {e: (c / total * 100) for e, c in emotion_counts.items()}
+    dominant = max(emotion_percentages, key=lambda x: emotion_percentages[x])
+
+    summary = "### Voice Emotion Analysis Summary\n\n"
+    summary += f"**Dominant emotion:** {dominant.capitalize()} ({emotion_percentages[dominant]:.1f}%)\n\n"
+    summary += f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant, '')}\n\n"
    summary += "**Emotion distribution:**\n"
-    summary += "\n**Interpretation:** The voice predominantly expresses {0} emotion".format(dominant_emotion)
+    for emotion, pct in sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True):
+        summary += f"- {emotion.capitalize()}: {pct:.1f}%\n"
+    summary += f"\n**Interpretation:** The voice predominantly expresses {dominant} emotion"
    return summary

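A small worked example of the tally: three chunks classified happy, happy, neutral yield happy 66.7% and neutral 33.3%, so "happy" is reported as dominant.

```python
emotion_counts, total = {"happy": 2, "neutral": 1}, 3
emotion_percentages = {e: c / total * 100 for e, c in emotion_counts.items()}
dominant = max(emotion_percentages, key=emotion_percentages.get)
print(dominant, f"{emotion_percentages[dominant]:.1f}%")   # happy 66.7%
```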
-    """Save recorded audio and analyze emotions."""
-    try:
-        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
-            audio_path = temp_file.name
-        with open(audio_path, 'wb') as f:
-            f.write(audio)
-        return audio_path
-    except Exception as e:
-        print(f"Error saving recorded audio: {e}")
-        return None

+# ── Gradio handlers ─────────────────────────────────────────────────────────────

def process_audio(audio_file, progress=gr.Progress()):
-    """Process the audio file and analyze emotions."""
    if audio_file is None:
        return None, None, "No audio file provided.", None
    img_path, processed_audio, summary, results = analyze_audio_emotions(audio_file, progress)
    if img_path is None:
-        return None, None, "Failed to analyze audio emotions.", None
+        return None, None, summary or "Failed to analyze audio emotions.", None
    return img_path, processed_audio, summary, results

+# ── Gradio UI ───────────────────────────────────────────────────────────────────
+
with gr.Blocks(title="Voice Emotion Analysis System") as demo:
|
| 318 |
gr.Markdown("""
|
| 319 |
# ποΈ Voice Emotion Analysis System
|
| 320 |
+
|
| 321 |
This app analyzes the emotional content of voice recordings.
|
| 322 |
+
|
| 323 |
It detects emotions including:
|
| 324 |
+
|
| 325 |
+
* π‘ **Anger** π€’ **Disgust** π¨ **Fear** π **Happiness**
|
| 326 |
+
* π **Neutral** π’ **Sadness** π² **Surprise**
|
| 327 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
And provides a detailed analysis and timeline.
|
| 329 |
""")
|
| 330 |
+
|
| 331 |
with gr.Tabs():
|
| 332 |
with gr.TabItem("Upload Audio"):
|
| 333 |
with gr.Row():
|
| 334 |
with gr.Column(scale=1):
|
| 335 |
+
audio_input = gr.Audio(label="Upload Audio File", type="filepath", sources=["upload"])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
process_btn = gr.Button("Analyze Voice Emotions")
|
| 337 |
with gr.Column(scale=2):
|
| 338 |
+
emotion_timeline = gr.Image(label="Emotion Timeline")
|
| 339 |
with gr.Row():
|
| 340 |
+
audio_playback = gr.Audio(label="Processed Audio")
|
| 341 |
emotion_summary = gr.Markdown(label="Emotion Summary")
|
| 342 |
with gr.Row():
|
| 343 |
emotion_results = gr.DataFrame(
|
| 344 |
headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
|
| 345 |
+
label="Detailed Emotion Analysis",
|
| 346 |
)
|
| 347 |
process_btn.click(
|
| 348 |
fn=process_audio,
|
| 349 |
inputs=[audio_input],
|
| 350 |
+
outputs=[emotion_timeline, audio_playback, emotion_summary, emotion_results],
|
| 351 |
)
|
| 352 |
+
|
| 353 |
with gr.TabItem("Record Voice"):
|
| 354 |
with gr.Row():
|
| 355 |
with gr.Column(scale=1):
|
| 356 |
+
record_input = gr.Audio(label="Record Your Voice", sources=["microphone"], type="filepath")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
analyze_btn = gr.Button("Analyze Recording")
|
| 358 |
with gr.Column(scale=2):
|
| 359 |
+
rec_emotion_timeline = gr.Image(label="Emotion Timeline")
|
| 360 |
with gr.Row():
|
| 361 |
+
rec_audio_playback = gr.Audio(label="Processed Audio")
|
| 362 |
rec_emotion_summary = gr.Markdown(label="Emotion Summary")
|
| 363 |
with gr.Row():
|
| 364 |
rec_emotion_results = gr.DataFrame(
|
| 365 |
headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
|
| 366 |
+
label="Detailed Emotion Analysis",
|
| 367 |
)
|
| 368 |
analyze_btn.click(
|
| 369 |
fn=process_audio,
|
| 370 |
inputs=[record_input],
|
| 371 |
+
outputs=[rec_emotion_timeline, rec_audio_playback, rec_emotion_summary, rec_emotion_results],
|
| 372 |
)
|
| 373 |
+
|
| 374 |
gr.Markdown("""
|
| 375 |
### How to Use
|
| 376 |
+
|
| 377 |
1. **Upload Audio Tab:** Upload an audio file and click "Analyze Voice Emotions".
|
| 378 |
2. **Record Voice Tab:** Record your voice and click "Analyze Recording".
|
| 379 |
+
|
| 380 |
**Tips:**
|
| 381 |
- Use clear recordings with minimal background noise.
|
| 382 |
- Longer recordings yield more consistent results.
|
| 383 |
""")
|
| 384 |
|
| 385 |
+
|
| 386 |
def initialize_app():
|
| 387 |
print("Initializing voice emotion analysis app...")
|
| 388 |
if load_emotion_model():
|
|
|
|
| 390 |
else:
|
| 391 |
print("Failed to load emotion model.")
|
| 392 |
|
| 393 |
+
|
| 394 |
if __name__ == "__main__":
|
| 395 |
initialize_app()
|
| 396 |
+
demo.launch()
|
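Run locally with `python app.py`; gradio serves on http://127.0.0.1:7860 by default. When the app must be reachable from outside a container (as on Spaces), launch can bind all interfaces; both kwargs below are standard gradio options:

```python
demo.launch(server_name="0.0.0.0", server_port=7860)
```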