# voice_emotion_classification.py
"""Gradio app that classifies the emotional content of voice recordings.

The script is self-bootstrapping:
  Phase 1 installs any missing packages (CPU-only torch, pinned transformers),
  Phase 2 patches a transformers/httpx incompatibility,
  Phase 3 performs the real imports, then the Gradio UI is built and launched.
"""
import os
import subprocess
import sys
import time  # noqa: F401
import tempfile
import warnings

warnings.filterwarnings("ignore")


def run_pip(*args):
    """Run a pip install command and raise on failure."""
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "--no-cache-dir"] + list(args)
    )


# ── Phase 1: Install packages ─────────────────────────────────────────────────
# FIX 1: Use importlib-based checks instead of deprecated pkg_resources.
# FIX 2: torch → CPU-only wheel (~190 MB vs ~900 MB CUDA) to avoid disk quota.
# FIX 3: transformers pinned to 4.46.3 (last v4); v5 dropped audio-classification
#        pipeline support for many models AND is much larger on disk.
# FIX 4: torchaudio pulled without the CUDA index so it stays CPU-only too.
print("=== Installing gradio (if needed) ===")
try:
    import gradio  # noqa: F401
    print("gradio already installed.")
except ImportError:
    run_pip("gradio")

print("=== Installing torch CPU-only (if needed) ===")
try:
    import torch  # noqa: F401
    print("torch already installed.")
except ImportError:
    run_pip("torch", "torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu")

print("=== Installing torchaudio (if needed) ===")
try:
    import torchaudio  # noqa: F401
    print("torchaudio already installed.")
except ImportError:
    run_pip("torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu")

print("=== Installing transformers 4.46.3 (if needed) ===")
try:
    import transformers as _tf
    if _tf.__version__ != "4.46.3":
        raise ImportError("wrong version")
    print("transformers 4.46.3 already installed.")
except (ImportError, AttributeError):
    run_pip("transformers==4.46.3")

print("=== Installing remaining packages (if needed) ===")
for pkg in ["librosa", "scipy", "matplotlib", "pydub"]:
    try:
        __import__(pkg)
        print(f"{pkg} already installed.")
    except ImportError:
        run_pip(pkg)

# ── Phase 2: Patch transformers get_session → requests.Session ───────────────
# transformers 4.46.3 calls get_session().head(..., allow_redirects=, proxies=, ...)
# In this environment get_session() returns an httpx.Client (gradio depends on
# httpx), which rejects every requests-style kwarg.
# Fix: replace get_session in the already-imported module namespace so it always
# returns a plain requests.Session, which accepts all those kwargs natively.
import transformers.utils.hub as _t_hub  # noqa: E402
import requests as _requests  # noqa: E402

_t_hub.get_session = lambda: _requests.Session()
print("Patched transformers.utils.hub.get_session → requests.Session()")

# ── Phase 3: Safe imports ─────────────────────────────────────────────────────
import numpy as np  # noqa: E402
import gradio as gr  # noqa: E402
import torch  # noqa: E402, F811
import torchaudio  # noqa: E402, F401, F811
import librosa  # noqa: E402
import matplotlib  # noqa: E402
matplotlib.use('Agg')
import matplotlib.pyplot as plt  # noqa: E402
from pydub import AudioSegment  # noqa: E402
# FIX: `import scipy` alone does not load the scipy.io.wavfile submodule;
# scipy.io.wavfile.write() previously only worked because librosa happened to
# import it transitively. Import it explicitly.
import scipy.io.wavfile  # noqa: E402
import io  # noqa: E402, F401
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification  # noqa: E402, F401
from pathlib import Path  # noqa: E402, F401

# ── Emotion metadata ──────────────────────────────────────────────────────────
# Human-readable description for each canonical emotion label.
EMOTION_DESCRIPTIONS = {
    "angry": "Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.",
    "disgust": "Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.",
    "fear": "Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.",
    "happy": "Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.",
    "neutral": "Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.",
    "sad": "Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.",
    "surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.",
}

# Coarse tone buckets used for the second chart.
TONE_MAPPING = {
    "positive": ["happy", "surprise"],
    "neutral": ["neutral"],
    "negative": ["angry", "sad", "fear", "disgust"],
}

# The superb/hubert model emits abbreviated labels; map them to canonical names.
MODEL_TO_EMOTION_MAP = {
    "hap": "happy",
    "ang": "angry",
    "sad": "sad",
    "dis": "disgust",
    "fea": "fear",
    "neu": "neutral",
    "sur": "surprise",
}

# ── Model loading ─────────────────────────────────────────────────────────────
audio_emotion_classifier = None  # lazily-loaded HF audio-classification pipeline


def load_emotion_model():
    """Load the emotion classifier once; return True on success.

    Subsequent calls are no-ops returning True while the model stays cached
    in the module-global `audio_emotion_classifier`.
    """
    global audio_emotion_classifier
    if audio_emotion_classifier is None:
        try:
            print("Loading emotion classification model...")
            model_name = "superb/hubert-large-superb-er"
            audio_emotion_classifier = pipeline("audio-classification", model=model_name)
            print("Emotion classification model loaded successfully")
            return True
        except Exception as e:
            print(f"Error loading emotion model: {e}")
            return False
    return True


# ── Audio helpers ─────────────────────────────────────────────────────────────
def convert_audio_to_wav(audio_file):
    """Convert any pydub-readable audio file to a temp WAV; return its path.

    Returns None if the file cannot be decoded. The caller owns the temp file.
    """
    try:
        audio = AudioSegment.from_file(audio_file)
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
            audio.export(tmp.name, format="wav")
            return tmp.name
    except Exception as e:
        print(f"Error converting audio: {e}")
        return None


def _count_top_emotions(all_emotions):
    """Count how many chunks had each canonical emotion as their top label.

    `all_emotions` is a list of per-chunk classifier outputs (lists of
    {'label', 'score'} dicts). Empty chunk results are skipped.
    """
    counts = {}
    for emotions in all_emotions:
        if not emotions:
            continue
        top = max(emotions, key=lambda x: x['score'])
        raw = top['label'].lower().strip()
        canonical = MODEL_TO_EMOTION_MAP.get(raw, raw)
        counts[canonical] = counts.get(canonical, 0) + 1
    return counts


def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=5):
    """Classify emotions over `chunk_duration`-second windows of an audio file.

    Returns (timeline_image_path, processed_audio_path, summary_markdown,
    detailed_results) — or (None, None, error_message, None) on failure.
    """
    if not load_emotion_model():
        return None, None, "Failed to load emotion classification model.", None

    audio_path = audio_file if audio_file.endswith('.wav') else convert_audio_to_wav(audio_file)
    if not audio_path:
        return None, None, "Failed to process audio file. Unsupported format or corrupted file.", None

    try:
        # The classifier expects 16 kHz mono input.
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
        chunk_samples = int(chunk_duration * sample_rate)
        num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))

        all_emotions, time_points = [], []
        for i in range(num_chunks):
            progress((i + 1) / num_chunks, "Analyzing audio emotions...")
            start_idx = i * chunk_samples
            end_idx = min(start_idx + chunk_samples, len(audio_data))
            chunk = audio_data[start_idx:end_idx]
            # Chunks shorter than half a second carry too little signal.
            if len(chunk) < 0.5 * sample_rate:
                continue

            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
                chunk_path = tmp.name
            scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))
            # FIX: ensure the per-chunk temp file is removed even if the
            # classifier raises (previously it leaked on error).
            try:
                results = audio_emotion_classifier(chunk_path)
            finally:
                os.unlink(chunk_path)

            all_emotions.append(results)
            time_points.append((start_idx / sample_rate, end_idx / sample_rate))

        # FIX: previously an all-skipped recording (< 0.5 s of audio) crashed
        # with ZeroDivisionError inside generate_emotion_timeline.
        if not all_emotions:
            return None, None, "Audio too short to analyze. Please provide at least 0.5 seconds of audio.", None

        fig, detailed_results = generate_emotion_timeline(
            all_emotions, time_points, len(audio_data) / sample_rate)
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
            img_path = tmp.name
        fig.savefig(img_path, dpi=100, bbox_inches='tight')
        plt.close(fig)

        summary = generate_emotion_summary(all_emotions, time_points)
        return img_path, audio_path, summary, detailed_results
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, f"Error analyzing audio: {str(e)}", None


# ── Visualisation & summary ───────────────────────────────────────────────────
def generate_emotion_timeline(all_emotions, time_points, duration):
    """Build the distribution/tone charts and the per-chunk results table.

    Parameters:
        all_emotions: per-chunk classifier outputs (lists of {'label','score'}).
        time_points:  (start_s, end_s) tuples parallel to all_emotions.
        duration:     total audio duration in seconds (kept for interface
                      stability; not used in the current charts).

    Returns (matplotlib Figure, list of per-chunk result dicts).
    """
    emotion_labels = list(EMOTION_DESCRIPTIONS.keys())
    emotion_counts = _count_top_emotions(all_emotions)

    # FIX: guard the denominator so an empty input cannot raise
    # ZeroDivisionError (all percentages then come out as 0).
    total = len(all_emotions) or 1
    emotion_percentages = {e: (emotion_counts.get(e, 0) / total * 100) for e in emotion_labels}
    sorted_emotions = sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True)

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), height_ratios=[3, 1],
                                   gridspec_kw={'hspace': 0.3})

    # Top chart: per-emotion percentage bars, sorted descending.
    emotions_labels_disp = [item[0].capitalize() for item in sorted_emotions]
    percentages = [item[1] for item in sorted_emotions]
    colors = ['red', 'brown', 'purple', 'green', 'gray', 'blue', 'orange']
    # Pad with a fallback colour in case the model emits an unknown label.
    bar_colors = (colors + ['#666666'] * max(0, len(emotions_labels_disp) - len(colors)))[:len(emotions_labels_disp)]
    bars = ax1.bar(emotions_labels_disp, percentages, color=bar_colors)
    for bar in bars:
        h = bar.get_height()
        ax1.annotate(f'{h:.1f}%', xy=(bar.get_x() + bar.get_width() / 2, h),
                     xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')
    ax1.set_ylim(0, 100)
    ax1.set_ylabel('Percentage (%)')
    ax1.set_title('Emotion Distribution')
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Bottom chart: positive/neutral/negative tone aggregation.
    tone_percentages = {"positive": 0, "neutral": 0, "negative": 0}
    for emotion, pct in emotion_percentages.items():
        for tone, elist in TONE_MAPPING.items():
            if emotion in elist:
                tone_percentages[tone] += pct
    tone_colors = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}
    tone_bars = ax2.bar(list(tone_percentages.keys()), list(tone_percentages.values()),
                        color=[tone_colors[t] for t in tone_percentages])
    for bar in tone_bars:
        h = bar.get_height()
        if h > 0:
            ax2.annotate(f'{h:.1f}%', xy=(bar.get_x() + bar.get_width() / 2, h),
                         xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')
    ax2.set_ylim(0, 100)
    ax2.set_ylabel('Percentage (%)')
    ax2.set_title('Tone Analysis')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()

    # Per-chunk table rows for the DataFrame component.
    detailed_results = []
    for emotions, (start_time, end_time) in zip(all_emotions, time_points):
        if not emotions:
            continue
        top = max(emotions, key=lambda x: x['score'])
        raw = top['label'].lower().strip()
        canonical = MODEL_TO_EMOTION_MAP.get(raw, raw)
        tone = next((t for t, el in TONE_MAPPING.items() if canonical in el), "unknown")
        detailed_results.append({
            'Time Range': f"{start_time:.1f}s - {end_time:.1f}s",
            'Emotion': canonical,
            'Tone': tone.capitalize(),
            'Confidence': f"{top['score']:.2f}",
            'Description': EMOTION_DESCRIPTIONS.get(canonical, ""),
        })
    return fig, detailed_results


def generate_emotion_summary(all_emotions, time_points):
    """Return a markdown summary of dominant emotion and distribution."""
    if not all_emotions:
        return "No emotional content detected."
    total = len(all_emotions)
    emotion_counts = _count_top_emotions(all_emotions)
    if not emotion_counts:
        return "No emotional content detected."

    emotion_percentages = {e: (c / total * 100) for e, c in emotion_counts.items()}
    dominant = max(emotion_percentages, key=lambda x: emotion_percentages[x])

    summary = "### Voice Emotion Analysis Summary\n\n"
    summary += f"**Dominant emotion:** {dominant.capitalize()} ({emotion_percentages[dominant]:.1f}%)\n\n"
    summary += f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant, '')}\n\n"
    summary += "**Emotion distribution:**\n"
    for emotion, pct in sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True):
        summary += f"- {emotion.capitalize()}: {pct:.1f}%\n"
    summary += f"\n**Interpretation:** The voice predominantly expresses {dominant} emotion"
    return summary


# ── Gradio handlers ───────────────────────────────────────────────────────────
def process_audio(audio_file, progress=gr.Progress()):
    """Gradio click handler shared by the upload and record tabs."""
    if audio_file is None:
        return None, None, "No audio file provided.", None
    img_path, processed_audio, summary, results = analyze_audio_emotions(audio_file, progress)
    if img_path is None:
        return None, None, summary or "Failed to analyze audio emotions.", None
    return img_path, processed_audio, summary, results


# ── Gradio UI ─────────────────────────────────────────────────────────────────
with gr.Blocks(title="Voice Emotion Analysis System") as demo:
    gr.Markdown("""
    # 🎙️ Voice Emotion Analysis System

    This app analyzes the emotional content of voice recordings. It detects emotions including:

    * 😡 **Anger**   🤢 **Disgust**   😨 **Fear**   😊 **Happiness**
    * 😐 **Neutral**   😢 **Sadness**   😲 **Surprise**

    And provides a detailed analysis and timeline.
    """)

    with gr.Tabs():
        with gr.TabItem("Upload Audio"):
            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(label="Upload Audio File", type="filepath", sources=["upload"])
                    process_btn = gr.Button("Analyze Voice Emotions")
                with gr.Column(scale=2):
                    emotion_timeline = gr.Image(label="Emotion Timeline")
            with gr.Row():
                audio_playback = gr.Audio(label="Processed Audio")
                emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis",
                )
            process_btn.click(
                fn=process_audio,
                inputs=[audio_input],
                outputs=[emotion_timeline, audio_playback, emotion_summary, emotion_results],
            )

        with gr.TabItem("Record Voice"):
            with gr.Row():
                with gr.Column(scale=1):
                    record_input = gr.Audio(label="Record Your Voice", sources=["microphone"], type="filepath")
                    analyze_btn = gr.Button("Analyze Recording")
                with gr.Column(scale=2):
                    rec_emotion_timeline = gr.Image(label="Emotion Timeline")
            with gr.Row():
                rec_audio_playback = gr.Audio(label="Processed Audio")
                rec_emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                rec_emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis",
                )
            analyze_btn.click(
                fn=process_audio,
                inputs=[record_input],
                outputs=[rec_emotion_timeline, rec_audio_playback, rec_emotion_summary, rec_emotion_results],
            )

    gr.Markdown("""
    ### How to Use
    1. **Upload Audio Tab:** Upload an audio file and click "Analyze Voice Emotions".
    2. **Record Voice Tab:** Record your voice and click "Analyze Recording".

    **Tips:**
    - Use clear recordings with minimal background noise.
    - Longer recordings yield more consistent results.
    """)


def initialize_app():
    """Warm up the emotion model before the UI starts serving requests."""
    print("Initializing voice emotion analysis app...")
    if load_emotion_model():
        print("Emotion model loaded successfully!")
    else:
        print("Failed to load emotion model.")


if __name__ == "__main__":
    initialize_app()
    demo.launch()