# "Spaces: Sleeping" — Hugging Face Spaces status banner captured when this
# file was scraped from the Space's web page; not part of the program.
# Commented out so the module parses.
| # voice_emotion_classification.py | |
| import os | |
| import subprocess | |
| import sys | |
| import time | |
| import tempfile | |
| import warnings | |
| warnings.filterwarnings("ignore") | |
def run_pip(*args):
    """Run `python -m pip install --no-cache-dir <args>` in this interpreter.

    Raises subprocess.CalledProcessError if pip exits non-zero, so a failed
    install aborts startup instead of continuing with a missing package.
    """
    command = [sys.executable, "-m", "pip", "install", "--no-cache-dir", *args]
    subprocess.check_call(command)
# -- Phase 1: Install packages -------------------------------------------------
# FIX 1: Use importlib-based checks instead of deprecated pkg_resources.
# FIX 2: torch -> CPU-only wheel (~190 MB vs ~900 MB CUDA) to avoid disk quota.
# FIX 3: transformers pinned to 4.46.3 (last v4); v5 dropped audio-classification
#        pipeline support for many models AND is much larger on disk.
# FIX 4: torchaudio pulled without the CUDA index so it stays CPU-only too.
# Every check is the same EAFP shape: attempt the import, shell out to pip only
# on ImportError.  The "already installed" message lives in the `else` branch so
# it is printed only on a successful import.
print("=== Installing gradio (if needed) ===")
try:
    import gradio  # noqa: F401
except ImportError:
    run_pip("gradio")
else:
    print("gradio already installed.")

print("=== Installing torch CPU-only (if needed) ===")
try:
    import torch  # noqa: F401
except ImportError:
    # Pull torchaudio in the same resolve so both wheels come from the CPU index.
    run_pip("torch", "torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu")
else:
    print("torch already installed.")

print("=== Installing torchaudio (if needed) ===")
try:
    import torchaudio  # noqa: F401
except ImportError:
    run_pip("torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu")
else:
    print("torchaudio already installed.")

print("=== Installing transformers 4.46.3 (if needed) ===")
try:
    import transformers as _tf
    # Any other version (or a module with no __version__) counts as missing.
    if getattr(_tf, "__version__", None) != "4.46.3":
        raise ImportError("wrong version")
except ImportError:
    run_pip("transformers==4.46.3")
else:
    print("transformers 4.46.3 already installed.")

print("=== Installing remaining packages (if needed) ===")
for pkg in ("librosa", "scipy", "matplotlib", "pydub"):
    try:
        __import__(pkg)
    except ImportError:
        run_pip(pkg)
    else:
        print(f"{pkg} already installed.")
# -- Phase 2: Patch transformers get_session -> requests.Session ---------------
# transformers 4.46.3 calls get_session().head(..., allow_redirects=, proxies=, ...)
# In this environment get_session() returns an httpx.Client (gradio depends on
# httpx), which rejects every requests-style kwarg.
# Fix: replace get_session in the already-imported module namespace so it always
# returns a plain requests.Session, which accepts all those kwargs natively.
import transformers.utils.hub as _t_hub  # noqa: E402
import requests as _requests  # noqa: E402


def _requests_get_session():
    """Drop-in replacement for transformers.utils.hub.get_session().

    Returns a fresh requests.Session; unlike the httpx.Client the original
    may return here, it natively accepts allow_redirects=/proxies=/etc.
    """
    # FIX: named def instead of assigning a lambda (PEP 8 E731) so tracebacks
    # through the patched function carry a readable name.
    return _requests.Session()


_t_hub.get_session = _requests_get_session
# FIX: the arrow in this message was mojibake ("β") in the original source.
print("Patched transformers.utils.hub.get_session -> requests.Session()")
| # ββ Phase 3: Safe imports βββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| import numpy as np | |
| import gradio as gr | |
| import torch | |
| import torchaudio | |
| import librosa | |
| import matplotlib | |
| matplotlib.use('Agg') | |
| import matplotlib.pyplot as plt | |
| from pydub import AudioSegment | |
| import scipy | |
| import io | |
| from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification | |
| from pathlib import Path | |
# -- Emotion metadata ----------------------------------------------------------

# Canonical emotion name -> human-readable description; shown in the summary
# markdown and in the "Description" column of the detailed results table.
EMOTION_DESCRIPTIONS = {
    "angry": "Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.",
    "disgust": "Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.",
    "fear": "Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.",
    "happy": "Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.",
    "neutral": "Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.",
    "sad": "Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.",
    "surprise":"Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.",
}

# Coarse tone buckets used by the "Tone Analysis" chart; each canonical
# emotion above belongs to exactly one bucket.
TONE_MAPPING = {
    "positive": ["happy", "surprise"],
    "neutral": ["neutral"],
    "negative": ["angry", "sad", "fear", "disgust"],
}

# Abbreviated labels emitted by the classifier -> canonical names above.
# NOTE(review): presumably the label set of superb/hubert-large-superb-er —
# confirm against the model card.  Lookups elsewhere fall back to the raw
# label for anything unmapped.
MODEL_TO_EMOTION_MAP = {
    "hap": "happy",
    "ang": "angry",
    "sad": "sad",
    "dis": "disgust",
    "fea": "fear",
    "neu": "neutral",
    "sur": "surprise",
}
# -- Model loading -------------------------------------------------------------
# Module-level singleton: the pipeline is loaded once and reused by every
# request.
audio_emotion_classifier = None


def load_emotion_model():
    """Lazily create the shared audio-classification pipeline.

    Returns True when the classifier is available (freshly loaded or already
    cached in the module-level `audio_emotion_classifier`), False when
    loading failed.
    """
    global audio_emotion_classifier
    if audio_emotion_classifier is not None:
        return True
    try:
        print("Loading emotion classification model...")
        audio_emotion_classifier = pipeline(
            "audio-classification", model="superb/hubert-large-superb-er"
        )
        print("Emotion classification model loaded successfully")
        return True
    except Exception as e:
        print(f"Error loading emotion model: {e}")
        return False
# -- Audio helpers -------------------------------------------------------------
def convert_audio_to_wav(audio_file):
    """Re-encode `audio_file` to a temporary .wav and return its path.

    Returns None when pydub cannot decode the file.  The temp file is created
    with delete=False so it survives the handle closing and can be reopened
    by path; the caller is responsible for its lifetime.
    """
    try:
        segment = AudioSegment.from_file(audio_file)
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as handle:
            wav_path = handle.name
            segment.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        print(f"Error converting audio: {e}")
        return None
def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=5):
    """Classify the emotion of `audio_file` in `chunk_duration`-second windows.

    Args:
        audio_file: path to the uploaded/recorded audio.
        progress: gradio progress tracker (gr.Progress default is the gradio
            idiom for progress injection).
        chunk_duration: window length in seconds.

    Returns:
        (timeline_image_path, playable_audio_path, markdown_summary,
        detailed_rows); on failure the paths/rows are None and the summary
        carries the error text.
    """
    if not load_emotion_model():
        return None, None, "Failed to load emotion classification model.", None
    audio_path = audio_file if audio_file.endswith('.wav') else convert_audio_to_wav(audio_file)
    if not audio_path:
        return None, None, "Failed to process audio file. Unsupported format or corrupted file.", None
    try:
        # Mono, 16 kHz — presumably the classifier's expected sample rate.
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
        chunk_samples = int(chunk_duration * sample_rate)
        num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))
        all_emotions, time_points = [], []
        for i in range(num_chunks):
            progress((i + 1) / num_chunks, "Analyzing audio emotions...")
            start_idx = i * chunk_samples
            end_idx = min(start_idx + chunk_samples, len(audio_data))
            chunk = audio_data[start_idx:end_idx]
            # Skip trailing fragments under 0.5 s — too little signal to classify.
            if len(chunk) < 0.5 * sample_rate:
                continue
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
                chunk_path = tmp.name
            # FIX: unlink in `finally` — previously a classifier exception
            # skipped os.unlink and leaked one temp wav per failed chunk.
            try:
                # float [-1, 1] -> int16 PCM for the wav container.
                scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))
                results = audio_emotion_classifier(chunk_path)
            finally:
                os.unlink(chunk_path)
            all_emotions.append(results)
            time_points.append((start_idx / sample_rate, end_idx / sample_rate))
        # FIX: audio shorter than 0.5 s yields no chunks; previously this fell
        # through to generate_emotion_timeline and crashed with a
        # ZeroDivisionError surfaced as "Error analyzing audio: division by zero".
        if not all_emotions:
            return None, None, "Audio is too short to analyze (at least 0.5 seconds of audio is required).", None
        fig, detailed_results = generate_emotion_timeline(all_emotions, time_points, len(audio_data) / sample_rate)
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
            img_path = tmp.name
        fig.savefig(img_path, dpi=100, bbox_inches='tight')
        plt.close(fig)  # release the figure; Agg backend accumulates otherwise
        summary = generate_emotion_summary(all_emotions, time_points)
        return img_path, audio_path, summary, detailed_results
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, f"Error analyzing audio: {str(e)}", None
# -- Visualisation & summary ---------------------------------------------------
def _top_canonical(emotions):
    """Return (canonical_name, top_prediction) for one chunk's predictions.

    Picks the highest-scoring entry and maps the model's abbreviated label
    (e.g. "hap") to its canonical name; unmapped labels pass through as-is.
    """
    top = max(emotions, key=lambda x: x['score'])
    raw = top['label'].lower().strip()
    return MODEL_TO_EMOTION_MAP.get(raw, raw), top


def generate_emotion_timeline(all_emotions, time_points, duration):
    """Build the two-panel summary figure and the per-chunk result rows.

    Args:
        all_emotions: one prediction list per analyzed chunk, as returned by
            the audio-classification pipeline.
        time_points: matching (start_s, end_s) tuple per chunk.
        duration: total audio length in seconds (currently unused; kept for
            interface compatibility with the caller).

    Returns:
        (matplotlib Figure, list of row dicts for the results DataFrame).
    """
    emotion_labels = list(EMOTION_DESCRIPTIONS.keys())

    # Count how often each canonical emotion wins a chunk.
    # (FIX: duplicated top-label extraction factored into _top_canonical.)
    emotion_counts = {}
    for emotions in all_emotions:
        if not emotions:
            continue
        canonical, _ = _top_canonical(emotions)
        emotion_counts[canonical] = emotion_counts.get(canonical, 0) + 1

    # FIX: `total` is a divisor below — an empty all_emotions list previously
    # raised ZeroDivisionError; clamp to 1 (all percentages then read 0.0%).
    total = len(all_emotions) or 1
    emotion_percentages = {e: (emotion_counts.get(e, 0) / total * 100) for e in emotion_labels}
    sorted_emotions = sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True)

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), height_ratios=[3, 1],
                                   gridspec_kw={'hspace': 0.3})

    # Top panel: per-emotion percentage bars with value annotations.
    emotions_labels_disp = [item[0].capitalize() for item in sorted_emotions]
    percentages = [item[1] for item in sorted_emotions]
    colors = ['red', 'brown', 'purple', 'green', 'gray', 'blue', 'orange']
    # Pad with gray so extra (unmapped) labels never overrun the palette.
    bar_colors = (colors + ['#666666'] * max(0, len(emotions_labels_disp) - len(colors)))[:len(emotions_labels_disp)]
    bars = ax1.bar(emotions_labels_disp, percentages, color=bar_colors)
    for bar in bars:
        h = bar.get_height()
        ax1.annotate(f'{h:.1f}%',
                     xy=(bar.get_x() + bar.get_width() / 2, h),
                     xytext=(0, 3), textcoords="offset points",
                     ha='center', va='bottom')
    ax1.set_ylim(0, 100)
    ax1.set_ylabel('Percentage (%)')
    ax1.set_title('Emotion Distribution')
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Bottom panel: emotion percentages rolled up into tone buckets.
    tone_percentages = {"positive": 0, "neutral": 0, "negative": 0}
    for emotion, pct in emotion_percentages.items():
        for tone, elist in TONE_MAPPING.items():
            if emotion in elist:
                tone_percentages[tone] += pct
    tone_colors = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}
    tone_bars = ax2.bar(list(tone_percentages.keys()),
                        list(tone_percentages.values()),
                        color=[tone_colors[t] for t in tone_percentages])
    for bar in tone_bars:
        h = bar.get_height()
        if h > 0:
            ax2.annotate(f'{h:.1f}%',
                         xy=(bar.get_x() + bar.get_width() / 2, h),
                         xytext=(0, 3), textcoords="offset points",
                         ha='center', va='bottom')
    ax2.set_ylim(0, 100)
    ax2.set_ylabel('Percentage (%)')
    ax2.set_title('Tone Analysis')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()

    # One DataFrame row per successfully classified chunk.
    detailed_results = []
    for emotions, (start_time, end_time) in zip(all_emotions, time_points):
        if not emotions:
            continue
        canonical, top = _top_canonical(emotions)
        tone = next((t for t, el in TONE_MAPPING.items() if canonical in el), "unknown")
        detailed_results.append({
            'Time Range': f"{start_time:.1f}s - {end_time:.1f}s",
            'Emotion': canonical,
            'Tone': tone.capitalize(),
            'Confidence': f"{top['score']:.2f}",
            'Description': EMOTION_DESCRIPTIONS.get(canonical, ""),
        })
    return fig, detailed_results
def generate_emotion_summary(all_emotions, time_points):
    """Render the markdown summary of the dominant emotion and distribution.

    `all_emotions` holds one prediction list per chunk; `time_points` is
    accepted for interface symmetry with generate_emotion_timeline but is not
    used here.
    """
    if not all_emotions:
        return "No emotional content detected."
    total = len(all_emotions)
    # Tally the winning canonical emotion of every non-empty chunk.
    counts = {}
    for chunk_predictions in all_emotions:
        if not chunk_predictions:
            continue
        best = max(chunk_predictions, key=lambda p: p['score'])
        label = best['label'].lower().strip()
        label = MODEL_TO_EMOTION_MAP.get(label, label)
        counts[label] = counts.get(label, 0) + 1
    if not counts:
        return "No emotional content detected."
    percentages = {name: cnt / total * 100 for name, cnt in counts.items()}
    dominant = max(percentages, key=percentages.get)
    # Assemble the markdown line-by-line and join once at the end.
    parts = [
        "### Voice Emotion Analysis Summary",
        "",
        f"**Dominant emotion:** {dominant.capitalize()} ({percentages[dominant]:.1f}%)",
        "",
        f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant, '')}",
        "",
        "**Emotion distribution:**",
    ]
    for name, pct in sorted(percentages.items(), key=lambda item: item[1], reverse=True):
        parts.append(f"- {name.capitalize()}: {pct:.1f}%")
    parts.append("")
    parts.append(f"**Interpretation:** The voice predominantly expresses {dominant} emotion")
    return "\n".join(parts)
# -- Gradio handlers -----------------------------------------------------------
def process_audio(audio_file, progress=gr.Progress()):
    """Click handler shared by both tabs: validate input, then delegate.

    Returns the same 4-tuple shape as analyze_audio_emotions so it can feed
    the (image, audio, markdown, dataframe) outputs directly.
    """
    if audio_file is None:
        return None, None, "No audio file provided.", None
    img_path, processed_audio, summary, results = analyze_audio_emotions(audio_file, progress)
    if img_path is not None:
        return img_path, processed_audio, summary, results
    # Analysis failed: surface its error text (or a generic fallback).
    return None, None, summary or "Failed to analyze audio emotions.", None
# -- Gradio UI -----------------------------------------------------------------
# Two tabs (upload / record) share the same handler; component creation order
# inside each `with` context determines the on-page layout.
# NOTE(review): the original indentation was lost in extraction; the nesting
# below is reconstructed (result rows placed at tab level, under the main
# row) — confirm against the deployed layout.
# NOTE(review): the emoji in the markdown strings below appear mojibake-
# garbled (encoding artifact of the scrape); they are runtime strings and are
# left byte-identical here — fix the encoding upstream.
with gr.Blocks(title="Voice Emotion Analysis System") as demo:
    gr.Markdown("""
# ποΈ Voice Emotion Analysis System
This app analyzes the emotional content of voice recordings.
It detects emotions including:
* π‘ **Anger** π€’ **Disgust** π¨ **Fear** π **Happiness**
* π **Neutral** π’ **Sadness** π² **Surprise**
And provides a detailed analysis and timeline.
""")
    with gr.Tabs():
        with gr.TabItem("Upload Audio"):
            with gr.Row():
                with gr.Column(scale=1):
                    # Left column: input + trigger button.
                    audio_input = gr.Audio(label="Upload Audio File", type="filepath", sources=["upload"])
                    process_btn = gr.Button("Analyze Voice Emotions")
                with gr.Column(scale=2):
                    # Right column: the rendered timeline/distribution figure.
                    emotion_timeline = gr.Image(label="Emotion Timeline")
            with gr.Row():
                audio_playback = gr.Audio(label="Processed Audio")
                emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis",
                )
            # Outputs match process_audio's (image, audio, markdown, rows) tuple.
            process_btn.click(
                fn=process_audio,
                inputs=[audio_input],
                outputs=[emotion_timeline, audio_playback, emotion_summary, emotion_results],
            )
        with gr.TabItem("Record Voice"):
            with gr.Row():
                with gr.Column(scale=1):
                    # Same layout as the upload tab, with microphone input.
                    record_input = gr.Audio(label="Record Your Voice", sources=["microphone"], type="filepath")
                    analyze_btn = gr.Button("Analyze Recording")
                with gr.Column(scale=2):
                    rec_emotion_timeline = gr.Image(label="Emotion Timeline")
            with gr.Row():
                rec_audio_playback = gr.Audio(label="Processed Audio")
                rec_emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                rec_emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis",
                )
            analyze_btn.click(
                fn=process_audio,
                inputs=[record_input],
                outputs=[rec_emotion_timeline, rec_audio_playback, rec_emotion_summary, rec_emotion_results],
            )
    gr.Markdown("""
### How to Use
1. **Upload Audio Tab:** Upload an audio file and click "Analyze Voice Emotions".
2. **Record Voice Tab:** Record your voice and click "Analyze Recording".
**Tips:**
- Use clear recordings with minimal background noise.
- Longer recordings yield more consistent results.
""")
def initialize_app():
    """Warm the model cache at startup so the first request doesn't pay the load."""
    print("Initializing voice emotion analysis app...")
    loaded = load_emotion_model()
    print("Emotion model loaded successfully!" if loaded else "Failed to load emotion model.")


if __name__ == "__main__":
    initialize_app()
    demo.launch()