# VoiceAnalysis / app.py — Hugging Face Space source ("Update app.py", commit 18c8b56, verified)
# voice_emotion_classification.py
import os
import subprocess
import sys
import time
import tempfile
import warnings
# Suppress all warnings process-wide to keep the console output readable.
warnings.filterwarnings("ignore")
def run_pip(*args):
    """Run ``pip install --no-cache-dir`` for *args* via the current interpreter.

    Raises subprocess.CalledProcessError if the install fails.
    """
    command = [sys.executable, "-m", "pip", "install", "--no-cache-dir", *args]
    subprocess.check_call(command)
# ── Phase 1: Install packages ─────────────────────────────────────────────────
# FIX 1: Use importlib-based checks instead of deprecated pkg_resources.
# FIX 2: torch → CPU-only wheel (~190 MB vs ~900 MB CUDA) to avoid disk quota.
# FIX 3: transformers pinned to 4.46.3 (last v4); v5 dropped audio-classification
#        pipeline support for many models AND is much larger on disk.
# FIX 4: torchaudio pulled without the CUDA index so it stays CPU-only too.
# NOTE: each package is probed with a plain import first, so an already-built
# environment skips pip entirely and starts faster.
print("=== Installing gradio (if needed) ===")
try:
    import gradio  # noqa: F401
    print("gradio already installed.")
except ImportError:
    run_pip("gradio")
print("=== Installing torch CPU-only (if needed) ===")
try:
    import torch  # noqa: F401
    print("torch already installed.")
except ImportError:
    # Installing torch also pulls torchaudio here so both come from the CPU index.
    run_pip("torch", "torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu")
print("=== Installing torchaudio (if needed) ===")
try:
    import torchaudio  # noqa: F401
    print("torchaudio already installed.")
except ImportError:
    run_pip("torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu")
print("=== Installing transformers 4.46.3 (if needed) ===")
try:
    import transformers as _tf
    # Re-install if any other version is present — the exact pin matters (FIX 3).
    if _tf.__version__ != "4.46.3":
        raise ImportError("wrong version")
    print("transformers 4.46.3 already installed.")
except (ImportError, AttributeError):
    run_pip("transformers==4.46.3")
print("=== Installing remaining packages (if needed) ===")
for pkg in ["librosa", "scipy", "matplotlib", "pydub"]:
    try:
        __import__(pkg)
        print(f"{pkg} already installed.")
    except ImportError:
        run_pip(pkg)
# ── Phase 2: Patch transformers get_session → requests.Session ────────────────
# transformers 4.46.3 calls get_session().head(..., allow_redirects=, proxies=, ...)
# In this environment get_session() returns an httpx.Client (gradio depends on
# httpx), which rejects every requests-style kwarg.
# Fix: replace get_session in the already-imported module namespace so it always
# returns a plain requests.Session, which accepts all those kwargs natively.
# FIX: the "→" in the log line below was mojibake ("β†’") — restored the arrow.
import transformers.utils.hub as _t_hub  # noqa: E402
import requests as _requests  # noqa: E402
_t_hub.get_session = lambda: _requests.Session()
print("Patched transformers.utils.hub.get_session → requests.Session()")
# ── Phase 3: Safe imports ─────────────────────────────────────────────────────
# Everything below is guaranteed importable after Phase 1.
import numpy as np
import gradio as gr
import torch
import torchaudio
import librosa
import matplotlib
matplotlib.use('Agg')  # headless backend: figures are saved to files, never shown
import matplotlib.pyplot as plt
from pydub import AudioSegment
# FIX: scipy.io.wavfile is used later; a bare `import scipy` only worked because
# librosa happens to import scipy.io transitively. Import the submodule explicitly.
import scipy
import scipy.io.wavfile
import io
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
from pathlib import Path
# ── Emotion metadata ──────────────────────────────────────────────────────────
# Canonical emotion name → human-readable description shown in the results table
# and the markdown summary. The key order also fixes the bar-chart label set.
EMOTION_DESCRIPTIONS = {
    "angry": "Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.",
    "disgust": "Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.",
    "fear": "Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.",
    "happy": "Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.",
    "neutral": "Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.",
    "sad": "Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.",
    "surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.",
}
# Coarse tone bucket → canonical emotions it aggregates (second chart in figure).
TONE_MAPPING = {
    "positive": ["happy", "surprise"],
    "neutral": ["neutral"],
    "negative": ["angry", "sad", "fear", "disgust"],
}
# Abbreviated labels emitted by the classifier model → canonical names above.
# Lookups elsewhere use .get(raw, raw), so unknown labels pass through unchanged.
MODEL_TO_EMOTION_MAP = {
    "hap": "happy",
    "ang": "angry",
    "sad": "sad",
    "dis": "disgust",
    "fea": "fear",
    "neu": "neutral",
    "sur": "surprise",
}
# ── Model loading ─────────────────────────────────────────────────────────────
audio_emotion_classifier = None  # lazily-created HF pipeline, shared by all requests
def load_emotion_model():
    """Lazily instantiate the audio-classification pipeline.

    Returns True when the classifier is available (freshly created or cached),
    False if instantiation raised.
    """
    global audio_emotion_classifier
    if audio_emotion_classifier is not None:
        return True
    try:
        print("Loading emotion classification model...")
        model_name = "superb/hubert-large-superb-er"
        audio_emotion_classifier = pipeline("audio-classification", model=model_name)
        print("Emotion classification model loaded successfully")
    except Exception as e:
        print(f"Error loading emotion model: {e}")
        return False
    return True
# ── Audio helpers ─────────────────────────────────────────────────────────────
def convert_audio_to_wav(audio_file):
    """Transcode *audio_file* to a temporary .wav file via pydub.

    Returns the temp-file path on success, or None when pydub cannot read or
    export the input. The caller owns (and must eventually delete) the file.
    """
    try:
        segment = AudioSegment.from_file(audio_file)
        handle = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        wav_path = handle.name
        handle.close()
        segment.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        print(f"Error converting audio: {e}")
        return None
def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=5):
    """Classify emotions over fixed-length windows of an audio file.

    Args:
        audio_file: Path to the input audio (any pydub-supported format).
        progress: Gradio progress reporter, updated once per chunk.
        chunk_duration: Window length in seconds fed to the classifier.

    Returns:
        (timeline_image_path, playable_wav_path, markdown_summary, detail_rows).
        On failure the image/rows are None and the summary carries the error.
    """
    if not load_emotion_model():
        return None, None, "Failed to load emotion classification model.", None
    # FIX: the extension check was case-sensitive, so ".WAV" files were
    # needlessly round-tripped through pydub; compare lowercased.
    audio_path = audio_file if audio_file.lower().endswith('.wav') else convert_audio_to_wav(audio_file)
    if not audio_path:
        return None, None, "Failed to process audio file. Unsupported format or corrupted file.", None
    try:
        # Resample to 16 kHz mono, the rate the classifier expects.
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
        chunk_samples = int(chunk_duration * sample_rate)
        num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))
        all_emotions, time_points = [], []
        for i in range(num_chunks):
            progress((i + 1) / num_chunks, "Analyzing audio emotions...")
            start_idx = i * chunk_samples
            end_idx = min(start_idx + chunk_samples, len(audio_data))
            chunk = audio_data[start_idx:end_idx]
            if len(chunk) < 0.5 * sample_rate:
                # Fragments under half a second are too short for a stable prediction.
                continue
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
                chunk_path = tmp.name
            # float32 [-1, 1] → int16 PCM for the temp wav the pipeline reads.
            scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))
            results = audio_emotion_classifier(chunk_path)
            os.unlink(chunk_path)
            all_emotions.append(results)
            time_points.append((start_idx / sample_rate, end_idx / sample_rate))
        # FIX: a recording shorter than 0.5 s produced zero chunks and previously
        # crashed generate_emotion_timeline with ZeroDivisionError.
        if not all_emotions:
            return None, audio_path, "Audio is too short to analyze (at least 0.5 seconds of sound is required).", None
        fig, detailed_results = generate_emotion_timeline(all_emotions, time_points, len(audio_data) / sample_rate)
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
            img_path = tmp.name
        fig.savefig(img_path, dpi=100, bbox_inches='tight')
        plt.close(fig)
        summary = generate_emotion_summary(all_emotions, time_points)
        return img_path, audio_path, summary, detailed_results
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, f"Error analyzing audio: {str(e)}", None
# ── Visualisation & summary ───────────────────────────────────────────────────
def generate_emotion_timeline(all_emotions, time_points, duration):
    """Build the distribution/tone matplotlib figure and per-chunk result rows.

    Args:
        all_emotions: One list of {'label', 'score'} dicts per analyzed chunk.
        time_points: (start_s, end_s) tuple per chunk, parallel to all_emotions.
        duration: Total audio duration in seconds (currently unused; kept for
            interface compatibility).

    Returns:
        (figure, detailed_results) where detailed_results is a list of dicts
        with Time Range / Emotion / Tone / Confidence / Description keys.
    """
    emotion_labels = list(EMOTION_DESCRIPTIONS.keys())

    def _top_canonical(emotions):
        # Highest-scoring prediction for one chunk, with its abbreviated model
        # label (hap/ang/...) mapped to the canonical emotion name.
        top = max(emotions, key=lambda x: x['score'])
        raw = top['label'].lower().strip()
        return MODEL_TO_EMOTION_MAP.get(raw, raw), top

    emotion_counts = {}
    for emotions in all_emotions:
        if not emotions:
            continue
        canonical, _ = _top_canonical(emotions)
        emotion_counts[canonical] = emotion_counts.get(canonical, 0) + 1
    # FIX: an empty all_emotions list previously raised ZeroDivisionError here;
    # with the `or 1` guard an empty input yields an all-zero chart instead.
    total = len(all_emotions) or 1
    emotion_percentages = {e: (emotion_counts.get(e, 0) / total * 100) for e in emotion_labels}
    sorted_emotions = sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True)

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), height_ratios=[3, 1],
                                   gridspec_kw={'hspace': 0.3})
    # Top chart: per-emotion percentage bars, sorted descending.
    emotions_labels_disp = [item[0].capitalize() for item in sorted_emotions]
    percentages = [item[1] for item in sorted_emotions]
    colors = ['red', 'brown', 'purple', 'green', 'gray', 'blue', 'orange']
    # Pad with gray if there are ever more labels than predefined colors.
    bar_colors = (colors + ['#666666'] * max(0, len(emotions_labels_disp) - len(colors)))[:len(emotions_labels_disp)]
    bars = ax1.bar(emotions_labels_disp, percentages, color=bar_colors)
    for bar in bars:
        h = bar.get_height()
        ax1.annotate(f'{h:.1f}%',
                     xy=(bar.get_x() + bar.get_width() / 2, h),
                     xytext=(0, 3), textcoords="offset points",
                     ha='center', va='bottom')
    ax1.set_ylim(0, 100)
    ax1.set_ylabel('Percentage (%)')
    ax1.set_title('Emotion Distribution')
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Bottom chart: the same percentages rolled up into positive/neutral/negative.
    tone_percentages = {"positive": 0, "neutral": 0, "negative": 0}
    for emotion, pct in emotion_percentages.items():
        for tone, elist in TONE_MAPPING.items():
            if emotion in elist:
                tone_percentages[tone] += pct
    tone_colors = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}
    tone_bars = ax2.bar(list(tone_percentages.keys()),
                        list(tone_percentages.values()),
                        color=[tone_colors[t] for t in tone_percentages])
    for bar in tone_bars:
        h = bar.get_height()
        if h > 0:
            ax2.annotate(f'{h:.1f}%',
                         xy=(bar.get_x() + bar.get_width() / 2, h),
                         xytext=(0, 3), textcoords="offset points",
                         ha='center', va='bottom')
    ax2.set_ylim(0, 100)
    ax2.set_ylabel('Percentage (%)')
    ax2.set_title('Tone Analysis')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()

    # One table row per analyzed chunk, with the dominant emotion and its tone.
    detailed_results = []
    for emotions, (start_time, end_time) in zip(all_emotions, time_points):
        if not emotions:
            continue
        canonical, top = _top_canonical(emotions)
        tone = next((t for t, el in TONE_MAPPING.items() if canonical in el), "unknown")
        detailed_results.append({
            'Time Range': f"{start_time:.1f}s - {end_time:.1f}s",
            'Emotion': canonical,
            'Tone': tone.capitalize(),
            'Confidence': f"{top['score']:.2f}",
            'Description': EMOTION_DESCRIPTIONS.get(canonical, ""),
        })
    return fig, detailed_results
def generate_emotion_summary(all_emotions, time_points):
    """Render a markdown summary of the per-chunk emotion results.

    ``time_points`` is accepted for signature parity with the timeline builder
    but is not used here. Percentages are relative to the total number of
    chunks, including chunks that produced no predictions.
    """
    if not all_emotions:
        return "No emotional content detected."
    total = len(all_emotions)
    counts = {}
    for chunk_results in all_emotions:
        if not chunk_results:
            continue
        best = max(chunk_results, key=lambda r: r['score'])
        raw = best['label'].lower().strip()
        name = MODEL_TO_EMOTION_MAP.get(raw, raw)
        counts[name] = counts.get(name, 0) + 1
    if not counts:
        return "No emotional content detected."
    percentages = {name: count / total * 100 for name, count in counts.items()}
    dominant = max(percentages, key=percentages.get)
    parts = ["### Voice Emotion Analysis Summary\n\n"]
    parts.append(f"**Dominant emotion:** {dominant.capitalize()} ({percentages[dominant]:.1f}%)\n\n")
    parts.append(f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant, '')}\n\n")
    parts.append("**Emotion distribution:**\n")
    for name, pct in sorted(percentages.items(), key=lambda kv: kv[1], reverse=True):
        parts.append(f"- {name.capitalize()}: {pct:.1f}%\n")
    parts.append(f"\n**Interpretation:** The voice predominantly expresses {dominant} emotion")
    return "".join(parts)
# ── Gradio handlers ───────────────────────────────────────────────────────────
def process_audio(audio_file, progress=gr.Progress()):
    """Gradio click handler: run the full analysis and normalize failure output.

    Returns (timeline_image, playable_audio, summary_markdown, detail_rows);
    on any failure the non-summary slots are None.
    """
    if audio_file is None:
        return None, None, "No audio file provided.", None
    img_path, wav_path, summary, rows = analyze_audio_emotions(audio_file, progress)
    if img_path is not None:
        return img_path, wav_path, summary, rows
    return None, None, summary or "Failed to analyze audio emotions.", None
# ── Gradio UI ─────────────────────────────────────────────────────────────────
# FIX: the emoji in the markdown below were mojibake (UTF-8 bytes rendered
# through a legacy codepage, e.g. "😑"); restored the intended characters.
with gr.Blocks(title="Voice Emotion Analysis System") as demo:
    gr.Markdown("""
    # 🎙️ Voice Emotion Analysis System
    This app analyzes the emotional content of voice recordings.
    It detects emotions including:
    * 😡 **Anger** &nbsp; 🤢 **Disgust** &nbsp; 😨 **Fear** &nbsp; 😊 **Happiness**
    * 😐 **Neutral** &nbsp; 😢 **Sadness** &nbsp; 😲 **Surprise**
    And provides a detailed analysis and timeline.
    """)
    with gr.Tabs():
        # Tab 1: analyze an uploaded file.
        with gr.TabItem("Upload Audio"):
            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(label="Upload Audio File", type="filepath", sources=["upload"])
                    process_btn = gr.Button("Analyze Voice Emotions")
                with gr.Column(scale=2):
                    emotion_timeline = gr.Image(label="Emotion Timeline")
            with gr.Row():
                audio_playback = gr.Audio(label="Processed Audio")
                emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis",
                )
            process_btn.click(
                fn=process_audio,
                inputs=[audio_input],
                outputs=[emotion_timeline, audio_playback, emotion_summary, emotion_results],
            )
        # Tab 2: same pipeline, fed from the microphone instead of an upload.
        with gr.TabItem("Record Voice"):
            with gr.Row():
                with gr.Column(scale=1):
                    record_input = gr.Audio(label="Record Your Voice", sources=["microphone"], type="filepath")
                    analyze_btn = gr.Button("Analyze Recording")
                with gr.Column(scale=2):
                    rec_emotion_timeline = gr.Image(label="Emotion Timeline")
            with gr.Row():
                rec_audio_playback = gr.Audio(label="Processed Audio")
                rec_emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                rec_emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis",
                )
            analyze_btn.click(
                fn=process_audio,
                inputs=[record_input],
                outputs=[rec_emotion_timeline, rec_audio_playback, rec_emotion_summary, rec_emotion_results],
            )
    gr.Markdown("""
    ### How to Use
    1. **Upload Audio Tab:** Upload an audio file and click "Analyze Voice Emotions".
    2. **Record Voice Tab:** Record your voice and click "Analyze Recording".
    **Tips:**
    - Use clear recordings with minimal background noise.
    - Longer recordings yield more consistent results.
    """)
def initialize_app():
    """Warm-load the emotion model at startup so the first request is fast."""
    print("Initializing voice emotion analysis app...")
    ok = load_emotion_model()
    print("Emotion model loaded successfully!" if ok else "Failed to load emotion model.")
if __name__ == "__main__":
    initialize_app()
    demo.launch()