# VoiceAnalysis / app.py — Hugging Face Space source ("Update app.py", commit 18c8b56, verified)
# voice_emotion_classification.py
import os
import subprocess
import sys
import time
import tempfile
import warnings
# Suppress all warnings process-wide to keep the console output readable.
warnings.filterwarnings("ignore")
def run_pip(*args):
    """Run ``pip install --no-cache-dir`` for *args* via the current interpreter.

    Raises subprocess.CalledProcessError if the install fails.
    """
    command = [sys.executable, "-m", "pip", "install", "--no-cache-dir", *args]
    subprocess.check_call(command)
# ── Phase 1: Install packages ─────────────────────────────────────────────────
# FIX 1: Use importlib-based checks instead of deprecated pkg_resources.
# FIX 2: torch → CPU-only wheel (~190 MB vs ~900 MB CUDA) to avoid disk quota.
# FIX 3: transformers pinned to 4.46.3 (last v4); v5 dropped audio-classification
#        pipeline support for many models AND is much larger on disk.
# FIX 4: torchaudio pulled without the CUDA index so it stays CPU-only too.
# NOTE: each package is probed with a plain import first, so an already-built
# environment skips pip entirely and starts faster.
print("=== Installing gradio (if needed) ===")
try:
    import gradio  # noqa: F401
    print("gradio already installed.")
except ImportError:
    run_pip("gradio")
print("=== Installing torch CPU-only (if needed) ===")
try:
    import torch  # noqa: F401
    print("torch already installed.")
except ImportError:
    # Installing torch also pulls torchaudio here so both come from the CPU index.
    run_pip("torch", "torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu")
print("=== Installing torchaudio (if needed) ===")
try:
    import torchaudio  # noqa: F401
    print("torchaudio already installed.")
except ImportError:
    run_pip("torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu")
print("=== Installing transformers 4.46.3 (if needed) ===")
try:
    import transformers as _tf
    # Re-install if any other version is present — the exact pin matters (FIX 3).
    if _tf.__version__ != "4.46.3":
        raise ImportError("wrong version")
    print("transformers 4.46.3 already installed.")
except (ImportError, AttributeError):
    run_pip("transformers==4.46.3")
print("=== Installing remaining packages (if needed) ===")
for pkg in ["librosa", "scipy", "matplotlib", "pydub"]:
    try:
        __import__(pkg)
        print(f"{pkg} already installed.")
    except ImportError:
        run_pip(pkg)
# ── Phase 2: Patch transformers get_session → requests.Session ────────────────
# transformers 4.46.3 calls get_session().head(..., allow_redirects=, proxies=, ...)
# In this environment get_session() returns an httpx.Client (gradio depends on
# httpx), which rejects every requests-style kwarg.
# Fix: replace get_session in the already-imported module namespace so it always
# returns a plain requests.Session, which accepts all those kwargs natively.
# FIX: the "→" in the log line below was mojibake ("β†’") — restored the arrow.
import transformers.utils.hub as _t_hub  # noqa: E402
import requests as _requests  # noqa: E402
_t_hub.get_session = lambda: _requests.Session()
print("Patched transformers.utils.hub.get_session → requests.Session()")
# ── Phase 3: Safe imports ─────────────────────────────────────────────────────
# Everything below is guaranteed importable after Phase 1.
import numpy as np
import gradio as gr
import torch
import torchaudio
import librosa
import matplotlib
matplotlib.use('Agg')  # headless backend: figures are saved to files, never shown
import matplotlib.pyplot as plt
from pydub import AudioSegment
# FIX: scipy.io.wavfile is used later; a bare `import scipy` only worked because
# librosa happens to import scipy.io transitively. Import the submodule explicitly.
import scipy
import scipy.io.wavfile
import io
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
from pathlib import Path
# ── Emotion metadata ──────────────────────────────────────────────────────────
# Canonical emotion name → human-readable description shown in the results table
# and the markdown summary. The key order also fixes the bar-chart label set.
EMOTION_DESCRIPTIONS = {
    "angry": "Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.",
    "disgust": "Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.",
    "fear": "Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.",
    "happy": "Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.",
    "neutral": "Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.",
    "sad": "Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.",
    "surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.",
}
# Coarse tone bucket → canonical emotions it aggregates (second chart in figure).
TONE_MAPPING = {
    "positive": ["happy", "surprise"],
    "neutral": ["neutral"],
    "negative": ["angry", "sad", "fear", "disgust"],
}
# Abbreviated labels emitted by the classifier model → canonical names above.
# Lookups elsewhere use .get(raw, raw), so unknown labels pass through unchanged.
MODEL_TO_EMOTION_MAP = {
    "hap": "happy",
    "ang": "angry",
    "sad": "sad",
    "dis": "disgust",
    "fea": "fear",
    "neu": "neutral",
    "sur": "surprise",
}
# ── Model loading ─────────────────────────────────────────────────────────────
audio_emotion_classifier = None  # lazily-created HF pipeline, shared by all requests
def load_emotion_model():
    """Lazily instantiate the audio-classification pipeline.

    Returns True when the classifier is available (freshly created or cached),
    False if instantiation raised.
    """
    global audio_emotion_classifier
    if audio_emotion_classifier is not None:
        return True
    try:
        print("Loading emotion classification model...")
        model_name = "superb/hubert-large-superb-er"
        audio_emotion_classifier = pipeline("audio-classification", model=model_name)
        print("Emotion classification model loaded successfully")
    except Exception as e:
        print(f"Error loading emotion model: {e}")
        return False
    return True
# ── Audio helpers ─────────────────────────────────────────────────────────────
def convert_audio_to_wav(audio_file):
    """Transcode *audio_file* to a temporary .wav file via pydub.

    Returns the temp-file path on success, or None when pydub cannot read or
    export the input. The caller owns (and must eventually delete) the file.
    """
    try:
        segment = AudioSegment.from_file(audio_file)
        handle = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        wav_path = handle.name
        handle.close()
        segment.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        print(f"Error converting audio: {e}")
        return None
def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=5):
    """Classify emotions over fixed-length windows of an audio file.

    Args:
        audio_file: Path to the input audio (any pydub-supported format).
        progress: Gradio progress reporter, updated once per chunk.
        chunk_duration: Window length in seconds fed to the classifier.

    Returns:
        (timeline_image_path, playable_wav_path, markdown_summary, detail_rows).
        On failure the image/rows are None and the summary carries the error.
    """
    if not load_emotion_model():
        return None, None, "Failed to load emotion classification model.", None
    # FIX: the extension check was case-sensitive, so ".WAV" files were
    # needlessly round-tripped through pydub; compare lowercased.
    audio_path = audio_file if audio_file.lower().endswith('.wav') else convert_audio_to_wav(audio_file)
    if not audio_path:
        return None, None, "Failed to process audio file. Unsupported format or corrupted file.", None
    try:
        # Resample to 16 kHz mono, the rate the classifier expects.
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
        chunk_samples = int(chunk_duration * sample_rate)
        num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))
        all_emotions, time_points = [], []
        for i in range(num_chunks):
            progress((i + 1) / num_chunks, "Analyzing audio emotions...")
            start_idx = i * chunk_samples
            end_idx = min(start_idx + chunk_samples, len(audio_data))
            chunk = audio_data[start_idx:end_idx]
            if len(chunk) < 0.5 * sample_rate:
                # Fragments under half a second are too short for a stable prediction.
                continue
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
                chunk_path = tmp.name
            # float32 [-1, 1] → int16 PCM for the temp wav the pipeline reads.
            scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))
            results = audio_emotion_classifier(chunk_path)
            os.unlink(chunk_path)
            all_emotions.append(results)
            time_points.append((start_idx / sample_rate, end_idx / sample_rate))
        # FIX: a recording shorter than 0.5 s produced zero chunks and previously
        # crashed generate_emotion_timeline with ZeroDivisionError.
        if not all_emotions:
            return None, audio_path, "Audio is too short to analyze (at least 0.5 seconds of sound is required).", None
        fig, detailed_results = generate_emotion_timeline(all_emotions, time_points, len(audio_data) / sample_rate)
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
            img_path = tmp.name
        fig.savefig(img_path, dpi=100, bbox_inches='tight')
        plt.close(fig)
        summary = generate_emotion_summary(all_emotions, time_points)
        return img_path, audio_path, summary, detailed_results
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, f"Error analyzing audio: {str(e)}", None
# ── Visualisation & summary ───────────────────────────────────────────────────
def generate_emotion_timeline(all_emotions, time_points, duration):
    """Build the distribution/tone matplotlib figure and per-chunk result rows.

    Args:
        all_emotions: One list of {'label', 'score'} dicts per analyzed chunk.
        time_points: (start_s, end_s) tuple per chunk, parallel to all_emotions.
        duration: Total audio duration in seconds (currently unused; kept for
            interface compatibility).

    Returns:
        (figure, detailed_results) where detailed_results is a list of dicts
        with Time Range / Emotion / Tone / Confidence / Description keys.
    """
    emotion_labels = list(EMOTION_DESCRIPTIONS.keys())

    def _top_canonical(emotions):
        # Highest-scoring prediction for one chunk, with its abbreviated model
        # label (hap/ang/...) mapped to the canonical emotion name.
        top = max(emotions, key=lambda x: x['score'])
        raw = top['label'].lower().strip()
        return MODEL_TO_EMOTION_MAP.get(raw, raw), top

    emotion_counts = {}
    for emotions in all_emotions:
        if not emotions:
            continue
        canonical, _ = _top_canonical(emotions)
        emotion_counts[canonical] = emotion_counts.get(canonical, 0) + 1
    # FIX: an empty all_emotions list previously raised ZeroDivisionError here;
    # with the `or 1` guard an empty input yields an all-zero chart instead.
    total = len(all_emotions) or 1
    emotion_percentages = {e: (emotion_counts.get(e, 0) / total * 100) for e in emotion_labels}
    sorted_emotions = sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True)

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), height_ratios=[3, 1],
                                   gridspec_kw={'hspace': 0.3})
    # Top chart: per-emotion percentage bars, sorted descending.
    emotions_labels_disp = [item[0].capitalize() for item in sorted_emotions]
    percentages = [item[1] for item in sorted_emotions]
    colors = ['red', 'brown', 'purple', 'green', 'gray', 'blue', 'orange']
    # Pad with gray if there are ever more labels than predefined colors.
    bar_colors = (colors + ['#666666'] * max(0, len(emotions_labels_disp) - len(colors)))[:len(emotions_labels_disp)]
    bars = ax1.bar(emotions_labels_disp, percentages, color=bar_colors)
    for bar in bars:
        h = bar.get_height()
        ax1.annotate(f'{h:.1f}%',
                     xy=(bar.get_x() + bar.get_width() / 2, h),
                     xytext=(0, 3), textcoords="offset points",
                     ha='center', va='bottom')
    ax1.set_ylim(0, 100)
    ax1.set_ylabel('Percentage (%)')
    ax1.set_title('Emotion Distribution')
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Bottom chart: the same percentages rolled up into positive/neutral/negative.
    tone_percentages = {"positive": 0, "neutral": 0, "negative": 0}
    for emotion, pct in emotion_percentages.items():
        for tone, elist in TONE_MAPPING.items():
            if emotion in elist:
                tone_percentages[tone] += pct
    tone_colors = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}
    tone_bars = ax2.bar(list(tone_percentages.keys()),
                        list(tone_percentages.values()),
                        color=[tone_colors[t] for t in tone_percentages])
    for bar in tone_bars:
        h = bar.get_height()
        if h > 0:
            ax2.annotate(f'{h:.1f}%',
                         xy=(bar.get_x() + bar.get_width() / 2, h),
                         xytext=(0, 3), textcoords="offset points",
                         ha='center', va='bottom')
    ax2.set_ylim(0, 100)
    ax2.set_ylabel('Percentage (%)')
    ax2.set_title('Tone Analysis')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()

    # One table row per analyzed chunk, with the dominant emotion and its tone.
    detailed_results = []
    for emotions, (start_time, end_time) in zip(all_emotions, time_points):
        if not emotions:
            continue
        canonical, top = _top_canonical(emotions)
        tone = next((t for t, el in TONE_MAPPING.items() if canonical in el), "unknown")
        detailed_results.append({
            'Time Range': f"{start_time:.1f}s - {end_time:.1f}s",
            'Emotion': canonical,
            'Tone': tone.capitalize(),
            'Confidence': f"{top['score']:.2f}",
            'Description': EMOTION_DESCRIPTIONS.get(canonical, ""),
        })
    return fig, detailed_results
def generate_emotion_summary(all_emotions, time_points):
    """Render a markdown summary of the per-chunk emotion results.

    ``time_points`` is accepted for signature parity with the timeline builder
    but is not used here. Percentages are relative to the total number of
    chunks, including chunks that produced no predictions.
    """
    if not all_emotions:
        return "No emotional content detected."
    total = len(all_emotions)
    counts = {}
    for chunk_results in all_emotions:
        if not chunk_results:
            continue
        best = max(chunk_results, key=lambda r: r['score'])
        raw = best['label'].lower().strip()
        name = MODEL_TO_EMOTION_MAP.get(raw, raw)
        counts[name] = counts.get(name, 0) + 1
    if not counts:
        return "No emotional content detected."
    percentages = {name: count / total * 100 for name, count in counts.items()}
    dominant = max(percentages, key=percentages.get)
    parts = ["### Voice Emotion Analysis Summary\n\n"]
    parts.append(f"**Dominant emotion:** {dominant.capitalize()} ({percentages[dominant]:.1f}%)\n\n")
    parts.append(f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant, '')}\n\n")
    parts.append("**Emotion distribution:**\n")
    for name, pct in sorted(percentages.items(), key=lambda kv: kv[1], reverse=True):
        parts.append(f"- {name.capitalize()}: {pct:.1f}%\n")
    parts.append(f"\n**Interpretation:** The voice predominantly expresses {dominant} emotion")
    return "".join(parts)
# ── Gradio handlers ───────────────────────────────────────────────────────────
def process_audio(audio_file, progress=gr.Progress()):
    """Gradio click handler: run the full analysis and normalize failure output.

    Returns (timeline_image, playable_audio, summary_markdown, detail_rows);
    on any failure the non-summary slots are None.
    """
    if audio_file is None:
        return None, None, "No audio file provided.", None
    img_path, wav_path, summary, rows = analyze_audio_emotions(audio_file, progress)
    if img_path is not None:
        return img_path, wav_path, summary, rows
    return None, None, summary or "Failed to analyze audio emotions.", None
# ── Gradio UI ─────────────────────────────────────────────────────────────────
# FIX: the emoji in the markdown below were mojibake (UTF-8 bytes rendered
# through a legacy codepage, e.g. "😑"); restored the intended characters.
with gr.Blocks(title="Voice Emotion Analysis System") as demo:
    gr.Markdown("""
    # 🎙️ Voice Emotion Analysis System
    This app analyzes the emotional content of voice recordings.
    It detects emotions including:
    * 😡 **Anger** &nbsp; 🤢 **Disgust** &nbsp; 😨 **Fear** &nbsp; 😊 **Happiness**
    * 😐 **Neutral** &nbsp; 😢 **Sadness** &nbsp; 😲 **Surprise**
    And provides a detailed analysis and timeline.
    """)
    with gr.Tabs():
        # Tab 1: analyze an uploaded file.
        with gr.TabItem("Upload Audio"):
            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(label="Upload Audio File", type="filepath", sources=["upload"])
                    process_btn = gr.Button("Analyze Voice Emotions")
                with gr.Column(scale=2):
                    emotion_timeline = gr.Image(label="Emotion Timeline")
            with gr.Row():
                audio_playback = gr.Audio(label="Processed Audio")
                emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis",
                )
            process_btn.click(
                fn=process_audio,
                inputs=[audio_input],
                outputs=[emotion_timeline, audio_playback, emotion_summary, emotion_results],
            )
        # Tab 2: same pipeline, fed from the microphone instead of an upload.
        with gr.TabItem("Record Voice"):
            with gr.Row():
                with gr.Column(scale=1):
                    record_input = gr.Audio(label="Record Your Voice", sources=["microphone"], type="filepath")
                    analyze_btn = gr.Button("Analyze Recording")
                with gr.Column(scale=2):
                    rec_emotion_timeline = gr.Image(label="Emotion Timeline")
            with gr.Row():
                rec_audio_playback = gr.Audio(label="Processed Audio")
                rec_emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                rec_emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis",
                )
            analyze_btn.click(
                fn=process_audio,
                inputs=[record_input],
                outputs=[rec_emotion_timeline, rec_audio_playback, rec_emotion_summary, rec_emotion_results],
            )
    gr.Markdown("""
    ### How to Use
    1. **Upload Audio Tab:** Upload an audio file and click "Analyze Voice Emotions".
    2. **Record Voice Tab:** Record your voice and click "Analyze Recording".
    **Tips:**
    - Use clear recordings with minimal background noise.
    - Longer recordings yield more consistent results.
    """)
def initialize_app():
    """Warm-load the emotion model at startup so the first request is fast."""
    print("Initializing voice emotion analysis app...")
    ok = load_emotion_model()
    print("Emotion model loaded successfully!" if ok else "Failed to load emotion model.")
if __name__ == "__main__":
    initialize_app()
    demo.launch()