Natwar committed on
Commit
18c8b56
Β·
verified Β·
1 Parent(s): e89b55b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +200 -270
app.py CHANGED
@@ -3,75 +3,109 @@
3
  import os
4
  import subprocess
5
  import sys
6
- import pkg_resources
7
  import time
8
  import tempfile
9
- import numpy as np
10
  import warnings
11
- from pathlib import Path
12
  warnings.filterwarnings("ignore")
13
 
14
- def install_package(package, version=None):
15
- package_spec = f"{package}=={version}" if version else package
16
- print(f"Installing {package_spec}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  try:
18
- subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-cache-dir", package_spec])
19
- except subprocess.CalledProcessError as e:
20
- print(f"Failed to install {package_spec}: {e}")
21
- raise
22
-
23
- # Required packages (you may add version pins if necessary)
24
- required_packages = {
25
- "gradio": None,
26
- "torch": None,
27
- "torchaudio": None,
28
- "transformers": None,
29
- "librosa": None,
30
- "scipy": None,
31
- "matplotlib": None,
32
- "pydub": None
33
- }
34
 
35
- installed_packages = {pkg.key for pkg in pkg_resources.working_set}
36
- for package, version in required_packages.items():
37
- if package not in installed_packages:
38
- install_package(package, version)
39
 
40
- # Now import all necessary packages
 
 
 
 
 
41
  import gradio as gr
42
  import torch
43
  import torchaudio
44
  import librosa
 
 
45
  import matplotlib.pyplot as plt
46
- from matplotlib.colors import LinearSegmentedColormap
47
  from pydub import AudioSegment
48
  import scipy
49
  import io
50
  from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
51
  from pathlib import Path
52
- import matplotlib
53
- matplotlib.use('Agg') # Use non-interactive backend
54
 
55
- # Define emotion labels, tone mapping, and descriptions
 
56
  EMOTION_DESCRIPTIONS = {
57
- "angry": "Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.",
58
  "disgust": "Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.",
59
- "fear": "Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.",
60
- "happy": "Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.",
61
  "neutral": "Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.",
62
- "sad": "Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.",
63
- "surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic."
64
  }
65
 
66
- # Here we map emotion to a generalized tone (for example, negative or positive)
67
  TONE_MAPPING = {
68
  "positive": ["happy", "surprise"],
69
- "neutral": ["neutral"],
70
- "negative": ["angry", "sad", "fear", "disgust"]
71
  }
72
 
73
- # Some Hugging Face models return short labels (e.g., "hap", "ang", etc.).
74
- # This mapping will ensure they're translated into our full canonical labels.
75
  MODEL_TO_EMOTION_MAP = {
76
  "hap": "happy",
77
  "ang": "angry",
@@ -79,19 +113,18 @@ MODEL_TO_EMOTION_MAP = {
79
  "dis": "disgust",
80
  "fea": "fear",
81
  "neu": "neutral",
82
- "sur": "surprise"
83
  }
84
 
85
- # Global variable for the emotion classifier
 
86
  audio_emotion_classifier = None
87
 
88
  def load_emotion_model():
89
- """Load the emotion classification model once and cache it."""
90
  global audio_emotion_classifier
91
  if audio_emotion_classifier is None:
92
  try:
93
  print("Loading emotion classification model...")
94
- # Using the Hugging Face pipeline with the new model that classifies speech emotion
95
  model_name = "superb/hubert-large-superb-er"
96
  audio_emotion_classifier = pipeline("audio-classification", model=model_name)
97
  print("Emotion classification model loaded successfully")
@@ -101,359 +134,255 @@ def load_emotion_model():
101
  return False
102
  return True
103
 
 
 
104
  def convert_audio_to_wav(audio_file):
105
- """Convert the uploaded audio to WAV format."""
106
  try:
107
  audio = AudioSegment.from_file(audio_file)
108
- with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
109
- wav_path = temp_wav.name
110
- audio.export(wav_path, format="wav")
111
- return wav_path
112
  except Exception as e:
113
  print(f"Error converting audio: {e}")
114
  return None
115
 
116
  def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=5):
117
- """
118
- Analyze emotions in an audio file by processing it in chunks.
119
- Returns a visualization, processed audio path, summary, and detailed results.
120
- """
121
  if not load_emotion_model():
122
- return None, "Failed to load emotion classification model. Please check console for details."
123
-
124
- # If the file is already a WAV, use it directly; else convert it.
125
- if audio_file.endswith('.wav'):
126
- audio_path = audio_file
127
- else:
128
- audio_path = convert_audio_to_wav(audio_file)
129
- if not audio_path:
130
- return None, "Failed to process audio file. Unsupported format or corrupted file."
131
-
132
  try:
133
- # Load the audio using librosa
134
  audio_data, sample_rate = librosa.load(audio_path, sr=16000)
135
- duration = len(audio_data) / sample_rate
136
-
137
- # Process in chunks for long files
138
  chunk_samples = int(chunk_duration * sample_rate)
139
  num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))
140
-
141
- all_emotions = []
142
- time_points = []
143
-
144
  for i in range(num_chunks):
145
  progress((i + 1) / num_chunks, "Analyzing audio emotions...")
146
  start_idx = i * chunk_samples
147
  end_idx = min(start_idx + chunk_samples, len(audio_data))
148
  chunk = audio_data[start_idx:end_idx]
149
-
150
- # Skip too-short chunks (<0.5 seconds)
151
  if len(chunk) < 0.5 * sample_rate:
152
  continue
153
-
154
- # Create a temporary file for this audio chunk
155
- with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_chunk:
156
- chunk_path = temp_chunk.name
157
  scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))
158
-
159
- # Get emotion classification results on this chunk
160
  results = audio_emotion_classifier(chunk_path)
161
- os.unlink(chunk_path) # Remove the temporary file
162
-
163
  all_emotions.append(results)
164
  time_points.append((start_idx / sample_rate, end_idx / sample_rate))
165
-
166
- # Generate visualization and summary
167
- fig, detailed_results = generate_emotion_timeline(all_emotions, time_points, duration)
168
- with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
169
- img_path = temp_img.name
170
  fig.savefig(img_path, dpi=100, bbox_inches='tight')
171
  plt.close(fig)
172
-
173
  summary = generate_emotion_summary(all_emotions, time_points)
174
  return img_path, audio_path, summary, detailed_results
175
-
176
  except Exception as e:
177
- print(f"Error analyzing audio: {e}")
178
  import traceback
179
  traceback.print_exc()
180
  return None, None, f"Error analyzing audio: {str(e)}", None
181
 
 
 
182
  def generate_emotion_timeline(all_emotions, time_points, duration):
183
- """
184
- Generate a bar chart visualization of emotion percentages with tone analysis.
185
- Returns the matplotlib figure and a list of detailed results.
186
- """
187
- # All possible emotion labels from our dictionary
188
  emotion_labels = list(EMOTION_DESCRIPTIONS.keys())
189
-
190
- # We'll accumulate counts based on our canonical labels (e.g., "happy", "angry").
191
  emotion_counts = {}
192
-
193
  for emotions in all_emotions:
194
  if not emotions:
195
  continue
196
-
197
- # The pipeline returns items like {"label": "Hap", "score": 0.95}, etc.
198
- top_emotion = max(emotions, key=lambda x: x['score'])
199
-
200
- # Normalize the label from the model to a canonical label used in EMOTION_DESCRIPTIONS
201
- raw_label = top_emotion['label'].lower().strip() # e.g., "hap", "ang", ...
202
- canonical_label = MODEL_TO_EMOTION_MAP.get(raw_label, raw_label)
203
- # If there's no mapping, we leave it as raw_label.
204
- # But typically, it should be one of "happy", "angry", "disgust", "fear", "sad", "neutral", "surprise".
205
-
206
- # Count how many times each canonical label appears
207
- emotion_counts[canonical_label] = emotion_counts.get(canonical_label, 0) + 1
208
-
209
- total_chunks = len(all_emotions)
210
- emotion_percentages = {
211
- e: (count / total_chunks * 100) for e, count in emotion_counts.items()
212
- }
213
-
214
- # Create empty percentages for emotions that didn't appear
215
- for label in emotion_labels:
216
- if label not in emotion_percentages:
217
- emotion_percentages[label] = 0.0
218
-
219
- # Sort emotions by percentage
220
  sorted_emotions = sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True)
221
-
222
- # Create the bar chart with subplots: one for emotions and one for tone
223
- fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), height_ratios=[3, 1], gridspec_kw={'hspace': 0.3})
224
-
225
- # Capitalize each label for a nice display
226
- emotions = [item[0].capitalize() for item in sorted_emotions]
227
  percentages = [item[1] for item in sorted_emotions]
228
-
229
- # Custom colors for emotions (enough for 7 emotions)
230
  colors = ['red', 'brown', 'purple', 'green', 'gray', 'blue', 'orange']
231
- if len(emotions) <= len(colors):
232
- bar_colors = colors[:len(emotions)]
233
- else:
234
- # fallback if there's more emotions than colors
235
- bar_colors = colors + ['#666666'] * (len(emotions) - len(colors))
236
-
237
- # Plot emotion bars
238
- bars = ax1.bar(emotions, percentages, color=bar_colors)
239
-
240
- # Add percentage labels on top of each bar
241
  for bar in bars:
242
- height = bar.get_height()
243
- ax1.annotate(f'{height:.1f}%',
244
- xy=(bar.get_x() + bar.get_width() / 2, height),
245
- xytext=(0, 3), # 3 points vertical offset
246
- textcoords="offset points",
247
  ha='center', va='bottom')
248
-
249
- ax1.set_ylim(0, 100) # Fixed 100% scale
250
  ax1.set_ylabel('Percentage (%)')
251
  ax1.set_title('Emotion Distribution')
252
  ax1.grid(axis='y', linestyle='--', alpha=0.7)
253
-
254
- # Calculate tone percentages based on the canonical labels we found
255
  tone_percentages = {"positive": 0, "neutral": 0, "negative": 0}
256
-
257
- for emotion_label, percentage in emotion_percentages.items():
258
- for tone, emotions_list in TONE_MAPPING.items():
259
- if emotion_label in emotions_list:
260
- tone_percentages[tone] += percentage
261
-
262
- # Plot tone bars
263
- tones = list(tone_percentages.keys())
264
- tone_values = list(tone_percentages.values())
265
  tone_colors = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}
266
- tone_bars = ax2.bar(tones, tone_values, color=[tone_colors[t] for t in tones])
267
-
268
- # Add percentage labels on tone bars
269
  for bar in tone_bars:
270
- height = bar.get_height()
271
- if height > 0: # Only add label if there's a visible bar
272
- ax2.annotate(f'{height:.1f}%',
273
- xy=(bar.get_x() + bar.get_width() / 2, height),
274
- xytext=(0, 3),
275
- textcoords="offset points",
276
  ha='center', va='bottom')
277
-
278
  ax2.set_ylim(0, 100)
279
  ax2.set_ylabel('Percentage (%)')
280
  ax2.set_title('Tone Analysis')
281
  ax2.grid(axis='y', linestyle='--', alpha=0.7)
282
-
283
  plt.tight_layout()
284
-
285
- # Generate a more detailed time-segmented result
286
  detailed_results = []
287
- for idx, (emotions, (start_time, end_time)) in enumerate(zip(all_emotions, time_points)):
288
  if not emotions:
289
  continue
290
-
291
- top_emotion = max(emotions, key=lambda x: x['score'])
292
- raw_label = top_emotion['label'].lower().strip()
293
- canonical_label = MODEL_TO_EMOTION_MAP.get(raw_label, raw_label)
294
-
295
- # Determine the tone for this emotion
296
- # (based on canonical_label rather than the raw model label)
297
- tone = next((t for t, e_list in TONE_MAPPING.items() if canonical_label in e_list), "unknown")
298
-
299
  detailed_results.append({
300
- 'Time Range': f"{start_time:.1f}s - {end_time:.1f}s",
301
- 'Emotion': canonical_label,
302
- 'Tone': tone.capitalize(),
303
- 'Confidence': f"{top_emotion['score']:.2f}",
304
- 'Description': EMOTION_DESCRIPTIONS.get(canonical_label, "")
305
  })
306
-
307
  return fig, detailed_results
308
 
309
  def generate_emotion_summary(all_emotions, time_points):
310
- """
311
- Create a summary text from the emotion analysis.
312
- Counts occurrences and computes percentages of the dominant emotion.
313
- """
314
  if not all_emotions:
315
  return "No emotional content detected."
316
-
317
  emotion_counts = {}
318
- total_chunks = len(all_emotions)
319
-
320
  for emotions in all_emotions:
321
  if not emotions:
322
  continue
323
- top_emotion = max(emotions, key=lambda x: x['score'])
324
-
325
- # Normalize the label
326
- raw_label = top_emotion['label'].lower().strip()
327
- canonical_label = MODEL_TO_EMOTION_MAP.get(raw_label, raw_label)
328
-
329
- emotion_counts[canonical_label] = emotion_counts.get(canonical_label, 0) + 1
330
-
331
- emotion_percentages = {
332
- e: (count / total_chunks * 100)
333
- for e, count in emotion_counts.items()
334
- }
335
-
336
- if not emotion_percentages:
337
  return "No emotional content detected."
338
-
339
- # Find the dominant emotion (highest percentage)
340
- dominant_emotion = max(emotion_percentages.items(), key=lambda x: x[1])[0]
341
-
342
- summary = f"### Voice Emotion Analysis Summary\n\n"
343
- summary += f"**Dominant emotion:** {dominant_emotion.capitalize()} ({emotion_percentages[dominant_emotion]:.1f}%)\n\n"
344
- summary += f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant_emotion, '')}\n\n"
345
  summary += "**Emotion distribution:**\n"
346
-
347
- for emotion, percentage in sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True):
348
- summary += f"- {emotion.capitalize()}: {percentage:.1f}%\n"
349
-
350
- summary += "\n**Interpretation:** The voice predominantly expresses {0} emotion".format(dominant_emotion)
351
  return summary
352
 
353
- def record_audio(audio):
354
- """Save recorded audio and analyze emotions."""
355
- try:
356
- with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
357
- audio_path = temp_file.name
358
- with open(audio_path, 'wb') as f:
359
- f.write(audio)
360
- return audio_path
361
- except Exception as e:
362
- print(f"Error saving recorded audio: {e}")
363
- return None
364
 
365
  def process_audio(audio_file, progress=gr.Progress()):
366
- """Process the audio file and analyze emotions."""
367
  if audio_file is None:
368
  return None, None, "No audio file provided.", None
369
-
370
  img_path, processed_audio, summary, results = analyze_audio_emotions(audio_file, progress)
371
  if img_path is None:
372
- return None, None, "Failed to analyze audio emotions.", None
373
  return img_path, processed_audio, summary, results
374
 
375
- # Create Gradio interface
 
376
  with gr.Blocks(title="Voice Emotion Analysis System") as demo:
377
  gr.Markdown("""
378
  # πŸŽ™οΈ Voice Emotion Analysis System
379
-
380
  This app analyzes the emotional content of voice recordings.
381
-
382
  It detects emotions including:
383
-
384
- * 😑 **Anger**
385
- * 🀒 **Disgust**
386
- * 😨 **Fear**
387
- * 😊 **Happiness**
388
- * 😐 **Neutral**
389
- * 😒 **Sadness**
390
- * 😲 **Surprise**
391
-
392
  And provides a detailed analysis and timeline.
393
  """)
394
-
395
  with gr.Tabs():
396
  with gr.TabItem("Upload Audio"):
397
  with gr.Row():
398
  with gr.Column(scale=1):
399
- audio_input = gr.Audio(
400
- label="Upload Audio File",
401
- type="filepath",
402
- sources=["upload"]
403
- )
404
  process_btn = gr.Button("Analyze Voice Emotions")
405
  with gr.Column(scale=2):
406
- emotion_timeline = gr.Image(label="Emotion Timeline", show_label=True)
407
  with gr.Row():
408
- audio_playback = gr.Audio(label="Processed Audio", show_label=True)
409
  emotion_summary = gr.Markdown(label="Emotion Summary")
410
  with gr.Row():
411
  emotion_results = gr.DataFrame(
412
  headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
413
- label="Detailed Emotion Analysis"
414
  )
415
  process_btn.click(
416
  fn=process_audio,
417
  inputs=[audio_input],
418
- outputs=[emotion_timeline, audio_playback, emotion_summary, emotion_results]
419
  )
420
-
421
  with gr.TabItem("Record Voice"):
422
  with gr.Row():
423
  with gr.Column(scale=1):
424
- record_input = gr.Audio(
425
- label="Record Your Voice",
426
- sources=["microphone"],
427
- type="filepath"
428
- )
429
  analyze_btn = gr.Button("Analyze Recording")
430
  with gr.Column(scale=2):
431
- rec_emotion_timeline = gr.Image(label="Emotion Timeline", show_label=True)
432
  with gr.Row():
433
- rec_audio_playback = gr.Audio(label="Processed Audio", show_label=True)
434
  rec_emotion_summary = gr.Markdown(label="Emotion Summary")
435
  with gr.Row():
436
  rec_emotion_results = gr.DataFrame(
437
  headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
438
- label="Detailed Emotion Analysis"
439
  )
440
  analyze_btn.click(
441
  fn=process_audio,
442
  inputs=[record_input],
443
- outputs=[rec_emotion_timeline, rec_audio_playback, rec_emotion_summary, rec_emotion_results]
444
  )
445
-
446
  gr.Markdown("""
447
  ### How to Use
448
-
449
  1. **Upload Audio Tab:** Upload an audio file and click "Analyze Voice Emotions".
450
  2. **Record Voice Tab:** Record your voice and click "Analyze Recording".
451
-
452
  **Tips:**
453
  - Use clear recordings with minimal background noise.
454
  - Longer recordings yield more consistent results.
455
  """)
456
 
 
457
  def initialize_app():
458
  print("Initializing voice emotion analysis app...")
459
  if load_emotion_model():
@@ -461,6 +390,7 @@ def initialize_app():
461
  else:
462
  print("Failed to load emotion model.")
463
 
 
464
  if __name__ == "__main__":
465
  initialize_app()
466
- demo.launch()
 
3
  import os
4
  import subprocess
5
  import sys
 
6
  import time
7
  import tempfile
 
8
  import warnings
 
9
  warnings.filterwarnings("ignore")
10
 
11
+
12
def run_pip(*args):
    """Run `pip install --no-cache-dir <args>` using the current interpreter.

    Raises:
        subprocess.CalledProcessError: when pip exits with a non-zero status.
    """
    command = [sys.executable, "-m", "pip", "install", "--no-cache-dir", *args]
    subprocess.check_call(command)
15
+
16
+
17
# ── Phase 1: Install packages ─────────────────────────────────────────────────
# FIX 1: Use importlib-based checks instead of deprecated pkg_resources.
# FIX 2: torch → CPU-only wheel (~190 MB vs ~900 MB CUDA) to avoid disk quota.
# FIX 3: transformers pinned to 4.46.3 (last v4); v5 dropped audio-classification
#        pipeline support for many models AND is much larger on disk.
# FIX 4: torchaudio pulled without the CUDA index so it stays CPU-only too.

def _ensure(banner, module_name, done_msg, pip_args):
    """Print *banner*, then run_pip(*pip_args) unless *module_name* imports."""
    print(banner)
    try:
        __import__(module_name)
        print(done_msg)
    except ImportError:
        run_pip(*pip_args)


_ensure("=== Installing gradio (if needed) ===",
        "gradio", "gradio already installed.",
        ["gradio"])
_ensure("=== Installing torch CPU-only (if needed) ===",
        "torch", "torch already installed.",
        ["torch", "torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu"])
_ensure("=== Installing torchaudio (if needed) ===",
        "torchaudio", "torchaudio already installed.",
        ["torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu"])

# transformers needs an exact-version check, so it cannot go through _ensure().
print("=== Installing transformers 4.46.3 (if needed) ===")
try:
    import transformers as _tf
    if _tf.__version__ != "4.46.3":
        raise ImportError("wrong version")
    print("transformers 4.46.3 already installed.")
except (ImportError, AttributeError):
    run_pip("transformers==4.46.3")

print("=== Installing remaining packages (if needed) ===")
for _pkg in ("librosa", "scipy", "matplotlib", "pydub"):
    try:
        __import__(_pkg)
        print(f"{_pkg} already installed.")
    except ImportError:
        run_pip(_pkg)
61
+
62
# ── Phase 2: Patch transformers get_session → requests.Session ───────────────
# transformers 4.46.3 calls get_session().head(..., allow_redirects=, proxies=, ...)
# In this environment get_session() returns an httpx.Client (gradio depends on
# httpx), which rejects every requests-style kwarg.
# Fix: replace get_session in the already-imported module namespace so it always
# returns a plain requests.Session, which accepts all those kwargs natively.

import transformers.utils.hub as _t_hub  # noqa: E402
import requests as _requests  # noqa: E402


def _fresh_requests_session():
    """Drop-in replacement for transformers' get_session()."""
    return _requests.Session()


_t_hub.get_session = _fresh_requests_session
print("Patched transformers.utils.hub.get_session → requests.Session()")
74
+
75
+ # ── Phase 3: Safe imports ─────────────────────────────────────────────────────
76
+
77
+ import numpy as np
78
  import gradio as gr
79
  import torch
80
  import torchaudio
81
  import librosa
82
+ import matplotlib
83
+ matplotlib.use('Agg')
84
  import matplotlib.pyplot as plt
 
85
  from pydub import AudioSegment
86
  import scipy
87
  import io
88
  from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
89
  from pathlib import Path
 
 
90
 
91
+ # ── Emotion metadata ──────────────────────────────────────────────────────────
92
+
93
# Canonical emotion label → user-facing description (shown in the results
# table). Key order is significant: generate_emotion_timeline() builds its
# label list from these keys, which fixes bar ordering for tied percentages.
EMOTION_DESCRIPTIONS = {
    "angry": "Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.",
    "disgust": "Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.",
    "fear": "Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.",
    "happy": "Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.",
    "neutral": "Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.",
    "sad": "Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.",
    "surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.",
}
102
 
 
103
# Buckets each canonical emotion into a coarse tone category; used for the
# second chart in generate_emotion_timeline() and the per-chunk table rows.
TONE_MAPPING = {
    "positive": ["happy", "surprise"],
    "neutral": ["neutral"],
    "negative": ["angry", "sad", "fear", "disgust"],
}
108
 
 
 
109
  MODEL_TO_EMOTION_MAP = {
110
  "hap": "happy",
111
  "ang": "angry",
 
113
  "dis": "disgust",
114
  "fea": "fear",
115
  "neu": "neutral",
116
+ "sur": "surprise",
117
  }
118
 
119
+ # ── Model loading ─────────────────────────────────────────────────────────────
120
+
121
  audio_emotion_classifier = None
122
 
123
  def load_emotion_model():
 
124
  global audio_emotion_classifier
125
  if audio_emotion_classifier is None:
126
  try:
127
  print("Loading emotion classification model...")
 
128
  model_name = "superb/hubert-large-superb-er"
129
  audio_emotion_classifier = pipeline("audio-classification", model=model_name)
130
  print("Emotion classification model loaded successfully")
 
134
  return False
135
  return True
136
 
137
+ # ── Audio helpers ─────────────────────────────────────────────────────────────
138
+
139
def convert_audio_to_wav(audio_file):
    """Transcode *audio_file* to a temporary WAV file via pydub.

    Returns the path of the new temp file, or None when the input cannot be
    decoded (unsupported format, corrupt data, ...). The caller owns the
    temp file and is responsible for deleting it.
    """
    try:
        segment = AudioSegment.from_file(audio_file)
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as wav_file:
            segment.export(wav_file.name, format="wav")
            return wav_file.name
    except Exception as exc:
        print(f"Error converting audio: {exc}")
        return None
148
 
149
def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=5):
    """Run emotion classification over *audio_file* in fixed-size chunks.

    Args:
        audio_file: Path to the uploaded/recorded audio file.
        progress: Gradio progress tracker, updated once per chunk.
        chunk_duration: Length of each analysis window in seconds.

    Returns:
        (timeline_image_path, processed_audio_path, markdown_summary,
        detailed_results) on success, or (None, None, error_message, None)
        on failure.
    """
    if not load_emotion_model():
        return None, None, "Failed to load emotion classification model.", None

    # Reuse WAV inputs directly; everything else is transcoded first.
    audio_path = audio_file if audio_file.endswith('.wav') else convert_audio_to_wav(audio_file)
    if not audio_path:
        return None, None, "Failed to process audio file. Unsupported format or corrupted file.", None

    try:
        # Resample to 16 kHz, the rate the classifier pipeline is fed.
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
        chunk_samples = int(chunk_duration * sample_rate)
        num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))

        all_emotions, time_points = [], []

        for i in range(num_chunks):
            progress((i + 1) / num_chunks, "Analyzing audio emotions...")
            start_idx = i * chunk_samples
            end_idx = min(start_idx + chunk_samples, len(audio_data))
            chunk = audio_data[start_idx:end_idx]

            # Chunks shorter than half a second carry too little signal.
            if len(chunk) < 0.5 * sample_rate:
                continue

            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
                chunk_path = tmp.name
            try:
                # float [-1, 1] samples → int16 PCM for the classifier.
                scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))
                results = audio_emotion_classifier(chunk_path)
            finally:
                # BUGFIX: previously the temp chunk leaked when the classifier
                # raised (unlink only ran on the success path); always clean up.
                os.unlink(chunk_path)

            all_emotions.append(results)
            time_points.append((start_idx / sample_rate, end_idx / sample_rate))

        fig, detailed_results = generate_emotion_timeline(
            all_emotions, time_points, len(audio_data) / sample_rate)
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
            img_path = tmp.name
        fig.savefig(img_path, dpi=100, bbox_inches='tight')
        plt.close(fig)

        summary = generate_emotion_summary(all_emotions, time_points)
        return img_path, audio_path, summary, detailed_results

    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, f"Error analyzing audio: {str(e)}", None
195
 
196
+ # ── Visualisation & summary ───────────────────────────────────────────────────
197
+
198
def generate_emotion_timeline(all_emotions, time_points, duration):
    """Build the two-panel bar chart (emotions + tones) and a per-chunk table.

    Args:
        all_emotions: Per-chunk classifier output; each item is a list of
            {'label': ..., 'score': ...} dicts.
        time_points: (start_s, end_s) per chunk, parallel to *all_emotions*.
        duration: Total audio duration in seconds (currently unused; kept for
            interface compatibility).

    Returns:
        (matplotlib Figure, list of row dicts for the Gradio DataFrame).
    """
    emotion_labels = list(EMOTION_DESCRIPTIONS.keys())
    emotion_counts = {}

    for emotions in all_emotions:
        if not emotions:
            continue
        top = max(emotions, key=lambda x: x['score'])
        raw = top['label'].lower().strip()
        # Translate short model labels ("hap", "ang", ...) to canonical names.
        canonical = MODEL_TO_EMOTION_MAP.get(raw, raw)
        emotion_counts[canonical] = emotion_counts.get(canonical, 0) + 1

    # BUGFIX: guard against an empty chunk list (e.g. audio shorter than the
    # minimum chunk length) which previously raised ZeroDivisionError here.
    total = len(all_emotions) or 1
    emotion_percentages = {e: (emotion_counts.get(e, 0) / total * 100) for e in emotion_labels}

    sorted_emotions = sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True)

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), height_ratios=[3, 1],
                                   gridspec_kw={'hspace': 0.3})

    emotions_labels_disp = [item[0].capitalize() for item in sorted_emotions]
    percentages = [item[1] for item in sorted_emotions]

    # Fixed palette; pad with grey if more labels than colors ever appear.
    colors = ['red', 'brown', 'purple', 'green', 'gray', 'blue', 'orange']
    bar_colors = (colors + ['#666666'] * max(0, len(emotions_labels_disp) - len(colors)))[:len(emotions_labels_disp)]

    bars = ax1.bar(emotions_labels_disp, percentages, color=bar_colors)
    for bar in bars:
        h = bar.get_height()
        ax1.annotate(f'{h:.1f}%',
                     xy=(bar.get_x() + bar.get_width() / 2, h),
                     xytext=(0, 3), textcoords="offset points",
                     ha='center', va='bottom')
    ax1.set_ylim(0, 100)
    ax1.set_ylabel('Percentage (%)')
    ax1.set_title('Emotion Distribution')
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Collapse emotion percentages into positive/neutral/negative tones.
    tone_percentages = {"positive": 0, "neutral": 0, "negative": 0}
    for emotion, pct in emotion_percentages.items():
        for tone, elist in TONE_MAPPING.items():
            if emotion in elist:
                tone_percentages[tone] += pct

    tone_colors = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}
    tone_bars = ax2.bar(list(tone_percentages.keys()),
                        list(tone_percentages.values()),
                        color=[tone_colors[t] for t in tone_percentages])
    for bar in tone_bars:
        h = bar.get_height()
        if h > 0:  # skip labels on zero-height bars
            ax2.annotate(f'{h:.1f}%',
                         xy=(bar.get_x() + bar.get_width() / 2, h),
                         xytext=(0, 3), textcoords="offset points",
                         ha='center', va='bottom')
    ax2.set_ylim(0, 100)
    ax2.set_ylabel('Percentage (%)')
    ax2.set_title('Tone Analysis')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)

    plt.tight_layout()

    # Per-chunk rows for the "Detailed Emotion Analysis" DataFrame.
    detailed_results = []
    for emotions, (start_time, end_time) in zip(all_emotions, time_points):
        if not emotions:
            continue
        top = max(emotions, key=lambda x: x['score'])
        raw = top['label'].lower().strip()
        canonical = MODEL_TO_EMOTION_MAP.get(raw, raw)
        tone = next((t for t, el in TONE_MAPPING.items() if canonical in el), "unknown")
        detailed_results.append({
            'Time Range': f"{start_time:.1f}s - {end_time:.1f}s",
            'Emotion': canonical,
            'Tone': tone.capitalize(),
            'Confidence': f"{top['score']:.2f}",
            'Description': EMOTION_DESCRIPTIONS.get(canonical, ""),
        })

    return fig, detailed_results
275
 
276
def generate_emotion_summary(all_emotions, time_points):
    """Build a markdown summary of the per-chunk emotion classifications.

    Counts the top-scoring (canonical) emotion of each chunk, reports the
    dominant one with its description, and lists the full distribution.
    Returns "No emotional content detected." when there is nothing to count.
    """
    if not all_emotions:
        return "No emotional content detected."

    counts = {}
    n_chunks = len(all_emotions)
    for chunk_results in all_emotions:
        if not chunk_results:
            continue
        best = max(chunk_results, key=lambda item: item['score'])
        raw_label = best['label'].lower().strip()
        # Short model labels ("hap", "ang", ...) → canonical names.
        name = MODEL_TO_EMOTION_MAP.get(raw_label, raw_label)
        counts[name] = counts.get(name, 0) + 1

    if not counts:
        return "No emotional content detected."

    percentages = {name: count / n_chunks * 100 for name, count in counts.items()}
    dominant = max(percentages.items(), key=lambda kv: kv[1])[0]

    distribution = "".join(
        f"- {name.capitalize()}: {pct:.1f}%\n"
        for name, pct in sorted(percentages.items(), key=lambda kv: kv[1], reverse=True)
    )
    return (
        "### Voice Emotion Analysis Summary\n\n"
        f"**Dominant emotion:** {dominant.capitalize()} ({percentages[dominant]:.1f}%)\n\n"
        f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant, '')}\n\n"
        "**Emotion distribution:**\n"
        + distribution
        + f"\n**Interpretation:** The voice predominantly expresses {dominant} emotion"
    )
304
 
305
+ # ── Gradio handlers ───────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
306
 
307
def process_audio(audio_file, progress=gr.Progress()):
    """Gradio click handler: validate the input, then delegate to analyze_audio_emotions."""
    if audio_file is None:
        return None, None, "No audio file provided.", None

    outcome = analyze_audio_emotions(audio_file, progress)
    img_path, processed_audio, summary, results = outcome
    if img_path is None:
        # Surface the analyzer's own error text when it produced one.
        return None, None, summary or "Failed to analyze audio emotions.", None
    return img_path, processed_audio, summary, results
314
 
315
+ # ── Gradio UI ─────────────────────────────────────────────────────────────────
316
+
317
# ── Gradio UI ─────────────────────────────────────────────────────────────────
# Two tabs (upload / record) that share the same output layout and the same
# process_audio handler.
with gr.Blocks(title="Voice Emotion Analysis System") as demo:
    gr.Markdown("""
    # 🎙️ Voice Emotion Analysis System

    This app analyzes the emotional content of voice recordings.

    It detects emotions including:

    * 😡 **Anger** &nbsp; 🤢 **Disgust** &nbsp; 😨 **Fear** &nbsp; 😊 **Happiness**
    * 😐 **Neutral** &nbsp; 😒 **Sadness** &nbsp; 😲 **Surprise**

    And provides a detailed analysis and timeline.
    """)

    with gr.Tabs():
        with gr.TabItem("Upload Audio"):
            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(label="Upload Audio File", type="filepath", sources=["upload"])
                    process_btn = gr.Button("Analyze Voice Emotions")
                with gr.Column(scale=2):
                    emotion_timeline = gr.Image(label="Emotion Timeline")
            with gr.Row():
                audio_playback = gr.Audio(label="Processed Audio")
                emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis",
                )
            process_btn.click(
                fn=process_audio,
                inputs=[audio_input],
                outputs=[emotion_timeline, audio_playback, emotion_summary, emotion_results],
            )

        with gr.TabItem("Record Voice"):
            with gr.Row():
                with gr.Column(scale=1):
                    record_input = gr.Audio(label="Record Your Voice", sources=["microphone"], type="filepath")
                    analyze_btn = gr.Button("Analyze Recording")
                with gr.Column(scale=2):
                    rec_emotion_timeline = gr.Image(label="Emotion Timeline")
            with gr.Row():
                rec_audio_playback = gr.Audio(label="Processed Audio")
                rec_emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                rec_emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis",
                )
            analyze_btn.click(
                fn=process_audio,
                inputs=[record_input],
                outputs=[rec_emotion_timeline, rec_audio_playback, rec_emotion_summary, rec_emotion_results],
            )

    gr.Markdown("""
    ### How to Use

    1. **Upload Audio Tab:** Upload an audio file and click "Analyze Voice Emotions".
    2. **Record Voice Tab:** Record your voice and click "Analyze Recording".

    **Tips:**
    - Use clear recordings with minimal background noise.
    - Longer recordings yield more consistent results.
    """)
384
 
385
+
386
  def initialize_app():
387
  print("Initializing voice emotion analysis app...")
388
  if load_emotion_model():
 
390
  else:
391
  print("Failed to load emotion model.")
392
 
393
+
394
if __name__ == "__main__":
    # Pre-load the emotion model (via initialize_app) before starting the
    # Gradio server, so the first request does not pay the loading cost.
    initialize_app()
    demo.launch()