# voice_emotion_classification.py
"""Gradio app that classifies the emotional content of voice recordings.

The script is self-bootstrapping:
  Phase 1 installs any missing packages (CPU-only torch, pinned transformers),
  Phase 2 patches a transformers/httpx incompatibility,
  Phase 3 performs the real imports, then the Gradio UI is built and launched.
"""
import os
import subprocess
import sys
import time  # noqa: F401
import tempfile
import warnings

warnings.filterwarnings("ignore")


def run_pip(*args):
    """Run a pip install command and raise on failure."""
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "--no-cache-dir"] + list(args)
    )


# ── Phase 1: Install packages ─────────────────────────────────────────────────
# FIX 1: Use importlib-based checks instead of deprecated pkg_resources.
# FIX 2: torch → CPU-only wheel (~190 MB vs ~900 MB CUDA) to avoid disk quota.
# FIX 3: transformers pinned to 4.46.3 (last v4); v5 dropped audio-classification
#        pipeline support for many models AND is much larger on disk.
# FIX 4: torchaudio pulled without the CUDA index so it stays CPU-only too.
print("=== Installing gradio (if needed) ===")
try:
    import gradio  # noqa: F401
    print("gradio already installed.")
except ImportError:
    run_pip("gradio")

print("=== Installing torch CPU-only (if needed) ===")
try:
    import torch  # noqa: F401
    print("torch already installed.")
except ImportError:
    run_pip("torch", "torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu")

print("=== Installing torchaudio (if needed) ===")
try:
    import torchaudio  # noqa: F401
    print("torchaudio already installed.")
except ImportError:
    run_pip("torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu")

print("=== Installing transformers 4.46.3 (if needed) ===")
try:
    import transformers as _tf
    if _tf.__version__ != "4.46.3":
        raise ImportError("wrong version")
    print("transformers 4.46.3 already installed.")
except (ImportError, AttributeError):
    run_pip("transformers==4.46.3")

print("=== Installing remaining packages (if needed) ===")
for pkg in ["librosa", "scipy", "matplotlib", "pydub"]:
    try:
        __import__(pkg)
        print(f"{pkg} already installed.")
    except ImportError:
        run_pip(pkg)

# ── Phase 2: Patch transformers get_session → requests.Session ───────────────
# transformers 4.46.3 calls get_session().head(..., allow_redirects=, proxies=, ...)
# In this environment get_session() returns an httpx.Client (gradio depends on
# httpx), which rejects every requests-style kwarg.
# Fix: replace get_session in the already-imported module namespace so it always
# returns a plain requests.Session, which accepts all those kwargs natively.
import transformers.utils.hub as _t_hub  # noqa: E402
import requests as _requests  # noqa: E402

_t_hub.get_session = lambda: _requests.Session()
print("Patched transformers.utils.hub.get_session → requests.Session()")

# ── Phase 3: Safe imports ─────────────────────────────────────────────────────
import numpy as np  # noqa: E402
import gradio as gr  # noqa: E402
import torch  # noqa: E402, F811
import torchaudio  # noqa: E402, F401, F811
import librosa  # noqa: E402
import matplotlib  # noqa: E402
matplotlib.use('Agg')
import matplotlib.pyplot as plt  # noqa: E402
from pydub import AudioSegment  # noqa: E402
# FIX: `import scipy` alone does not load the scipy.io.wavfile submodule;
# scipy.io.wavfile.write() previously only worked because librosa happened to
# import it transitively. Import it explicitly.
import scipy.io.wavfile  # noqa: E402
import io  # noqa: E402, F401
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification  # noqa: E402, F401
from pathlib import Path  # noqa: E402, F401

# ── Emotion metadata ──────────────────────────────────────────────────────────
# Human-readable description for each canonical emotion label.
EMOTION_DESCRIPTIONS = {
    "angry": "Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.",
    "disgust": "Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.",
    "fear": "Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.",
    "happy": "Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.",
    "neutral": "Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.",
    "sad": "Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.",
    "surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.",
}

# Coarse tone buckets used for the second chart.
TONE_MAPPING = {
    "positive": ["happy", "surprise"],
    "neutral": ["neutral"],
    "negative": ["angry", "sad", "fear", "disgust"],
}

# The superb/hubert model emits abbreviated labels; map them to canonical names.
MODEL_TO_EMOTION_MAP = {
    "hap": "happy",
    "ang": "angry",
    "sad": "sad",
    "dis": "disgust",
    "fea": "fear",
    "neu": "neutral",
    "sur": "surprise",
}

# ── Model loading ─────────────────────────────────────────────────────────────
audio_emotion_classifier = None  # lazily-loaded HF audio-classification pipeline


def load_emotion_model():
    """Load the emotion classifier once; return True on success.

    Subsequent calls are no-ops returning True while the model stays cached
    in the module-global `audio_emotion_classifier`.
    """
    global audio_emotion_classifier
    if audio_emotion_classifier is None:
        try:
            print("Loading emotion classification model...")
            model_name = "superb/hubert-large-superb-er"
            audio_emotion_classifier = pipeline("audio-classification", model=model_name)
            print("Emotion classification model loaded successfully")
            return True
        except Exception as e:
            print(f"Error loading emotion model: {e}")
            return False
    return True


# ── Audio helpers ─────────────────────────────────────────────────────────────
def convert_audio_to_wav(audio_file):
    """Convert any pydub-readable audio file to a temp WAV; return its path.

    Returns None if the file cannot be decoded. The caller owns the temp file.
    """
    try:
        audio = AudioSegment.from_file(audio_file)
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
            audio.export(tmp.name, format="wav")
            return tmp.name
    except Exception as e:
        print(f"Error converting audio: {e}")
        return None


def _count_top_emotions(all_emotions):
    """Count how many chunks had each canonical emotion as their top label.

    `all_emotions` is a list of per-chunk classifier outputs (lists of
    {'label', 'score'} dicts). Empty chunk results are skipped.
    """
    counts = {}
    for emotions in all_emotions:
        if not emotions:
            continue
        top = max(emotions, key=lambda x: x['score'])
        raw = top['label'].lower().strip()
        canonical = MODEL_TO_EMOTION_MAP.get(raw, raw)
        counts[canonical] = counts.get(canonical, 0) + 1
    return counts


def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=5):
    """Classify emotions over `chunk_duration`-second windows of an audio file.

    Returns (timeline_image_path, processed_audio_path, summary_markdown,
    detailed_results) — or (None, None, error_message, None) on failure.
    """
    if not load_emotion_model():
        return None, None, "Failed to load emotion classification model.", None

    audio_path = audio_file if audio_file.endswith('.wav') else convert_audio_to_wav(audio_file)
    if not audio_path:
        return None, None, "Failed to process audio file. Unsupported format or corrupted file.", None

    try:
        # The classifier expects 16 kHz mono input.
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
        chunk_samples = int(chunk_duration * sample_rate)
        num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))

        all_emotions, time_points = [], []
        for i in range(num_chunks):
            progress((i + 1) / num_chunks, "Analyzing audio emotions...")
            start_idx = i * chunk_samples
            end_idx = min(start_idx + chunk_samples, len(audio_data))
            chunk = audio_data[start_idx:end_idx]
            # Chunks shorter than half a second carry too little signal.
            if len(chunk) < 0.5 * sample_rate:
                continue

            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
                chunk_path = tmp.name
            scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))
            # FIX: ensure the per-chunk temp file is removed even if the
            # classifier raises (previously it leaked on error).
            try:
                results = audio_emotion_classifier(chunk_path)
            finally:
                os.unlink(chunk_path)

            all_emotions.append(results)
            time_points.append((start_idx / sample_rate, end_idx / sample_rate))

        # FIX: previously an all-skipped recording (< 0.5 s of audio) crashed
        # with ZeroDivisionError inside generate_emotion_timeline.
        if not all_emotions:
            return None, None, "Audio too short to analyze. Please provide at least 0.5 seconds of audio.", None

        fig, detailed_results = generate_emotion_timeline(
            all_emotions, time_points, len(audio_data) / sample_rate)
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
            img_path = tmp.name
        fig.savefig(img_path, dpi=100, bbox_inches='tight')
        plt.close(fig)

        summary = generate_emotion_summary(all_emotions, time_points)
        return img_path, audio_path, summary, detailed_results
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, None, f"Error analyzing audio: {str(e)}", None


# ── Visualisation & summary ───────────────────────────────────────────────────
def generate_emotion_timeline(all_emotions, time_points, duration):
    """Build the distribution/tone charts and the per-chunk results table.

    Parameters:
        all_emotions: per-chunk classifier outputs (lists of {'label','score'}).
        time_points:  (start_s, end_s) tuples parallel to all_emotions.
        duration:     total audio duration in seconds (kept for interface
                      stability; not used in the current charts).

    Returns (matplotlib Figure, list of per-chunk result dicts).
    """
    emotion_labels = list(EMOTION_DESCRIPTIONS.keys())
    emotion_counts = _count_top_emotions(all_emotions)

    # FIX: guard the denominator so an empty input cannot raise
    # ZeroDivisionError (all percentages then come out as 0).
    total = len(all_emotions) or 1
    emotion_percentages = {e: (emotion_counts.get(e, 0) / total * 100) for e in emotion_labels}
    sorted_emotions = sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True)

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), height_ratios=[3, 1],
                                   gridspec_kw={'hspace': 0.3})

    # Top chart: per-emotion percentage bars, sorted descending.
    emotions_labels_disp = [item[0].capitalize() for item in sorted_emotions]
    percentages = [item[1] for item in sorted_emotions]
    colors = ['red', 'brown', 'purple', 'green', 'gray', 'blue', 'orange']
    # Pad with a fallback colour in case the model emits an unknown label.
    bar_colors = (colors + ['#666666'] * max(0, len(emotions_labels_disp) - len(colors)))[:len(emotions_labels_disp)]
    bars = ax1.bar(emotions_labels_disp, percentages, color=bar_colors)
    for bar in bars:
        h = bar.get_height()
        ax1.annotate(f'{h:.1f}%', xy=(bar.get_x() + bar.get_width() / 2, h),
                     xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')
    ax1.set_ylim(0, 100)
    ax1.set_ylabel('Percentage (%)')
    ax1.set_title('Emotion Distribution')
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Bottom chart: positive/neutral/negative tone aggregation.
    tone_percentages = {"positive": 0, "neutral": 0, "negative": 0}
    for emotion, pct in emotion_percentages.items():
        for tone, elist in TONE_MAPPING.items():
            if emotion in elist:
                tone_percentages[tone] += pct
    tone_colors = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}
    tone_bars = ax2.bar(list(tone_percentages.keys()), list(tone_percentages.values()),
                        color=[tone_colors[t] for t in tone_percentages])
    for bar in tone_bars:
        h = bar.get_height()
        if h > 0:
            ax2.annotate(f'{h:.1f}%', xy=(bar.get_x() + bar.get_width() / 2, h),
                         xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')
    ax2.set_ylim(0, 100)
    ax2.set_ylabel('Percentage (%)')
    ax2.set_title('Tone Analysis')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()

    # Per-chunk table rows for the DataFrame component.
    detailed_results = []
    for emotions, (start_time, end_time) in zip(all_emotions, time_points):
        if not emotions:
            continue
        top = max(emotions, key=lambda x: x['score'])
        raw = top['label'].lower().strip()
        canonical = MODEL_TO_EMOTION_MAP.get(raw, raw)
        tone = next((t for t, el in TONE_MAPPING.items() if canonical in el), "unknown")
        detailed_results.append({
            'Time Range': f"{start_time:.1f}s - {end_time:.1f}s",
            'Emotion': canonical,
            'Tone': tone.capitalize(),
            'Confidence': f"{top['score']:.2f}",
            'Description': EMOTION_DESCRIPTIONS.get(canonical, ""),
        })
    return fig, detailed_results


def generate_emotion_summary(all_emotions, time_points):
    """Return a markdown summary of dominant emotion and distribution."""
    if not all_emotions:
        return "No emotional content detected."
    total = len(all_emotions)
    emotion_counts = _count_top_emotions(all_emotions)
    if not emotion_counts:
        return "No emotional content detected."

    emotion_percentages = {e: (c / total * 100) for e, c in emotion_counts.items()}
    dominant = max(emotion_percentages, key=lambda x: emotion_percentages[x])

    summary = "### Voice Emotion Analysis Summary\n\n"
    summary += f"**Dominant emotion:** {dominant.capitalize()} ({emotion_percentages[dominant]:.1f}%)\n\n"
    summary += f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant, '')}\n\n"
    summary += "**Emotion distribution:**\n"
    for emotion, pct in sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True):
        summary += f"- {emotion.capitalize()}: {pct:.1f}%\n"
    summary += f"\n**Interpretation:** The voice predominantly expresses {dominant} emotion"
    return summary


# ── Gradio handlers ───────────────────────────────────────────────────────────
def process_audio(audio_file, progress=gr.Progress()):
    """Gradio click handler shared by the upload and record tabs."""
    if audio_file is None:
        return None, None, "No audio file provided.", None
    img_path, processed_audio, summary, results = analyze_audio_emotions(audio_file, progress)
    if img_path is None:
        return None, None, summary or "Failed to analyze audio emotions.", None
    return img_path, processed_audio, summary, results


# ── Gradio UI ─────────────────────────────────────────────────────────────────
with gr.Blocks(title="Voice Emotion Analysis System") as demo:
    gr.Markdown("""
    # 🎙️ Voice Emotion Analysis System

    This app analyzes the emotional content of voice recordings. It detects emotions including:

    * 😡 **Anger**   🤢 **Disgust**   😨 **Fear**   😊 **Happiness**
    * 😐 **Neutral**   😢 **Sadness**   😲 **Surprise**

    And provides a detailed analysis and timeline.
    """)

    with gr.Tabs():
        with gr.TabItem("Upload Audio"):
            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(label="Upload Audio File", type="filepath", sources=["upload"])
                    process_btn = gr.Button("Analyze Voice Emotions")
                with gr.Column(scale=2):
                    emotion_timeline = gr.Image(label="Emotion Timeline")
            with gr.Row():
                audio_playback = gr.Audio(label="Processed Audio")
                emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis",
                )
            process_btn.click(
                fn=process_audio,
                inputs=[audio_input],
                outputs=[emotion_timeline, audio_playback, emotion_summary, emotion_results],
            )

        with gr.TabItem("Record Voice"):
            with gr.Row():
                with gr.Column(scale=1):
                    record_input = gr.Audio(label="Record Your Voice", sources=["microphone"], type="filepath")
                    analyze_btn = gr.Button("Analyze Recording")
                with gr.Column(scale=2):
                    rec_emotion_timeline = gr.Image(label="Emotion Timeline")
            with gr.Row():
                rec_audio_playback = gr.Audio(label="Processed Audio")
                rec_emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                rec_emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis",
                )
            analyze_btn.click(
                fn=process_audio,
                inputs=[record_input],
                outputs=[rec_emotion_timeline, rec_audio_playback, rec_emotion_summary, rec_emotion_results],
            )

    gr.Markdown("""
    ### How to Use
    1. **Upload Audio Tab:** Upload an audio file and click "Analyze Voice Emotions".
    2. **Record Voice Tab:** Record your voice and click "Analyze Recording".

    **Tips:**
    - Use clear recordings with minimal background noise.
    - Longer recordings yield more consistent results.
    """)


def initialize_app():
    """Warm up the emotion model before the UI starts serving requests."""
    print("Initializing voice emotion analysis app...")
    if load_emotion_model():
        print("Emotion model loaded successfully!")
    else:
        print("Failed to load emotion model.")


if __name__ == "__main__":
    initialize_app()
    demo.launch()