voice-cloner-2

Sleeping

File size: 39,788 Bytes

import os
import sys
import threading
import numpy as np
import soundfile as sf
import shutil
import librosa
import gradio as gr

# torch and NeuTTSAir imported lazily in get_tts() to avoid slow startup / OOM on Render

# ---------------------------
# eSpeak check (Windows + Linux)
# ---------------------------
def check_espeak_installed():
    """Locate an eSpeak / eSpeak NG installation and report whether one exists.

    When a shared library is found, its path is exported through the
    ``PHONEMIZER_ESPEAK_LIBRARY`` environment variable. On Windows the
    install directory may also be prepended to ``PATH``.

    Returns:
        bool: True if a usable installation was detected, False otherwise
        (after printing installation hints).
    """
    # A library path that is already exported (e.g. by Docker) and exists wins.
    preset = os.environ.get("PHONEMIZER_ESPEAK_LIBRARY")
    if preset and os.path.exists(preset):
        print(f"Using espeak library from env: {os.environ['PHONEMIZER_ESPEAK_LIBRARY']}")
        return True

    if sys.platform != "win32":
        # Linux: probe the common library directories for a shared object.
        library_names = ("libespeak-ng.so", "libespeak-ng.so.1", "libespeak.so")
        library_dirs = ("/usr/lib", "/usr/lib/x86_64-linux-gnu", "/usr/local/lib")
        candidates = (
            os.path.join(directory, library)
            for directory in library_dirs
            if os.path.isdir(directory)
            for library in library_names
        )
        found = next((path for path in candidates if os.path.exists(path)), None)
        if found is not None:
            os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = found
            print(f"Found espeak library at: {found}")
            return True
        # No library file located; an executable on PATH is still accepted.
        if any(shutil.which(command) for command in ("espeak-ng", "espeak")):
            print("Found espeak-ng in PATH (phonemizer may use default library)")
            return True
        print("\nError: espeak-ng not found! On Linux install with: apt-get install espeak-ng libespeak-ng-dev")
        return False

    # Windows
    dll_names = ("libespeak-ng.dll", "espeak-ng.dll", "libespeak.dll", "espeak.dll")
    install_dirs = (
        r"C:\Program Files\eSpeak NG",
        r"C:\Program Files (x86)\eSpeak NG",
        r"C:\Program Files\eSpeak",
        r"C:\Program Files (x86)\eSpeak",
    )

    # First try: a DLL sitting next to an executable that is already on PATH.
    for command in ("espeak-ng", "espeak"):
        located = shutil.which(command)
        if not located:
            continue
        print(f"Found {command} in PATH at: {located}")
        folder = os.path.dirname(located)
        for dll_name in dll_names:
            dll_path = os.path.join(folder, dll_name)
            if os.path.exists(dll_path):
                os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = dll_path
                print(f"Found espeak shared library at: {dll_path}")
                return True

    # Second try: walk the default install directories looking for a DLL.
    for install_dir in install_dirs:
        if not os.path.exists(install_dir):
            continue
        for current_dir, _, _ in os.walk(install_dir):
            for dll_name in dll_names:
                dll_path = os.path.join(current_dir, dll_name)
                if os.path.exists(dll_path):
                    os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = dll_path
                    os.environ["PATH"] = f"{install_dir};{os.environ['PATH']}"
                    return True
        # Only the executable is present: expose it on PATH and stop searching.
        if os.path.exists(os.path.join(install_dir, "espeak-ng.exe")):
            os.environ["PATH"] = f"{install_dir};{os.environ['PATH']}"
            break

    print("\nError: espeak-ng not found!")
    print("Install from https://github.com/espeak-ng/espeak-ng/releases")
    return False


# Stop with exit status 1 when no eSpeak installation was detected.
if not check_espeak_installed():
    raise SystemExit(1)

# ---------------------------
# Model initialization (deferred so server can bind to PORT first for Render)
# ---------------------------
# Shared TTS model instance; stays None until get_tts() assigns it on first use.
tts = None
# Held by get_tts() around its whole check-and-load sequence.
_tts_lock = threading.Lock()


def get_tts():
    """Return the shared NeuTTSAir instance, building it on the first call.

    Construction is deferred to first use so that importing this module does
    not load torch or the model. The whole check-and-load sequence runs while
    holding ``_tts_lock``.

    Returns:
        The module-level ``tts`` object (created here if it was still None).
    """
    global tts
    with _tts_lock:
        if tts is not None:
            return tts

        import torch
        from neuttsair.neutts import NeuTTSAir

        def _resolve_hf_snapshot(root_path: str) -> str:
            """Return the first ``models--*/snapshots/*`` dir holding config.json, else ``root_path``."""
            try:
                for entry in os.listdir(root_path):
                    if not entry.startswith("models--"):
                        continue
                    snapshots_dir = os.path.join(root_path, entry, "snapshots")
                    if not os.path.isdir(snapshots_dir):
                        continue
                    for snapshot in os.listdir(snapshots_dir):
                        snap_path = os.path.join(snapshots_dir, snapshot)
                        if os.path.exists(os.path.join(snap_path, "config.json")):
                            print(f"Found model in snapshots: {snap_path}")
                            return snap_path
            except Exception as e:
                print(f"Warning: Error resolving model path: {e}")
            return root_path

        print("\nLoading TTS model (first use)...")
        has_cuda = torch.cuda.is_available()
        total_gb = None
        if has_cuda:
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
            total_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
            print(f"GPU Memory: {total_gb:.2f} GB total")

        # Prefer a local copy under ./Models/neutts-air next to this file;
        # otherwise fall back to the hub repo id.
        models_root = os.path.join(
            os.path.abspath(os.path.dirname(__file__)), "Models", "neutts-air"
        )
        if os.path.isdir(models_root):
            backbone_arg = _resolve_hf_snapshot(models_root)
        else:
            # Use full transformers model (neuphonic/neutts-air) to avoid llama-cpp build on cloud
            backbone_arg = "neuphonic/neutts-air"
        print(f"Using backbone: {backbone_arg}")
        print("Using codec: neuphonic/neucodec")

        backbone_device = codec_device = "cuda" if has_cuda else "cpu"
        if not has_cuda:
            print("No CUDA GPU detected. Using CPU for backbone and codec.")
        elif total_gb <= 4.5:
            # Small GPUs: keep the codec off the GPU.
            print(f"Detected {total_gb:.2f} GB GPU. Loading codec on CPU to save GPU memory.")
            codec_device = "cpu"

        tts = NeuTTSAir(
            backbone_repo=backbone_arg,
            backbone_device=backbone_device,
            codec_repo="neuphonic/neucodec",
            codec_device=codec_device,
        )
        if has_cuda:
            torch.cuda.empty_cache()
        print("TTS model loaded.")
        return tts

# ---------------------------
# Voice loading logic
# ---------------------------
# Registry of known voices: VOICES["samples"][name] -> (transcript path, .wav or .pt path).
VOICES = {"samples": {}}
voice_dir = "samples"
os.makedirs(voice_dir, exist_ok=True)

# A voice is registered for every "<base>.txt" that has a matching "<base>.wav"
# or "<base>.pt"; when both exist the .wav is the one recorded.
for name in os.listdir(voice_dir):
    if not name.endswith(".txt"):
        continue
    base = os.path.splitext(name)[0]
    txt_path = os.path.join(voice_dir, f"{base}.txt")
    wav_path = os.path.join(voice_dir, f"{base}.wav")
    pt_path = os.path.join(voice_dir, f"{base}.pt")
    if not os.path.exists(txt_path):
        continue
    if os.path.exists(wav_path):
        VOICES["samples"][base] = (txt_path, wav_path)
    elif os.path.exists(pt_path):
        VOICES["samples"][base] = (txt_path, pt_path)

def format_voice_choice(name):
    """Return the display label for a voice: ``name`` prefixed with ``Voice: ``."""
    return "Voice: {}".format(name)

# ---------------------------
# Core functions
# ---------------------------
def load_reference(voice_name):
    """Load the transcript and reference codes for a registered voice.

    Args:
        voice_name: Key into ``VOICES["samples"]``.

    Returns:
        tuple: ``(ref_text, ref_codes)`` where ``ref_text`` is the stripped
        transcript and ``ref_codes`` is either loaded from the saved ``.pt``
        file or produced by ``get_tts().encode_reference`` from the audio file.

    Raises:
        KeyError: If ``voice_name`` is not registered.
    """
    txt_path, audio_or_pt = VOICES["samples"][voice_name]
    # Context manager so the transcript file handle is always closed.
    with open(txt_path, "r") as transcript:
        ref_text = transcript.read().strip()
    if audio_or_pt.endswith(".pt"):
        # Imported here so the audio branch does not need torch at this point.
        import torch

        # NOTE(security): torch.load unpickles the file; only load .pt files
        # that this application wrote itself.
        ref_codes = torch.load(audio_or_pt)
    else:
        ref_codes = get_tts().encode_reference(audio_or_pt)
    return ref_text, ref_codes


def _split_into_sentences(text):
    """Split text at whitespace that follows ., ! or ? and make every sentence end in punctuation."""
    import re

    sentences = []
    # Only whitespace preceded by sentence punctuation is a boundary, so
    # decimals ("3.14"), dotted names and URLs stay in one piece.
    for fragment in re.split(r'(?<=[.!?])\s+', text):
        sentence = fragment.strip()
        if not sentence:
            continue
        # Look past closing quotes/brackets so 'He said "Hi."' gets no extra period.
        if not sentence.rstrip('"\')]').endswith(('.', '!', '?')):
            sentence += '.'
        sentences.append(sentence)
    return sentences


def _split_long_sentence(sentence, max_length):
    """Break an over-long sentence into comma clauses, then into word groups no longer than max_length."""
    import re

    pieces = []
    tokens = re.split(r'(,)', sentence)  # [clause, ',', clause, ',', ..., clause]
    for idx in range(0, len(tokens), 2):
        clause = tokens[idx].strip()
        comma = tokens[idx + 1] if idx + 1 < len(tokens) else ''

        if len(clause) <= max_length:
            if clause or comma:
                pieces.append(clause + comma)
            continue

        # The clause alone is too long: pack whole words into groups. A single
        # word longer than max_length is kept intact as its own group.
        group, group_len = [], 0
        for word in clause.split():
            added = len(word) + (1 if group else 0)
            if group and group_len + added > max_length:
                pieces.append(' '.join(group))
                group, group_len = [word], len(word)
            else:
                group.append(word)
                group_len += added
        tail = ' '.join(group) + comma
        if tail:
            pieces.append(tail)
    return pieces


def split_text_into_chunks(text, max_length=150):
    """Split text into chunks of at most ``max_length`` characters where possible.

    Sentences are kept whole and packed greedily into chunks (joined by a
    single space). A sentence longer than ``max_length`` is broken at commas,
    and a clause that is still too long is broken between words. A single
    word longer than ``max_length`` is never cut, so such a chunk may exceed
    the limit. Sentences without terminal punctuation get a trailing period.
    Repeated text is preserved; nothing is de-duplicated.

    Args:
        text: Input text.
        max_length: Target maximum chunk length in characters.

    Returns:
        list[str]: Chunks in reading order; empty if ``text`` is blank.
    """
    text = text.strip()
    if not text:
        return []

    chunks = []
    current = ""
    for sentence in _split_into_sentences(text):
        if len(sentence) > max_length:
            pieces = _split_long_sentence(sentence, max_length)
        else:
            pieces = [sentence]

        for piece in pieces:
            if current and len(current) + 1 + len(piece) > max_length:
                chunks.append(current)
                current = piece
            else:
                current = f"{current} {piece}" if current else piece

    # Flush whatever is left in the buffer.
    if current:
        chunks.append(current)
    return chunks


def process_chunk(chunk, ref_codes, ref_text, tts_model):
    """Synthesize one text chunk, returning its audio or None on failure.

    Args:
        chunk: Text to synthesize.
        ref_codes: Reference voice codes passed through to ``tts_model.infer``.
        ref_text: Reference transcript passed through to ``tts_model.infer``.
        tts_model: Object exposing ``infer(chunk, ref_codes, ref_text)``.

    Returns:
        Whatever ``tts_model.infer`` returns, or None if it raised. The
        failure is printed so a skipped chunk is not silent; the caller
        decides how to handle the None.
    """
    try:
        return tts_model.infer(chunk, ref_codes, ref_text)
    except Exception as e:
        # Best-effort by design: one bad chunk must not abort the whole run.
        print(f"Warning: TTS inference failed for chunk {chunk!r}: {e}")
        return None

def estimate_generation_time(num_chunks, *, seconds_per_chunk=3, overhead_seconds=2):
    """Estimate total generation time in seconds.

    Args:
        num_chunks: Number of text chunks to synthesize.
        seconds_per_chunk: Assumed average cost of one chunk (default 3).
        overhead_seconds: Fixed overhead added once (default 2).

    Returns:
        ``num_chunks * seconds_per_chunk + overhead_seconds``.
    """
    return num_chunks * seconds_per_chunk + overhead_seconds

def format_time(seconds):
    """Format a duration in seconds as a readable string with 0.1 s precision.

    The value is rounded to tenths of a second *before* it is split into
    minutes and seconds, so the seconds part can never be displayed as
    ``60.0`` (e.g. 59.96 becomes "1 minute 0.0 seconds").

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        str: ``"S.S seconds"`` below one minute, otherwise
        ``"M minute(s) S.S seconds"``.
    """
    tenths = int(round(seconds * 10))
    if tenths < 600:
        return f"{tenths / 10:.1f} seconds"
    minutes, rest = divmod(tenths, 600)
    return f"{minutes} minute{'s' if minutes != 1 else ''} {rest / 10:.1f} seconds"

def generate_speech(text, voice_name, speed_control="1x"):
    """Synthesize ``text`` in the selected voice, yielding progress updates.

    This is a generator. Every yield is a 4-tuple
    ``(progress, audio_path, status, None)`` matching the four outputs wired
    to the Generate button (progress slider, audio player, status box, and
    the hidden delete-status box). ``audio_path`` is None until the final
    yield, which carries the path of the written WAV file.

    Args:
        text: Text to convert to speech.
        voice_name: Key into ``VOICES["samples"]``.
        speed_control: Playback rate such as ``"1.2x"``; values that cannot
            be parsed, or that are not finite positive numbers, fall back
            to 1.0.

    Yields:
        tuple: ``(progress 0-100, audio path or None, status text, None)``.
        Any error is reported as a status message instead of being raised.
    """
    try:
        import time

        sample_rate = 24000  # rate used for the silence padding and the output file
        pause_seconds = 0.25  # silence inserted between consecutive chunks

        # Input validations
        if not text or not text.strip():
            yield 0, None, "❌ Error: Input text cannot be empty.", None
            return

        if not voice_name:
            yield 0, None, "❌ Error: No voice selected. Please select a voice.", None
            return

        if voice_name not in VOICES["samples"]:
            yield 0, None, f"❌ Error: Voice '{voice_name}' not found.", None
            return

        # Convert speed control to a float; anything unusable means normal speed.
        try:
            speed = float(str(speed_control).strip().rstrip('xX'))
        except ValueError:
            speed = 1.0
        if not 0 < speed < float("inf"):
            # Rejects zero, negatives, inf and NaN, none of which are valid rates.
            speed = 1.0

        # Load TTS model on first use (deferred so server can bind to PORT first)
        yield 5, None, "Loading TTS model (first time may take a few minutes)...", None
        try:
            tts_instance = get_tts()
        except Exception as e:
            yield 0, None, f"❌ Failed to load TTS model: {str(e)}", None
            return

        start_time = time.time()
        yield 10, None, "Loading voice reference...", None
        ref_text, ref_codes = load_reference(voice_name)

        # Split text into smaller chunks for better processing
        chunks = split_text_into_chunks(text)
        total_chunks = len(chunks)

        if total_chunks == 0:
            raise ValueError("No text to process")

        # Estimate total time
        estimated_time = estimate_generation_time(total_chunks)
        status = f"Estimated time to completion: {format_time(estimated_time)}\nProcessing {total_chunks} chunks..."
        yield 15, None, status, None

        # Chunks are synthesized sequentially, so appending keeps them in order.
        audio_chunks = []
        failed_chunks = 0
        for i, chunk in enumerate(chunks, 1):
            # Progress runs from 15% to 90% across the chunk loop.
            progress = int(15 + (75 * i / total_chunks))

            if i > 1:
                # Average over the chunks finished so far to project the rest.
                elapsed_time = time.time() - start_time
                avg_time_per_chunk = elapsed_time / (i - 1)
                remaining_chunks = total_chunks - (i - 1)
                estimated_remaining = avg_time_per_chunk * remaining_chunks
                status = (
                    f"Processing chunk {i}/{total_chunks}\n"
                    f"Progress: {progress}% complete\n"
                    f"Est. remaining: {format_time(estimated_remaining)}"
                )
            else:
                status = f"Processing chunk {i}/{total_chunks}\nProgress: {progress}% complete"

            yield progress, None, status, None

            # Generate audio for this chunk; None means inference failed.
            chunk_wav = process_chunk(chunk, ref_codes, ref_text, tts_instance)
            if chunk_wav is None:
                failed_chunks += 1
            else:
                audio_chunks.append(chunk_wav)

        if not audio_chunks:
            raise ValueError("Failed to generate any audio")

        # Update status for final processing
        yield 90, None, "Finalizing audio...\nOrdering and combining chunks...", None

        # Join all chunks with a short silence in between, in one concatenate.
        if len(audio_chunks) == 1:
            all_wav = audio_chunks[0]
        else:
            silence = np.zeros(int(sample_rate * pause_seconds))
            pieces = []
            for index, chunk_wav in enumerate(audio_chunks):
                if index:
                    pieces.append(silence)
                pieces.append(chunk_wav)
            all_wav = np.concatenate(pieces)

        # Apply speed adjustment if needed (pitch-preserving time-stretching)
        if speed != 1.0:
            # rate > 1 speeds up, rate < 1 slows down
            all_wav = librosa.effects.time_stretch(all_wav.astype(np.float32), rate=speed)

        # Save the final audio
        temp_path = "temp_output.wav"
        sf.write(temp_path, all_wav, sample_rate)

        # Calculate and show total time taken
        total_time = time.time() - start_time
        final_status = f"✅ Generation complete!\nTotal time: {format_time(total_time)}"
        if failed_chunks:
            # Surface partial failures instead of reporting a clean success.
            final_status += f"\n⚠️ {failed_chunks} of {total_chunks} chunks failed and were skipped."

        yield 100, temp_path, final_status, None
    except Exception as e:
        error_status = f"❌ Error generating speech: {str(e)}"
        yield 0, None, error_status, None


def delete_voice(voice_name):
    """Remove a registered voice: its files under ``samples/`` and its registry entry.

    Returns:
        tuple: ``(status message, gr.update(...))``. On success the update
        carries the remaining voice names and selects the first one (or
        None when no voices are left); otherwise it is an empty update.
    """
    try:
        registry = VOICES["samples"]
        if voice_name not in registry:
            return f"❌ Voice '{voice_name}' not found!", gr.update()

        # Delete whichever of the three per-voice files are present.
        for extension in ("txt", "wav", "pt"):
            file_path = f"samples/{voice_name}.{extension}"
            if os.path.exists(file_path):
                os.remove(file_path)

        del registry[voice_name]

        remaining_voices = list(registry)
        if remaining_voices:
            new_selected = remaining_voices[0]
        else:
            new_selected = None

        message = f"✅ Voice '{voice_name}' deleted successfully!"
        return message, gr.update(choices=remaining_voices, value=new_selected)
    except Exception as e:
        return f"❌ Error deleting voice: {e}", gr.update()

def clone_voice(new_name, txt, audio_file):
    """Encode a new reference voice and save its text, audio and codes.

    Args:
        new_name: Name for the voice. Surrounding whitespace is stripped, and
            names containing path separators (or equal to ``.`` / ``..``) are
            rejected because the name is used to build file paths.
        txt: Transcript of the reference audio.
        audio_file: Path of the uploaded reference audio.

    Returns:
        tuple: ``(status message, gr.update(...))``. On success the update
        lists all voice names and selects the new one; otherwise it is an
        empty update.
    """
    try:
        # Input validations
        if not new_name or not new_name.strip():
            return "❌ Error: New Voice name cannot be empty.", gr.update()
        voice_name = new_name.strip()

        # The name becomes part of a file path: refuse anything that could
        # escape the samples directory.
        if voice_name in (".", "..") or any(ch in voice_name for ch in ("/", "\\", "\0")):
            return "❌ Error: Voice name cannot contain path separators.", gr.update()

        if not txt or not txt.strip():
            return "❌ Error: Reference text cannot be empty.", gr.update()

        if not audio_file:
            return "❌ Error: No reference audio file provided.", gr.update()

        if voice_name in VOICES["samples"]:
            return f"❌ Error: Voice '{voice_name}' already exists. Please choose a different name.", gr.update()

        try:
            tts_instance = get_tts()
        except Exception as e:
            return f"❌ Failed to load TTS model: {str(e)}", gr.update()

        os.makedirs("samples", exist_ok=True)
        txt_path = f"samples/{voice_name}.txt"
        wav_path = f"samples/{voice_name}.wav"
        pt_path = f"samples/{voice_name}.pt"

        try:
            # Save reference text and audio
            with open(txt_path, "w") as f:
                f.write(txt.strip())
            shutil.copy(audio_file, wav_path)

            ref_codes = tts_instance.encode_reference(wav_path)
            import torch
            torch.save(ref_codes, pt_path)
        except Exception:
            # Do not leave a half-written voice on disk.
            for leftover in (txt_path, wav_path, pt_path):
                try:
                    os.remove(leftover)
                except OSError:
                    pass  # best-effort cleanup; the original error is re-raised
            raise

        VOICES["samples"][voice_name] = (txt_path, pt_path)
        return (
            f"✅ Voice '{voice_name}' cloned and saved successfully!",
            gr.update(choices=list(VOICES["samples"].keys()), value=voice_name),
        )
    except Exception as e:
        return f"❌ Error cloning voice: {e}", gr.update()


# ---------------------------
# UI
# ---------------------------
# Custom CSS - consistent dark theme
# Stylesheet text for the Gradio UI: hides the footer and API-docs link, defines
# a dark colour palette, and styles the classes attached to components through
# elem_classes (control-panel, output-panel, status-box, audio-container, ...).
# NOTE(review): the gr.Blocks(...) call visible in this file does not pass
# css=custom_css — confirm the stylesheet is applied elsewhere (e.g. at launch).
custom_css = """
footer {display: none !important;}
.footer {display: none !important;}
#api-docs-link {display: none !important;}

/* Dark theme palette */
:root {
    --dark-bg: #1e1e2e;
    --dark-card: #252530;
    --dark-border: #3d3d4a;
    --text-primary: #e4e4e7;
    --text-muted: #a1a1aa;
    --accent: #818cf8;
    --accent-secondary: #a78bfa;
}

/* Modern header - dark gradient */
.heading-container {
    text-align: center;
    padding: 2rem 1rem;
    background: linear-gradient(135deg, #4338ca 0%, #6d28d9 50%, #4c1d95 100%);
    border-radius: 12px;
    margin-bottom: 2rem;
    border: 1px solid var(--dark-border);
    color: white;
}

.heading-container h1 {
    margin: 0;
    font-size: 2.5rem;
    font-weight: 700;
    color: white;
}

.heading-container h3 {
    margin: 0.5rem 0 0 0;
    font-size: 1.1rem;
    font-weight: 400;
    color: rgba(255, 255, 255, 0.9);
}

/* Cards - dark, same as rest of app */
.control-panel {
    background: var(--dark-card) !important;
    padding: 1.5rem;
    border-radius: 12px;
    border: 1px solid var(--dark-border);
    margin-bottom: 1rem;
    color: var(--text-primary) !important;
}

.control-panel label, .control-panel .label-wrap, .control-panel p,
.control-panel h1, .control-panel h2, .control-panel h3, .control-panel h4,
.control-panel span, .control-panel div, .control-panel li, .control-panel small,
.control-panel .markdown, .control-panel [class*="markdown"], .control-panel * {
    color: var(--text-primary) !important;
}

.output-panel {
    background: var(--dark-card) !important;
    padding: 1.5rem;
    border-radius: 12px;
    border: 1px solid var(--dark-border);
    box-shadow: 0 2px 12px rgba(0,0,0,0.3);
    color: var(--text-primary) !important;
}

.output-panel label, .output-panel .label-wrap, .output-panel p,
.output-panel h1, .output-panel h2, .output-panel h3, .output-panel h4,
.output-panel span, .output-panel div, .output-panel li, .output-panel small,
.output-panel .markdown, .output-panel [class*="markdown"], .output-panel * {
    color: var(--text-primary) !important;
}

/* Button styling */
.primary-button {
    width: 100%;
    padding: 0.75rem;
    font-size: 1.1rem;
    font-weight: 600;
    border-radius: 8px;
    margin-top: 1rem;
}

/* Progress bar styling */
.progress-container {
    margin: 1rem 0;
}

/* Status box - dark */
.status-box {
    background: var(--dark-bg) !important;
    border-radius: 8px;
    padding: 1rem;
    min-height: 80px;
    border: 1px solid var(--dark-border);
    color: var(--text-primary) !important;
}

/* Audio container - dark */
.audio-container {
    margin-top: 1rem;
    padding: 1rem;
    background: var(--dark-card) !important;
    border-radius: 8px;
    border: 1px solid var(--dark-border);
    color: var(--text-primary) !important;
}

.audio-container label, .audio-container .label-wrap,
.audio-container h1, .audio-container h2, .audio-container h3, .audio-container h4,
.audio-container span, .audio-container div, .audio-container .markdown,
.audio-container [class*="markdown"], .audio-container * {
    color: var(--text-primary) !important;
}

/* Upload/drop zone text */
.control-panel [class*="upload"] span,
.control-panel [class*="drop"] span,
.output-panel [class*="upload"] span,
.output-panel [class*="drop"] span {
    color: var(--text-muted) !important;
}

.info-text, .info-text * {
    color: var(--text-muted) !important;
}

/* Voice selection styling */
.voice-controls {
    display: flex;
    gap: 0.5rem;
    align-items: flex-end;
}

.tab-nav {
    margin-bottom: 1.5rem;
}

/* Instructions - dark card, same as panels */
.instructions-content {
    background: var(--dark-card) !important;
    padding: 2rem;
    border-radius: 12px;
    border: 1px solid var(--dark-border);
    line-height: 1.8;
    max-width: 1200px;
    margin: 0 auto;
    color: var(--text-primary) !important;
}

.instructions-content p,
.instructions-content li,
.instructions-content span,
.instructions-content div {
    color: var(--text-primary) !important;
}

.instructions-content *:not(h1):not(h2):not(h3) {
    color: var(--text-primary) !important;
}

.instructions-content h1 {
    color: var(--accent) !important;
    border-bottom: 3px solid var(--accent);
    padding-bottom: 0.5rem;
    margin-bottom: 1.5rem;
}

.instructions-content h2 {
    color: var(--accent-secondary) !important;
    margin-top: 2rem;
    margin-bottom: 1rem;
    font-size: 1.5rem;
}

.instructions-content h3 {
    color: #93c5fd !important;
    margin-top: 1.5rem;
    margin-bottom: 0.75rem;
    font-size: 1.2rem;
}

.instructions-content ul, .instructions-content ol {
    margin-left: 1.5rem;
    margin-bottom: 1rem;
}

.instructions-content li {
    margin-bottom: 0.5rem;
    color: var(--text-primary) !important;
}

.instructions-content code {
    background: var(--dark-bg);
    color: var(--accent);
    padding: 0.2rem 0.4rem;
    border-radius: 4px;
    font-family: 'Courier New', monospace;
    font-size: 0.9em;
    border: 1px solid var(--dark-border);
}

.instructions-content hr {
    border: none;
    border-top: 2px solid var(--dark-border);
    margin: 2rem 0;
}

.instructions-content blockquote {
    border-left: 4px solid var(--accent);
    padding-left: 1rem;
    margin-left: 0;
    color: var(--text-muted) !important;
    font-style: italic;
}
"""

with gr.Blocks(title="Virtual Lab Voice Cloning") as app:
    
    # Modern header with gradient
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                """
                <div class="heading-container">
                    <h1>🎙️ Virtual Lab Voice Cloning</h1>
                    <h3>High-Quality Text-to-Speech with Voice Cloning</h3>
                </div>
                """,
                elem_classes="heading"
            )

    with gr.Tab("🎯 Generate Speech", elem_classes="tab-nav"):
        with gr.Row(equal_height=True):
            # Left Column - Input Controls
            with gr.Column(scale=1, min_width=400):
                gr.Markdown("### 📝 Input Settings", elem_classes="control-panel")
                
                text_input = gr.Textbox(
                    label="📄 Text to Convert",
                    placeholder="Enter the text you want to convert to speech...",
                    lines=6,
                    elem_classes="text-input"
                )
                
                with gr.Row(elem_classes="voice-controls"):
                    voice_select = gr.Dropdown(
                        label="🎤 Select Voice",
                        choices=list(VOICES["samples"].keys()),
                        value=list(VOICES["samples"].keys())[0] if VOICES["samples"] else None,
                        interactive=True,
                        scale=3
                    )
                    delete_btn = gr.Button(
                        "🗑️",
                        variant="secondary",
                        size="sm",
                        scale=1,
                        min_width=50
                    )
                
                speed_control = gr.Dropdown(
                    label="⚡ Speech Speed",
                    choices=["1x", "1.1x", "1.2x", "1.3x", "1.4x", "1.5x"],
                    value="1x",
                    info="Select playback speed (preserves pitch and voice characteristics)"
                )
                
                generate_btn = gr.Button(
                    "🎙️ Generate Speech",
                    variant="primary",
                    size="lg",
                    elem_classes="primary-button"
                )
            
            # Right Column - Output & Status
            with gr.Column(scale=1, min_width=400):
                gr.Markdown("### 📊 Generation Status", elem_classes="output-panel")
                
                progress_bar = gr.Slider(
                    label="Progress",
                    minimum=0,
                    maximum=100,
                    value=0,
                    interactive=False,
                    elem_classes="progress-container"
                )
                
                status_box = gr.Textbox(
                    label="Status Information",
                    value="Ready to generate speech. Enter text and select a voice.",
                    lines=4,
                    interactive=False,
                    elem_classes="status-box"
                )
                
                delete_status = gr.Textbox(label="Status", visible=False)
                
                gr.Markdown("### 🎵 Audio Output", elem_classes="audio-container")
                audio_output = gr.Audio(
                    label="Generated Audio",
                    autoplay=True
                )

        # Event handlers
        generate_btn.click(
            fn=generate_speech,
            inputs=[text_input, voice_select, speed_control],
            outputs=[progress_bar, audio_output, status_box, delete_status]
        )

        delete_btn.click(
            fn=delete_voice,
            inputs=[voice_select],
            outputs=[delete_status, voice_select]
        )

    with gr.Tab("🧬 Clone New Voice", elem_classes="tab-nav"):
        with gr.Row(equal_height=True):
            # Left Column - Voice Cloning Input
            with gr.Column(scale=1, min_width=400):
                gr.Markdown("### 🎤 Voice Cloning Setup", elem_classes="control-panel")
                
                new_voice_name = gr.Textbox(
                    label="📛 Voice Name",
                    placeholder="Enter a unique name for this voice...",
                    info="Choose a descriptive name for your cloned voice"
                )
                
                ref_text_input = gr.Textbox(
                    label="📝 Reference Text",
                    placeholder="Enter the exact text that is spoken in the audio sample...",
                    lines=4,
                    info="This should match the text spoken in your audio file"
                )
                
                ref_audio_input = gr.Audio(
                    label="🎵 Reference Audio File",
                    type="filepath"
                )
                gr.Markdown(
                    "<small>💡 Upload a WAV file containing the voice sample (recommended: 5-30 seconds)</small>",
                    elem_classes="info-text"
                )
                
                clone_btn = gr.Button(
                    "🧬 Clone Voice",
                    variant="primary",
                    size="lg",
                    elem_classes="primary-button"
                )
            
            # Right Column - Status
            with gr.Column(scale=1, min_width=400):
                gr.Markdown("### 📋 Cloning Status", elem_classes="output-panel")
                
                clone_status = gr.Textbox(
                    label="Status",
                    value="Ready to clone a new voice. Fill in the details on the left and upload an audio sample.",
                    lines=8,
                    interactive=False,
                    elem_classes="status-box"
                )
                
                gr.Markdown(
                    """
                    ### 💡 Tips for Best Results
                    - Use clear, high-quality audio recordings
                    - Ensure the reference text matches what's spoken
                    - Audio length: 5-30 seconds works best
                    - Speak naturally and clearly in the sample
                    - Avoid background noise when possible
                    """,
                    elem_classes="control-panel"
                )

        # Event handler
        clone_btn.click(
            fn=clone_voice,
            inputs=[new_voice_name, ref_text_input, ref_audio_input],
            outputs=[clone_status, voice_select]
        )

    # Static help tab: it only renders the user guide below as Markdown and
    # declares no inputs, outputs or event handlers.
    with gr.Tab("📖 Instructions", elem_classes="tab-nav"):
        with gr.Column():
            # NOTE(review): the guide quotes UI labels, speed options and the
            # `samples` folder name as plain text; nothing ties them to the real
            # components, so re-check this text whenever the other tabs change.
            gr.Markdown(
                """
                # 🎙️ Virtual Lab Voice Cloning - User Guide
                
                Welcome to the Virtual Lab Voice Cloning tool! This guide will help you get started with creating high-quality text-to-speech audio using voice cloning technology.
                
                ---
                
                ## 🎯 How to Generate Speech
                
                ### Step 1: Navigate to the "Generate Speech" Tab
                Click on the **"🎯 Generate Speech"** tab at the top of the interface.
                
                ### Step 2: Enter Your Text
                - Type or paste the text you want to convert to speech in the **"📄 Text to Convert"** text box
                - You can enter multiple sentences or paragraphs
                - The tool will automatically split long texts into manageable chunks
                
                ### Step 3: Select a Voice
                - Choose a voice from the **"🎤 Select Voice"** dropdown menu
                - Only voices that have been cloned and saved will appear in this list
                - You can delete a voice by clicking the 🗑️ button next to the voice selector
                
                ### Step 4: Adjust Speech Speed (Optional)
                - Use the **"⚡ Speech Speed"** dropdown to control playback speed
                - Options range from 1x (normal) to 1.5x (faster)
                - Speed adjustment preserves pitch and voice characteristics
                
                ### Step 5: Generate Audio
                - Click the **"🎙️ Generate Speech"** button
                - Monitor the progress bar and status messages
                - The generated audio will appear automatically when complete
                - You can play the audio directly in the browser or download it
                
                ---
                
                ## 🧬 How to Clone a New Voice
                
                ### Step 1: Navigate to the "Clone New Voice" Tab
                Click on the **"🧬 Clone New Voice"** tab at the top of the interface.
                
                ### Step 2: Prepare Your Audio Sample
                Before cloning, you'll need:
                - A clear audio recording (WAV format recommended)
                - 5-30 seconds of speech works best
                - High-quality audio with minimal background noise
                - Natural, clear speech
                
                ### Step 3: Enter Voice Details
                - **Voice Name**: Enter a unique, descriptive name for your cloned voice
                - **Reference Text**: Type the exact text that is spoken in your audio sample
                - **Reference Audio**: Upload your WAV audio file using the file uploader
                
                ### Step 4: Clone the Voice
                - Click the **"🧬 Clone Voice"** button
                - Wait for the cloning process to complete
                - Once successful, the new voice will be available in the voice selector
                
                ### Step 5: Use Your Cloned Voice
                - Navigate back to the "Generate Speech" tab
                - Your newly cloned voice will appear in the voice dropdown
                - Select it and generate speech as usual
                
                ---
                
                ## 💡 Best Practices & Tips
                
                ### For Voice Cloning:
                - ✅ Use high-quality, clear audio recordings
                - ✅ Ensure the reference text exactly matches what's spoken in the audio
                - ✅ Record in a quiet environment to minimize background noise
                - ✅ Speak naturally and at a normal pace
                - ✅ Use 5-30 seconds of audio for best results
                - ❌ Avoid very short clips (less than 3 seconds)
                - ❌ Avoid clips with heavy background noise or music
                - ❌ Don't use text that doesn't match the audio content
                
                ### For Speech Generation:
                - ✅ Use proper punctuation for better natural pauses
                - ✅ Break long texts into paragraphs for better processing
                - ✅ Review the generated audio and adjust speed if needed
                - ✅ The tool automatically handles long texts by splitting them into chunks
                - ✅ Generated audio is saved and can be downloaded
                
                ### Performance Tips:
                - The tool processes text in chunks for better performance
                - Longer texts will take more time to generate
                - Progress updates show estimated completion time
                - GPU acceleration is used when available for faster processing
                
                ---
                
                ## 🔧 Technical Information
                
                ### Supported Formats:
                - **Input Audio**: WAV format (recommended)
                - **Output Audio**: WAV format, 24kHz sample rate
                - **Text**: Plain text (UTF-8)
                
                ### System Requirements:
                - NVIDIA GPU recommended for best performance
                - CUDA support for GPU acceleration
                - eSpeak NG installed for phonemization
                
                ### Features:
                - High-quality neural text-to-speech
                - Voice cloning from short audio samples
                - Pitch-preserving speed control
                - Automatic text chunking for long inputs
                - Real-time progress tracking
                
                ---
                
                ## ❓ Troubleshooting
                
                ### Common Issues:
                
                **"No voice selected" error:**
                - Make sure you have cloned at least one voice
                - Check that the voice appears in the dropdown menu
                
                **"Input text cannot be empty" error:**
                - Ensure you've entered text in the text input box
                - Check for whitespace-only text
                
                **Audio generation fails:**
                - Verify your GPU has enough memory
                - Try generating shorter texts first
                - Check that the voice files are not corrupted
                
                **Voice cloning fails:**
                - Ensure the audio file is in WAV format
                - Verify the reference text matches the audio content
                - Check that the audio quality is sufficient
                - Make sure the voice name is unique
                
                ---
                
                ## 📝 Notes
                
                - All cloned voices are saved locally in the `samples` folder
                - Generated audio files are temporary and should be downloaded if you want to keep them
                - The tool uses advanced neural networks for high-quality voice synthesis
                - Processing time depends on text length and system performance
                
                ---
                
                **Enjoy creating amazing voice clones! 🎉**
                """,
                elem_classes="instructions-content"
            )

if __name__ == "__main__":
    # Listen on every interface so Render's port scan can reach the server.
    # Render supplies PORT (10000 by default there); without it we use 7860.
    server_name = "0.0.0.0"
    port = int(os.environ.get("PORT", "7860"))

    # A missing PORT variable is taken to mean a local run, where opening a
    # browser tab is useful; cloud hosts have no browser to open.
    inbrowser = "PORT" not in os.environ

    print(f"\nLaunching on http://{server_name}:{port}")

    # Built after the print so the theme is constructed at the same point as
    # in a direct keyword call.
    launch_options = {
        "server_name": server_name,
        "server_port": port,
        "share": False,
        "inbrowser": inbrowser,
        "show_error": True,
        "theme": gr.themes.Soft(primary_hue="purple", secondary_hue="blue"),
        "css": custom_css,
    }
    app.launch(**launch_options)