# Hugging Face Space: Isshi14/voiceverse-ai (status: Sleeping)
# File size: 9,636 bytes · revision 8c369f8

"""

VoiceVerse AI — Main Application.



Gradio-based UI that orchestrates the full document-to-audio pipeline:

  1. Upload PDF/TXT → extract text

  2. RAG: chunk, embed, retrieve relevant context

  3. Generate a spoken-style script via Mistral-7B-Instruct

  4. Convert script to expressive audio via Qwen TTS / Edge-TTS

  5. Play audio in the browser



Entry point for Hugging Face Spaces deployment.

"""

import os
import gradio as gr
from utils import logger, validate_file, format_error
from rag import extract_text, RAGStore
from script_gen import generate_script
from tts import generate_audio

# ── Global RAG Store (single-user demo) ──────────────────────────────────────
# One module-level instance, reused by every process_document() call made in
# this process.
# NOTE(review): whether add_document() replaces or appends to chunks from an
# earlier upload is not visible here — confirm before relying on repeat or
# concurrent uploads.
rag_store = RAGStore()


# ── Pipeline Orchestration ───────────────────────────────────────────────────

def process_document(file, progress=gr.Progress()):
    """
    Run the full pipeline: upload → extract → RAG → script → audio.

    Args:
        file: The uploaded document. Either an object exposing its path via
            a ``.name`` attribute, or a value whose ``str()`` is the path.
        progress: Gradio progress tracker, called with a completion fraction
            and a description as each stage starts and finishes.

    Returns:
        Tuple of (script_text, audio_file_path, status_message).

    Raises:
        gr.Error: If no file is supplied, ``validate_file`` rejects it, the
            extracted text is shorter than 50 characters after stripping, or
            any pipeline stage raises. Failures from the stages are logged
            with their traceback and chained as the error's cause.
    """
    # ── Step 0: Validate ─────────────────────────────────────────────────
    if file is None:
        raise gr.Error("Please upload a PDF or TXT file first.")

    file_path = file.name if hasattr(file, "name") else str(file)
    is_valid, msg = validate_file(file_path)
    if not is_valid:
        raise gr.Error(msg)

    try:
        # ── Step 1: Extract Text ─────────────────────────────────────────
        progress(0.1, desc="📄 Extracting text from document...")
        logger.info("Processing file: %s", file_path)

        text = extract_text(file_path)
        if not text or len(text.strip()) < 50:
            raise gr.Error(
                "The document contains too little text to generate audio. "
                "Please upload a document with more content."
            )

        progress(0.2, desc="✅ Text extracted successfully")

        # ── Step 2: RAG — Chunk & Embed ──────────────────────────────────
        progress(0.3, desc="🧠 Processing document with AI...")
        # NOTE(review): rag_store is module-level and shared between calls;
        # confirm add_document() discards chunks from a previous upload.
        rag_store.add_document(text)

        chunk_count = len(rag_store.chunks)
        logger.info("Document processed: %d chunks created", chunk_count)

        # ── Step 3: Retrieve Context ─────────────────────────────────────
        progress(0.4, desc="🔍 Retrieving key content...")

        # For short documents, use all chunks; for longer ones, retrieve smartly
        if chunk_count <= 8:
            context_chunks = rag_store.get_all_chunks()
        else:
            context_chunks = rag_store.query(
                "What are the main topics, key insights, and important details?",
                top_k=6,
            )

        progress(0.5, desc="✅ Context retrieved")

        # ── Step 4: Generate Script ──────────────────────────────────────
        progress(0.6, desc="✍️ Writing spoken script...")

        script = generate_script(context_chunks)
        logger.info("Script generated: %d characters", len(script))

        progress(0.75, desc="✅ Script ready")

        # ── Step 5: Generate Audio ───────────────────────────────────────
        progress(0.8, desc="🎙️ Generating expressive audio...")

        audio_path, engine = generate_audio(script)
        logger.info("Audio generated via %s: %s", engine, audio_path)

        progress(1.0, desc="✅ Audio ready!")

        # ── Build status message ─────────────────────────────────────────
        status = (
            f"✅ **Generation complete!**\n\n"
            f"- 📄 Document: {os.path.basename(file_path)}\n"
            f"- 📝 Text extracted: {len(text):,} characters\n"
            f"- 🧩 Chunks created: {chunk_count}\n"
            f"- ✍️ Script length: {len(script):,} characters\n"
            f"- 🎙️ Voice engine: {engine}\n"
        )

        return script, audio_path, status

    except gr.Error:
        raise  # Re-raise Gradio errors as-is
    except EnvironmentError as e:
        # Keep the traceback in the server log; the UI only gets the message.
        logger.exception("Environment error while processing %s", file_path)
        raise gr.Error(str(e)) from e
    except Exception as e:
        # Top-level boundary: log the full traceback, then surface a
        # formatted message to the user with the original error chained.
        logger.exception("Pipeline failed for %s", file_path)
        error_msg = format_error("pipeline", e)
        raise gr.Error(error_msg) from e


# ── Gradio UI ────────────────────────────────────────────────────────────────

def build_ui() -> gr.Blocks:
    """
    Assemble the VoiceVerse Gradio interface.

    The page has a header, a two-column row (upload, generate button and
    status on the left; audio player and script text on the right), and a
    footer. Clicking the generate button calls ``process_document`` with the
    uploaded file and routes its three results to the script box, the audio
    player and the status panel.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    # Page-level stylesheet handed to gr.Blocks.
    stylesheet = """

    .main-header {

        text-align: center;

        margin-bottom: 1rem;

    }

    .main-header h1 {

        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);

        -webkit-background-clip: text;

        -webkit-text-fill-color: transparent;

        font-size: 2.5rem;

        font-weight: 800;

        margin-bottom: 0.25rem;

    }

    .main-header p {

        color: #6b7280;

        font-size: 1.1rem;

    }

    .status-box {

        border-left: 3px solid #667eea;

        padding-left: 1rem;

        margin: 0.5rem 0;

    }

    """

    # Static markup for the banner at the top of the page.
    header_markup = """

        <div class="main-header">

            <h1>🎙️ VoiceVerse AI</h1>

            <p>Transform your documents into engaging podcast-style audio</p>

        </div>

        """

    # Static markup for the credits line at the bottom of the page.
    footer_markup = (
        "<center style='color: #9ca3af; margin-top: 1rem;'>"
        "Built with ❤️ using Mistral-7B-Instruct · Qwen3-TTS · Edge-TTS · Gradio"
        "</center>"
    )

    soft_theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="purple")

    with gr.Blocks(
        title="VoiceVerse AI — Document to Audio",
        theme=soft_theme,
        css=stylesheet,
    ) as demo:
        # ── Header ───────────────────────────────────────────────────────
        gr.HTML(header_markup)

        with gr.Row():
            # ── Left Column: Input ───────────────────────────────────────
            with gr.Column(scale=1):
                gr.Markdown("### 📤 Upload Document")
                upload_widget = gr.File(
                    label="Upload a PDF or TXT file",
                    file_types=[".pdf", ".txt"],
                    type="filepath",
                    elem_id="file-upload",
                )
                run_button = gr.Button(
                    "🎙️ Generate Audio",
                    variant="primary",
                    size="lg",
                    elem_id="generate-btn",
                )
                status_panel = gr.Markdown(
                    value="*Upload a document and click Generate to start.*",
                    elem_classes=["status-box"],
                )

            # ── Right Column: Output ─────────────────────────────────────
            with gr.Column(scale=1):
                gr.Markdown("### 🎧 Generated Audio")
                audio_player = gr.Audio(
                    label="Audio Narration",
                    type="filepath",
                    elem_id="audio-player",
                    interactive=False,
                )

                gr.Markdown("### ✍️ Generated Script")
                script_box = gr.Textbox(
                    label="Spoken Script",
                    lines=12,
                    max_lines=20,
                    interactive=False,
                    placeholder="The generated script will appear here...",
                    elem_id="script-display",
                )

        # ── Wire up the generate button ──────────────────────────────────
        # Output order matches process_document's return tuple:
        # (script, audio path, status).
        run_button.click(
            fn=process_document,
            inputs=[upload_widget],
            outputs=[script_box, audio_player, status_panel],
        )

        # ── Footer ───────────────────────────────────────────────────────
        gr.Markdown(footer_markup)

    return demo


# ── Entry Point ──────────────────────────────────────────────────────────────

if __name__ == "__main__":
    logger.info("Starting VoiceVerse AI...")

    app = build_ui()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )