""" VoiceVerse AI — Main Application. Gradio-based UI that orchestrates the full document-to-audio pipeline: 1. Upload PDF/TXT → extract text 2. RAG: chunk, embed, retrieve relevant context 3. Generate a spoken-style script via Mistral-7B-Instruct 4. Convert script to expressive audio via Qwen TTS / Edge-TTS 5. Play audio in the browser Entry point for Hugging Face Spaces deployment. """ import os import gradio as gr from utils import logger, validate_file, format_error from rag import extract_text, RAGStore from script_gen import generate_script from tts import generate_audio # ── Global RAG Store (single-user demo) ────────────────────────────────────── rag_store = RAGStore() # ── Pipeline Orchestration ─────────────────────────────────────────────────── def process_document(file, progress=gr.Progress()): """ Full pipeline: upload → extract → RAG → script → audio. Args: file: Gradio uploaded file object (has .name attribute) Returns: Tuple of (script_text, audio_file_path, status_message) """ # ── Step 0: Validate ───────────────────────────────────────────────── if file is None: raise gr.Error("Please upload a PDF or TXT file first.") file_path = file.name if hasattr(file, "name") else str(file) is_valid, msg = validate_file(file_path) if not is_valid: raise gr.Error(msg) try: # ── Step 1: Extract Text ───────────────────────────────────────── progress(0.1, desc="📄 Extracting text from document...") logger.info("Processing file: %s", file_path) text = extract_text(file_path) if not text or len(text.strip()) < 50: raise gr.Error( "The document contains too little text to generate audio. " "Please upload a document with more content." ) progress(0.2, desc="✅ Text extracted successfully") # ── Step 2: RAG — Chunk & Embed ────────────────────────────────── progress(0.3, desc="🧠 Processing document with AI...") rag_store.add_document(text) chunk_count = len(rag_store.chunks) logger.info("Document processed: %d chunks created", chunk_count) # ── Step 3: Retrieve Context ───────────────────────────────────── progress(0.4, desc="🔍 Retrieving key content...") # For short documents, use all chunks; for longer ones, retrieve smartly if chunk_count <= 8: context_chunks = rag_store.get_all_chunks() else: context_chunks = rag_store.query( "What are the main topics, key insights, and important details?", top_k=6, ) progress(0.5, desc="✅ Context retrieved") # ── Step 4: Generate Script ────────────────────────────────────── progress(0.6, desc="✍️ Writing spoken script...") script = generate_script(context_chunks) logger.info("Script generated: %d characters", len(script)) progress(0.75, desc="✅ Script ready") # ── Step 5: Generate Audio ─────────────────────────────────────── progress(0.8, desc="🎙️ Generating expressive audio...") audio_path, engine = generate_audio(script) logger.info("Audio generated via %s: %s", engine, audio_path) progress(1.0, desc="✅ Audio ready!") # ── Build status message ───────────────────────────────────────── status = ( f"✅ **Generation complete!**\n\n" f"- 📄 Document: {os.path.basename(file_path)}\n" f"- 📝 Text extracted: {len(text):,} characters\n" f"- 🧩 Chunks created: {chunk_count}\n" f"- ✍️ Script length: {len(script):,} characters\n" f"- 🎙️ Voice engine: {engine}\n" ) return script, audio_path, status except gr.Error: raise # Re-raise Gradio errors as-is except EnvironmentError as e: raise gr.Error(str(e)) except Exception as e: error_msg = format_error("pipeline", e) raise gr.Error(error_msg) # ── Gradio UI ──────────────────────────────────────────────────────────────── def build_ui() -> gr.Blocks: """Build and return the Gradio Blocks interface.""" # Custom CSS for a clean, polished look css = """ .main-header { text-align: center; margin-bottom: 1rem; } .main-header h1 { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-size: 2.5rem; font-weight: 800; margin-bottom: 0.25rem; } .main-header p { color: #6b7280; font-size: 1.1rem; } .status-box { border-left: 3px solid #667eea; padding-left: 1rem; margin: 0.5rem 0; } """ with gr.Blocks( title="VoiceVerse AI — Document to Audio", theme=gr.themes.Soft( primary_hue="indigo", secondary_hue="purple", ), css=css, ) as app: # ── Header ─────────────────────────────────────────────────────── gr.HTML("""

🎙️ VoiceVerse AI

Transform your documents into engaging podcast-style audio

""") with gr.Row(): # ── Left Column: Input ─────────────────────────────────────── with gr.Column(scale=1): gr.Markdown("### 📤 Upload Document") file_input = gr.File( label="Upload a PDF or TXT file", file_types=[".pdf", ".txt"], type="filepath", elem_id="file-upload", ) generate_btn = gr.Button( "🎙️ Generate Audio", variant="primary", size="lg", elem_id="generate-btn", ) status_output = gr.Markdown( value="*Upload a document and click Generate to start.*", elem_classes=["status-box"], ) # ── Right Column: Output ───────────────────────────────────── with gr.Column(scale=1): gr.Markdown("### 🎧 Generated Audio") audio_output = gr.Audio( label="Audio Narration", type="filepath", elem_id="audio-player", interactive=False, ) gr.Markdown("### ✍️ Generated Script") script_output = gr.Textbox( label="Spoken Script", lines=12, max_lines=20, interactive=False, placeholder="The generated script will appear here...", elem_id="script-display", ) # ── Wire up the generate button ────────────────────────────────── generate_btn.click( fn=process_document, inputs=[file_input], outputs=[script_output, audio_output, status_output], ) # ── Footer ─────────────────────────────────────────────────────── gr.Markdown( "
" "Built with ❤️ using Mistral-7B-Instruct · Qwen3-TTS · Edge-TTS · Gradio" "
" ) return app # ── Entry Point ────────────────────────────────────────────────────────────── if __name__ == "__main__": logger.info("Starting VoiceVerse AI...") app = build_ui() app.launch( server_name="0.0.0.0", server_port=7860, share=False, show_error=True, )