voiceverse-ai / app.py
Isshi14's picture
Upload 8 files
7abe9d2 verified
"""
VoiceVerse AI — Main Application.
Gradio-based UI that orchestrates the full document-to-audio pipeline:
1. Upload PDF/TXT → extract text
2. RAG: chunk, embed, retrieve relevant context
3. Generate a spoken-style script via Mistral-7B-Instruct
4. Convert script to expressive audio via Qwen TTS / Edge-TTS
5. Play audio in the browser
Entry point for Hugging Face Spaces deployment.
"""
import os
import gradio as gr
from utils import logger, validate_file, format_error
from rag import extract_text, RAGStore
from script_gen import generate_script
from tts import generate_audio
# ── Global RAG Store (single-user demo) ──────────────────────────────────────
# A single RAGStore instance is created when this module is imported and is
# reused by every call to process_document, so its state is shared between runs.
rag_store = RAGStore()
# ── Pipeline Orchestration ───────────────────────────────────────────────────
def process_document(file, progress=gr.Progress()):
    """
    Run the full pipeline: upload → extract → RAG → script → audio.

    Args:
        file: The uploaded document. Either a path string or an object that
            exposes the path through a ``.name`` attribute.
        progress: Gradio progress tracker; called with a completion fraction
            and a short description as each stage starts or finishes.

    Returns:
        Tuple of (script_text, audio_file_path, status_message).

    Raises:
        gr.Error: If no file was supplied, ``validate_file`` rejects it, the
            extracted text is shorter than 50 characters after stripping, or
            any pipeline stage raises.
    """
    # Step 0: validate the upload before doing any work.
    if file is None:
        raise gr.Error("Please upload a PDF or TXT file first.")
    file_path = file.name if hasattr(file, "name") else str(file)
    is_valid, msg = validate_file(file_path)
    if not is_valid:
        raise gr.Error(msg)

    try:
        # Step 1: extract text.
        progress(0.1, desc="📄 Extracting text from document...")
        logger.info("Processing file: %s", file_path)
        text = extract_text(file_path)
        if not text or len(text.strip()) < 50:
            raise gr.Error(
                "The document contains too little text to generate audio. "
                "Please upload a document with more content."
            )
        progress(0.2, desc="✅ Text extracted successfully")

        # Step 2: RAG, chunk and embed.
        # NOTE(review): rag_store is a module-level instance shared by every
        # call. Whether add_document replaces or appends to chunks from an
        # earlier upload is not visible here; confirm it resets, otherwise
        # content from previous documents can leak into this run.
        progress(0.3, desc="🧠 Processing document with AI...")
        rag_store.add_document(text)
        chunk_count = len(rag_store.chunks)
        logger.info("Document processed: %d chunks created", chunk_count)

        # Step 3: retrieve context.
        progress(0.4, desc="🔍 Retrieving key content...")
        # For short documents, use all chunks; for longer ones, retrieve the
        # chunks most relevant to a generic "summarise this" style query.
        if chunk_count <= 8:
            context_chunks = rag_store.get_all_chunks()
        else:
            context_chunks = rag_store.query(
                "What are the main topics, key insights, and important details?",
                top_k=6,
            )
        progress(0.5, desc="✅ Context retrieved")

        # Step 4: generate the spoken script.
        progress(0.6, desc="✍️ Writing spoken script...")
        script = generate_script(context_chunks)
        logger.info("Script generated: %d characters", len(script))
        progress(0.75, desc="✅ Script ready")

        # Step 5: generate audio.
        progress(0.8, desc="🎙️ Generating expressive audio...")
        audio_path, engine = generate_audio(script)
        logger.info("Audio generated via %s: %s", engine, audio_path)
        progress(1.0, desc="✅ Audio ready!")

        # Build the Markdown status summary shown next to the upload box.
        status = (
            "✅ **Generation complete!**\n\n"
            f"- 📄 Document: {os.path.basename(file_path)}\n"
            f"- 📝 Text extracted: {len(text):,} characters\n"
            f"- 🧩 Chunks created: {chunk_count}\n"
            f"- ✍️ Script length: {len(script):,} characters\n"
            f"- 🎙️ Voice engine: {engine}\n"
        )
        return script, audio_path, status
    except gr.Error:
        raise  # Already user-facing; re-raise as-is.
    except EnvironmentError as e:
        # Surface the original message, keeping the cause chained for debugging.
        logger.error("Environment error while processing %s: %s", file_path, e)
        raise gr.Error(str(e)) from e
    except Exception as e:
        # Top-level boundary: record the traceback before it is reduced to a
        # UI message, then report a formatted error to the user.
        logger.exception("Pipeline failed for %s", file_path)
        error_msg = format_error("pipeline", e)
        raise gr.Error(error_msg) from e
# ── Gradio UI ────────────────────────────────────────────────────────────────
def build_ui() -> gr.Blocks:
    """
    Build and return the Gradio Blocks interface.

    The layout is a header, then one row with two equal-width columns: the
    left column holds the file upload, the generate button and the status
    text; the right column holds the audio player and the generated script.
    Clicking the button calls ``process_document`` with the uploaded file and
    writes its three results to the script box, audio player and status text.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    # Custom CSS for a clean, polished look
    css = """
    .main-header {
        text-align: center;
        margin-bottom: 1rem;
    }
    .main-header h1 {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        font-size: 2.5rem;
        font-weight: 800;
        margin-bottom: 0.25rem;
    }
    .main-header p {
        color: #6b7280;
        font-size: 1.1rem;
    }
    .status-box {
        border-left: 3px solid #667eea;
        padding-left: 1rem;
        margin: 0.5rem 0;
    }
    """
    with gr.Blocks(
        title="VoiceVerse AI — Document to Audio",
        theme=gr.themes.Soft(
            primary_hue="indigo",
            secondary_hue="purple",
        ),
        css=css,
    ) as app:
        # Header
        gr.HTML("""
        <div class="main-header">
            <h1>🎙️ VoiceVerse AI</h1>
            <p>Transform your documents into engaging podcast-style audio</p>
        </div>
        """)
        with gr.Row():
            # Left column: input
            with gr.Column(scale=1):
                gr.Markdown("### 📤 Upload Document")
                file_input = gr.File(
                    label="Upload a PDF or TXT file",
                    file_types=[".pdf", ".txt"],
                    type="filepath",
                    elem_id="file-upload",
                )
                generate_btn = gr.Button(
                    "🎙️ Generate Audio",
                    variant="primary",
                    size="lg",
                    elem_id="generate-btn",
                )
                status_output = gr.Markdown(
                    value="*Upload a document and click Generate to start.*",
                    elem_classes=["status-box"],
                )
            # Right column: output
            with gr.Column(scale=1):
                gr.Markdown("### 🎧 Generated Audio")
                audio_output = gr.Audio(
                    label="Audio Narration",
                    type="filepath",
                    elem_id="audio-player",
                    interactive=False,
                )
                gr.Markdown("### ✍️ Generated Script")
                script_output = gr.Textbox(
                    label="Spoken Script",
                    lines=12,
                    max_lines=20,
                    interactive=False,
                    placeholder="The generated script will appear here...",
                    elem_id="script-display",
                )
        # Wire up the generate button; the output order must match the
        # (script, audio_path, status) tuple returned by process_document.
        generate_btn.click(
            fn=process_document,
            inputs=[file_input],
            outputs=[script_output, audio_output, status_output],
        )
        # Footer
        gr.Markdown(
            "<center style='color: #9ca3af; margin-top: 1rem;'>"
            "Built with ❤️ using Mistral-7B-Instruct · Qwen3-TTS · Edge-TTS · Gradio"
            "</center>"
        )
    return app
# ── Entry Point ──────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Script entry: announce startup, assemble the UI, then serve it.
    logger.info("Starting VoiceVerse AI...")
    app = build_ui()
    # Launch settings kept together so they are easy to scan and adjust.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    app.launch(**launch_options)