""" VoiceVerse AI — Voice Generation Module (TTS). Converts generated scripts into emotionally expressive audio. Primary: Qwen3-TTS via HF Inference API (expressive, emotional) Fallback: Edge-TTS (Microsoft neural voices, CPU-only, reliable) Design decisions: - Qwen3-TTS is called through the Inference API (needs GPU, can't run locally on free tier) - Edge-TTS is the demo-safe fallback — runs on CPU, no API key needed - Architecture accepts a voice_id parameter for future multi-voice support - Audio is saved as WAV for maximum compatibility """ import os import asyncio import tempfile from utils import logger, get_temp_filepath # ── Configuration ──────────────────────────────────────────────────────────── QWEN_TTS_MODEL = "Qwen/Qwen3-TTS" EDGE_TTS_VOICE = "en-US-AriaNeural" # Expressive female neural voice # Chunk size for TTS (too-long text can cause issues) TTS_MAX_CHARS = 3000 # ── Qwen TTS (Primary — via HF Inference API) ─────────────────────────────── def generate_audio_qwen(text: str, voice_id: str | None = None) -> str | None: """ Generate audio using Qwen3-TTS via the HF Inference API. Args: text: The script text to convert to speech voice_id: Reserved for future multi-voice support Returns: Path to the generated audio file, or None if failed """ token = os.environ.get("HF_TOKEN") if not token: logger.warning("HF_TOKEN not set — skipping Qwen TTS") return None try: from huggingface_hub import InferenceClient client = InferenceClient(token=token) logger.info("Calling Qwen3-TTS API (%d chars)...", len(text)) # Truncate if needed tts_text = text[:TTS_MAX_CHARS] if len(text) > TTS_MAX_CHARS else text # Call the TTS endpoint audio_bytes = client.text_to_speech( text=tts_text, model=QWEN_TTS_MODEL, ) if audio_bytes and len(audio_bytes) > 0: output_path = get_temp_filepath(suffix=".wav") with open(output_path, "wb") as f: f.write(audio_bytes) logger.info("Qwen TTS audio saved: %s (%d bytes)", output_path, len(audio_bytes)) return output_path else: logger.warning("Qwen TTS returned empty audio") return None except Exception as e: logger.warning("Qwen TTS failed: %s — will fall back to Edge-TTS", e) return None # ── Edge TTS (Fallback — CPU-only, no API key) ────────────────────────────── def generate_audio_edge(text: str, voice_id: str | None = None) -> str: """ Generate audio using Edge-TTS (Microsoft neural voices). Runs entirely on CPU, no API key required. Args: text: The script text to convert to speech voice_id: Edge-TTS voice name (default: en-US-AriaNeural) Returns: Path to the generated audio file """ import edge_tts voice = voice_id or EDGE_TTS_VOICE output_path = get_temp_filepath(suffix=".mp3") # Truncate if needed tts_text = text[:TTS_MAX_CHARS] if len(text) > TTS_MAX_CHARS else text logger.info("Generating audio via Edge-TTS (voice: %s, %d chars)...", voice, len(tts_text)) # Edge-TTS is async, so we need to run it in an event loop async def _generate(): communicate = edge_tts.Communicate(tts_text, voice) await communicate.save(output_path) # Handle event loop — works whether called from sync or async context try: loop = asyncio.get_event_loop() if loop.is_running(): # We're inside an existing event loop (e.g., Gradio) import concurrent.futures with concurrent.futures.ThreadPoolExecutor() as executor: future = executor.submit(asyncio.run, _generate()) future.result(timeout=120) else: loop.run_until_complete(_generate()) except RuntimeError: asyncio.run(_generate()) file_size = os.path.getsize(output_path) logger.info("Edge-TTS audio saved: %s (%d bytes)", output_path, file_size) if file_size == 0: raise RuntimeError("Edge-TTS generated an empty audio file") return output_path # ── Unified Interface ──────────────────────────────────────────────────────── def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]: """ Generate audio from text, trying Qwen TTS first, falling back to Edge-TTS. Args: text: The script text to convert to speech voice_id: Optional voice identifier Returns: Tuple of (audio_file_path, engine_used) """ if not text or not text.strip(): raise ValueError("No text provided for audio generation.") # Try Qwen TTS first (expressive, emotional) logger.info("Attempting Qwen3-TTS (primary)...") audio_path = generate_audio_qwen(text, voice_id) if audio_path and os.path.exists(audio_path): return audio_path, "Qwen3-TTS" # Fall back to Edge-TTS (reliable, CPU-only) logger.info("Falling back to Edge-TTS...") audio_path = generate_audio_edge(text, voice_id) return audio_path, "Edge-TTS"