| """ | |
| VoiceVerse AI β Voice Generation Module (TTS). | |
| Converts generated scripts into emotionally expressive audio. | |
| Primary: Qwen3-TTS via HF Inference API (expressive, emotional) | |
| Fallback: Edge-TTS (Microsoft neural voices, CPU-only, reliable) | |
| Design decisions: | |
| - Qwen3-TTS is called through the Inference API (needs GPU, can't run locally on free tier) | |
| - Edge-TTS is the demo-safe fallback β runs on CPU, no API key needed | |
| - Architecture accepts a voice_id parameter for future multi-voice support | |
| - Audio is saved as WAV for maximum compatibility | |
| """ | |
import os
import asyncio
import tempfile  # NOTE(review): appears unused here — temp paths come from get_temp_filepath; verify before removing
from utils import logger, get_temp_filepath

# ── Configuration ────────────────────────────────────────────────────────────

# Primary TTS model, invoked remotely through the HF Inference API.
QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"

# Fallback Edge-TTS voice: an expressive female neural voice.
EDGE_TTS_VOICE = "en-US-AriaNeural"

# Maximum characters sent to a TTS engine per call (too-long text can cause issues).
TTS_MAX_CHARS = 3000
| # ββ Qwen TTS (Primary β via HF Inference API) βββββββββββββββββββββββββββββββ | |
def generate_audio_qwen(text: str, voice_id: str | None = None) -> str | None:
    """
    Synthesize speech from *text* with Qwen3-TTS through the HF Inference API.

    Args:
        text: Script text to convert to speech.
        voice_id: Reserved for future multi-voice support (currently unused).

    Returns:
        Path to the written audio file, or None when generation is not
        possible (missing token, API error, or empty response).
    """
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        logger.warning("HF_TOKEN not set — skipping Qwen TTS")
        return None

    try:
        from huggingface_hub import InferenceClient

        api = InferenceClient(token=hf_token)
        logger.info("Calling Qwen3-TTS API (%d chars)...", len(text))

        # Keep the request within the engine's safe length.
        payload = text if len(text) <= TTS_MAX_CHARS else text[:TTS_MAX_CHARS]

        audio = api.text_to_speech(
            text=payload,
            model=QWEN_TTS_MODEL,
        )

        if not audio:
            logger.warning("Qwen TTS returned empty audio")
            return None

        out_path = get_temp_filepath(suffix=".wav")
        with open(out_path, "wb") as fh:
            fh.write(audio)
        logger.info("Qwen TTS audio saved: %s (%d bytes)", out_path, len(audio))
        return out_path
    except Exception as exc:
        # Any failure here is non-fatal: the caller falls back to Edge-TTS.
        logger.warning("Qwen TTS failed: %s — will fall back to Edge-TTS", exc)
        return None
| # ββ Edge TTS (Fallback β CPU-only, no API key) ββββββββββββββββββββββββββββββ | |
def generate_audio_edge(text: str, voice_id: str | None = None) -> str:
    """
    Generate audio using Edge-TTS (Microsoft neural voices).

    Runs entirely on CPU, no API key required.

    Args:
        text: The script text to convert to speech
        voice_id: Edge-TTS voice name (default: en-US-AriaNeural)

    Returns:
        Path to the generated audio file (MP3)

    Raises:
        RuntimeError: If the generated audio file is empty.
    """
    import edge_tts

    voice = voice_id or EDGE_TTS_VOICE
    output_path = get_temp_filepath(suffix=".mp3")

    # Truncate if needed (too-long text can cause issues); slicing is a no-op
    # when the text is already short enough.
    tts_text = text[:TTS_MAX_CHARS]

    logger.info("Generating audio via Edge-TTS (voice: %s, %d chars)...", voice, len(tts_text))

    # Edge-TTS is async, so we need to run it in an event loop
    async def _generate() -> None:
        communicate = edge_tts.Communicate(tts_text, voice)
        await communicate.save(output_path)

    # Fix: the previous version used the deprecated asyncio.get_event_loop(),
    # and its trailing `except RuntimeError: asyncio.run(...)` could also
    # swallow a RuntimeError raised by the generation itself and silently
    # re-run it. get_running_loop() raises RuntimeError *only* when no loop
    # is running, which is exactly the condition we need to detect.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running — plain synchronous context.
        asyncio.run(_generate())
    else:
        # Inside an existing event loop (e.g., Gradio): asyncio.run() cannot
        # be nested, so run the coroutine on a fresh loop in a worker thread.
        import concurrent.futures

        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.submit(asyncio.run, _generate()).result(timeout=120)

    file_size = os.path.getsize(output_path)
    logger.info("Edge-TTS audio saved: %s (%d bytes)", output_path, file_size)
    if file_size == 0:
        raise RuntimeError("Edge-TTS generated an empty audio file")
    return output_path
| # ββ Unified Interface ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]: | |
| """ | |
| Generate audio from text, trying Qwen TTS first, falling back to Edge-TTS. | |
| Args: | |
| text: The script text to convert to speech | |
| voice_id: Optional voice identifier | |
| Returns: | |
| Tuple of (audio_file_path, engine_used) | |
| """ | |
| if not text or not text.strip(): | |
| raise ValueError("No text provided for audio generation.") | |
| # Try Qwen TTS first (expressive, emotional) | |
| logger.info("Attempting Qwen3-TTS (primary)...") | |
| audio_path = generate_audio_qwen(text, voice_id) | |
| if audio_path and os.path.exists(audio_path): | |
| return audio_path, "Qwen3-TTS" | |
| # Fall back to Edge-TTS (reliable, CPU-only) | |
| logger.info("Falling back to Edge-TTS...") | |
| audio_path = generate_audio_edge(text, voice_id) | |
| return audio_path, "Edge-TTS" | |