# voiceverse-ai / tts.py — uploaded by Isshi14 ("Upload 8 files", commit 8c369f8, verified)
"""
VoiceVerse AI β€” Voice Generation Module (TTS).
Converts generated scripts into emotionally expressive audio.
Primary: Qwen3-TTS via HF Inference API (expressive, emotional)
Fallback: Edge-TTS (Microsoft neural voices, CPU-only, reliable)
Design decisions:
- Qwen3-TTS is called through the Inference API (needs GPU, can't run locally on free tier)
- Edge-TTS is the demo-safe fallback β€” runs on CPU, no API key needed
- Architecture accepts a voice_id parameter for future multi-voice support
- Qwen audio is saved as WAV; the Edge-TTS fallback saves MP3
"""
import os
import asyncio
import tempfile
from utils import logger, get_temp_filepath
# ── Configuration ────────────────────────────────────────────────────────────
QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"
EDGE_TTS_VOICE = "en-US-AriaNeural" # Expressive female neural voice
# Chunk size for TTS (too-long text can cause issues)
TTS_MAX_CHARS = 3000
# ── Qwen TTS (Primary β€” via HF Inference API) ───────────────────────────────
def generate_audio_qwen(text: str, voice_id: str | None = None) -> str | None:
    """
    Synthesize speech from *text* with Qwen3-TTS via the HF Inference API.

    Args:
        text: Script text to convert to speech.
        voice_id: Reserved for future multi-voice support (currently unused).

    Returns:
        Path to the saved WAV file, or None on any failure (missing token,
        API error, or empty response) so the caller can fall back.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        logger.warning("HF_TOKEN not set β€” skipping Qwen TTS")
        return None
    try:
        from huggingface_hub import InferenceClient

        client = InferenceClient(token=token)
        logger.info("Calling Qwen3-TTS API (%d chars)...", len(text))
        # Keep the request within the safe character budget.
        tts_text = text[:TTS_MAX_CHARS] if len(text) > TTS_MAX_CHARS else text
        audio_bytes = client.text_to_speech(
            text=tts_text,
            model=QWEN_TTS_MODEL,
        )
        if not audio_bytes:
            logger.warning("Qwen TTS returned empty audio")
            return None
        output_path = get_temp_filepath(suffix=".wav")
        with open(output_path, "wb") as f:
            f.write(audio_bytes)
        logger.info("Qwen TTS audio saved: %s (%d bytes)", output_path, len(audio_bytes))
        return output_path
    except Exception as e:
        # Any failure here is non-fatal: the caller falls back to Edge-TTS.
        logger.warning("Qwen TTS failed: %s β€” will fall back to Edge-TTS", e)
        return None
# ── Edge TTS (Fallback β€” CPU-only, no API key) ──────────────────────────────
def generate_audio_edge(text: str, voice_id: str | None = None) -> str:
    """
    Generate audio using Edge-TTS (Microsoft neural voices).
    Runs entirely on CPU, no API key required.

    Args:
        text: The script text to convert to speech
        voice_id: Edge-TTS voice name (default: en-US-AriaNeural)

    Returns:
        Path to the generated audio file (MP3)

    Raises:
        RuntimeError: If Edge-TTS produced an empty audio file
    """
    import edge_tts
    voice = voice_id or EDGE_TTS_VOICE
    output_path = get_temp_filepath(suffix=".mp3")
    # Truncate if needed (over-long text can cause issues with the service)
    tts_text = text[:TTS_MAX_CHARS]
    logger.info("Generating audio via Edge-TTS (voice: %s, %d chars)...", voice, len(tts_text))

    # Edge-TTS is async, so we need to run it in an event loop
    async def _generate():
        communicate = edge_tts.Communicate(tts_text, voice)
        await communicate.save(output_path)

    # asyncio.get_event_loop() is deprecated since 3.10 and misbehaves when no
    # loop exists in the current thread; probe for a *running* loop instead.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running in this thread β€” safe to run the coroutine directly.
        asyncio.run(_generate())
    else:
        # We're inside an existing event loop (e.g., Gradio). asyncio.run()
        # would raise here, so execute the coroutine in a dedicated thread.
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future = executor.submit(asyncio.run, _generate())
            future.result(timeout=120)

    # Validate before declaring success: an empty file means TTS failed silently.
    file_size = os.path.getsize(output_path)
    if file_size == 0:
        raise RuntimeError("Edge-TTS generated an empty audio file")
    logger.info("Edge-TTS audio saved: %s (%d bytes)", output_path, file_size)
    return output_path
# ── Unified Interface ────────────────────────────────────────────────────────
def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]:
"""
Generate audio from text, trying Qwen TTS first, falling back to Edge-TTS.
Args:
text: The script text to convert to speech
voice_id: Optional voice identifier
Returns:
Tuple of (audio_file_path, engine_used)
"""
if not text or not text.strip():
raise ValueError("No text provided for audio generation.")
# Try Qwen TTS first (expressive, emotional)
logger.info("Attempting Qwen3-TTS (primary)...")
audio_path = generate_audio_qwen(text, voice_id)
if audio_path and os.path.exists(audio_path):
return audio_path, "Qwen3-TTS"
# Fall back to Edge-TTS (reliable, CPU-only)
logger.info("Falling back to Edge-TTS...")
audio_path = generate_audio_edge(text, voice_id)
return audio_path, "Edge-TTS"