Spaces:
Sleeping
Sleeping
File size: 5,728 Bytes
"""
VoiceVerse AI — Voice Generation Module (TTS).

Converts generated scripts into emotionally expressive audio.

Primary: Qwen3-TTS via HF Inference API (expressive, emotional)
Fallback: Edge-TTS (Microsoft neural voices, CPU-only, reliable)

Design decisions:
- Qwen3-TTS is called through the Inference API (needs GPU, can't run locally on free tier)
- Edge-TTS is the demo-safe fallback — runs on CPU, no API key needed
- Architecture accepts a voice_id parameter for future multi-voice support
- Audio is saved as WAV (Qwen path) or MP3 (Edge-TTS fallback)
"""
import os
import asyncio
import tempfile
from utils import logger, get_temp_filepath
# ── Configuration ────────────────────────────────────────────────────────────
QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"  # served via HF Inference API (needs HF_TOKEN)
EDGE_TTS_VOICE = "en-US-AriaNeural" # Expressive female neural voice
# Chunk size for TTS (too-long text can cause issues)
TTS_MAX_CHARS = 3000
# ── Qwen TTS (Primary — via HF Inference API) ───────────────────────────────
def generate_audio_qwen(text: str, voice_id: str | None = None) -> str | None:
    """
    Generate audio using Qwen3-TTS via the HF Inference API.

    Args:
        text: The script text to convert to speech (truncated to TTS_MAX_CHARS).
        voice_id: Reserved for future multi-voice support (currently unused).

    Returns:
        Path to the generated WAV file, or None if HF_TOKEN is missing or
        the API call failed (callers then fall back to Edge-TTS).
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        logger.warning("HF_TOKEN not set — skipping Qwen TTS")
        return None
    try:
        # Imported lazily so the module loads even without huggingface_hub.
        from huggingface_hub import InferenceClient

        client = InferenceClient(token=token)
        logger.info("Calling Qwen3-TTS API (%d chars)...", len(text))
        # Slicing is a no-op when the text is already short enough.
        tts_text = text[:TTS_MAX_CHARS]
        audio_bytes = client.text_to_speech(
            text=tts_text,
            model=QWEN_TTS_MODEL,
        )
        if audio_bytes:
            output_path = get_temp_filepath(suffix=".wav")
            with open(output_path, "wb") as f:
                f.write(audio_bytes)
            logger.info("Qwen TTS audio saved: %s (%d bytes)", output_path, len(audio_bytes))
            return output_path
        logger.warning("Qwen TTS returned empty audio")
        return None
    except Exception as e:
        # Broad catch is deliberate: any API failure (auth, model loading,
        # network) must degrade gracefully to the Edge-TTS fallback.
        logger.warning("Qwen TTS failed: %s — will fall back to Edge-TTS", e)
        return None
# ── Edge TTS (Fallback — CPU-only, no API key) ──────────────────────────────
def generate_audio_edge(text: str, voice_id: str | None = None) -> str:
    """
    Generate audio using Edge-TTS (Microsoft neural voices).

    Runs entirely on CPU, no API key required.

    Args:
        text: The script text to convert to speech (truncated to TTS_MAX_CHARS).
        voice_id: Edge-TTS voice name (default: en-US-AriaNeural).

    Returns:
        Path to the generated MP3 file (Edge-TTS's native output format).

    Raises:
        RuntimeError: If the generated audio file is empty.
    """
    import edge_tts

    voice = voice_id or EDGE_TTS_VOICE
    output_path = get_temp_filepath(suffix=".mp3")
    # Slicing is a no-op when the text is already short enough.
    tts_text = text[:TTS_MAX_CHARS]
    logger.info("Generating audio via Edge-TTS (voice: %s, %d chars)...", voice, len(tts_text))

    # Edge-TTS is async, so we need to run it in an event loop.
    async def _generate():
        communicate = edge_tts.Communicate(tts_text, voice)
        await communicate.save(output_path)

    # asyncio.get_event_loop() is deprecated outside a running loop; probe
    # for a running loop explicitly instead.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running (plain sync context) — safe to run directly.
        asyncio.run(_generate())
    else:
        # We're inside an existing event loop (e.g., Gradio): run the
        # coroutine on a fresh loop in a worker thread to avoid deadlocking
        # the caller's loop.
        import concurrent.futures

        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.submit(asyncio.run, _generate()).result(timeout=120)

    # Validate the output before claiming success in the log.
    file_size = os.path.getsize(output_path)
    if file_size == 0:
        raise RuntimeError("Edge-TTS generated an empty audio file")
    logger.info("Edge-TTS audio saved: %s (%d bytes)", output_path, file_size)
    return output_path
# ── Unified Interface ────────────────────────────────────────────────────────
def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]:
    """
    Generate audio from text, trying Qwen TTS first, falling back to Edge-TTS.

    Args:
        text: The script text to convert to speech.
        voice_id: Optional voice identifier (passed through to the engine).

    Returns:
        Tuple of (audio_file_path, engine_used).

    Raises:
        ValueError: If text is empty or whitespace-only.
    """
    if not text or not text.strip():
        raise ValueError("No text provided for audio generation.")
    # Try Qwen TTS first (expressive, emotional). Reject zero-byte output
    # too, so a silently-failed primary still triggers the fallback.
    logger.info("Attempting Qwen3-TTS (primary)...")
    audio_path = generate_audio_qwen(text, voice_id)
    if audio_path and os.path.exists(audio_path) and os.path.getsize(audio_path) > 0:
        return audio_path, "Qwen3-TTS"
    # Fall back to Edge-TTS (reliable, CPU-only).
    logger.info("Falling back to Edge-TTS...")
    audio_path = generate_audio_edge(text, voice_id)
    return audio_path, "Edge-TTS"
|