# voiceverse-ai / tts.py — uploaded by Isshi14 ("Upload 8 files", commit 8c369f8, verified)
"""
VoiceVerse AI β€” Voice Generation Module (TTS).
Converts generated scripts into emotionally expressive audio.
Primary: Qwen3-TTS via HF Inference API (expressive, emotional)
Fallback: Edge-TTS (Microsoft neural voices, CPU-only, reliable)
Design decisions:
- Qwen3-TTS is called through the Inference API (needs GPU, can't run locally on free tier)
- Edge-TTS is the demo-safe fallback β€” runs on CPU, no API key needed
- Architecture accepts a voice_id parameter for future multi-voice support
- Qwen audio is saved as WAV; the Edge-TTS fallback saves MP3
"""
import os
import asyncio
import tempfile
from utils import logger, get_temp_filepath
# ── Configuration ────────────────────────────────────────────────────────────
QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"
EDGE_TTS_VOICE = "en-US-AriaNeural" # Expressive female neural voice
# Chunk size for TTS (too-long text can cause issues)
TTS_MAX_CHARS = 3000
# ── Qwen TTS (Primary β€” via HF Inference API) ───────────────────────────────
def generate_audio_qwen(text: str, voice_id: str | None = None) -> str | None:
    """
    Synthesize speech from *text* with Qwen3-TTS via the HF Inference API.

    Args:
        text: Script text to convert to speech.
        voice_id: Reserved for future multi-voice support (currently unused).

    Returns:
        Path to the saved WAV file, or None on any failure (missing token,
        API error, or empty response) so the caller can fall back.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        logger.warning("HF_TOKEN not set β€” skipping Qwen TTS")
        return None
    try:
        from huggingface_hub import InferenceClient

        client = InferenceClient(token=token)
        logger.info("Calling Qwen3-TTS API (%d chars)...", len(text))
        # Keep the request within the safe character budget.
        tts_text = text[:TTS_MAX_CHARS] if len(text) > TTS_MAX_CHARS else text
        audio_bytes = client.text_to_speech(
            text=tts_text,
            model=QWEN_TTS_MODEL,
        )
        if not audio_bytes:
            logger.warning("Qwen TTS returned empty audio")
            return None
        output_path = get_temp_filepath(suffix=".wav")
        with open(output_path, "wb") as f:
            f.write(audio_bytes)
        logger.info("Qwen TTS audio saved: %s (%d bytes)", output_path, len(audio_bytes))
        return output_path
    except Exception as e:
        # Any failure here is non-fatal: the caller falls back to Edge-TTS.
        logger.warning("Qwen TTS failed: %s β€” will fall back to Edge-TTS", e)
        return None
# ── Edge TTS (Fallback β€” CPU-only, no API key) ──────────────────────────────
def generate_audio_edge(text: str, voice_id: str | None = None) -> str:
    """
    Generate audio using Edge-TTS (Microsoft neural voices).
    Runs entirely on CPU, no API key required.

    Args:
        text: The script text to convert to speech
        voice_id: Edge-TTS voice name (default: en-US-AriaNeural)

    Returns:
        Path to the generated audio file (MP3)

    Raises:
        RuntimeError: If Edge-TTS produced an empty audio file
    """
    import edge_tts
    voice = voice_id or EDGE_TTS_VOICE
    output_path = get_temp_filepath(suffix=".mp3")
    # Truncate if needed (over-long text can cause issues with the service)
    tts_text = text[:TTS_MAX_CHARS]
    logger.info("Generating audio via Edge-TTS (voice: %s, %d chars)...", voice, len(tts_text))

    # Edge-TTS is async, so we need to run it in an event loop
    async def _generate():
        communicate = edge_tts.Communicate(tts_text, voice)
        await communicate.save(output_path)

    # asyncio.get_event_loop() is deprecated since 3.10 and misbehaves when no
    # loop exists in the current thread; probe for a *running* loop instead.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running in this thread β€” safe to run the coroutine directly.
        asyncio.run(_generate())
    else:
        # We're inside an existing event loop (e.g., Gradio). asyncio.run()
        # would raise here, so execute the coroutine in a dedicated thread.
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future = executor.submit(asyncio.run, _generate())
            future.result(timeout=120)

    # Validate before declaring success: an empty file means TTS failed silently.
    file_size = os.path.getsize(output_path)
    if file_size == 0:
        raise RuntimeError("Edge-TTS generated an empty audio file")
    logger.info("Edge-TTS audio saved: %s (%d bytes)", output_path, file_size)
    return output_path
# ── Unified Interface ────────────────────────────────────────────────────────
def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]:
"""
Generate audio from text, trying Qwen TTS first, falling back to Edge-TTS.
Args:
text: The script text to convert to speech
voice_id: Optional voice identifier
Returns:
Tuple of (audio_file_path, engine_used)
"""
if not text or not text.strip():
raise ValueError("No text provided for audio generation.")
# Try Qwen TTS first (expressive, emotional)
logger.info("Attempting Qwen3-TTS (primary)...")
audio_path = generate_audio_qwen(text, voice_id)
if audio_path and os.path.exists(audio_path):
return audio_path, "Qwen3-TTS"
# Fall back to Edge-TTS (reliable, CPU-only)
logger.info("Falling back to Edge-TTS...")
audio_path = generate_audio_edge(text, voice_id)
return audio_path, "Edge-TTS"