File size: 5,728 Bytes
8c369f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""

VoiceVerse AI β€” Voice Generation Module (TTS).



Converts generated scripts into emotionally expressive audio.



Primary:  Qwen3-TTS via HF Inference API (expressive, emotional)

Fallback: Edge-TTS (Microsoft neural voices, CPU-only, reliable)



Design decisions:

  - Qwen3-TTS is called through the Inference API (needs GPU, can't run locally on free tier)

  - Edge-TTS is the demo-safe fallback β€” runs on CPU, no API key needed

  - Architecture accepts a voice_id parameter for future multi-voice support

  - Audio is saved as WAV for maximum compatibility

"""

import os
import asyncio
import tempfile
from utils import logger, get_temp_filepath

# ── Configuration ────────────────────────────────────────────────────────────

QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"
EDGE_TTS_VOICE = "en-US-AriaNeural"  # Expressive female neural voice

# Chunk size for TTS (too-long text can cause issues)
TTS_MAX_CHARS = 3000


# ── Qwen TTS (Primary β€” via HF Inference API) ───────────────────────────────

def generate_audio_qwen(text: str, voice_id: str | None = None) -> str | None:
    """Generate audio using Qwen3-TTS via the HF Inference API.

    Args:
        text: The script text to convert to speech.
        voice_id: Reserved for future multi-voice support (currently unused).

    Returns:
        Path to the generated audio file, or None if generation failed
        (missing HF_TOKEN, API error, or empty response) so the caller can
        fall back to Edge-TTS.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        logger.warning("HF_TOKEN not set β€” skipping Qwen TTS")
        return None

    try:
        # Imported lazily so the module loads even without huggingface_hub.
        from huggingface_hub import InferenceClient

        client = InferenceClient(token=token)
        logger.info("Calling Qwen3-TTS API (%d chars)...", len(text))

        # Truncate overly long scripts β€” slicing is a no-op for short text.
        tts_text = text[:TTS_MAX_CHARS]

        # Call the TTS endpoint; returns raw audio bytes.
        audio_bytes = client.text_to_speech(
            text=tts_text,
            model=QWEN_TTS_MODEL,
        )

        if audio_bytes:
            output_path = get_temp_filepath(suffix=".wav")
            with open(output_path, "wb") as f:
                f.write(audio_bytes)
            logger.info("Qwen TTS audio saved: %s (%d bytes)", output_path, len(audio_bytes))
            return output_path

        logger.warning("Qwen TTS returned empty audio")
        return None

    except Exception as e:
        # Broad catch is deliberate: any API failure must not abort the
        # pipeline β€” the unified interface falls back to Edge-TTS.
        logger.warning("Qwen TTS failed: %s β€” will fall back to Edge-TTS", e)
        return None


# ── Edge TTS (Fallback β€” CPU-only, no API key) ──────────────────────────────

def generate_audio_edge(text: str, voice_id: str | None = None) -> str:
    """Generate audio using Edge-TTS (Microsoft neural voices).

    Runs entirely on CPU, no API key required.

    Args:
        text: The script text to convert to speech.
        voice_id: Edge-TTS voice name (default: en-US-AriaNeural).

    Returns:
        Path to the generated audio file (MP3).

    Raises:
        RuntimeError: If Edge-TTS produced an empty audio file.
    """
    import edge_tts

    voice = voice_id or EDGE_TTS_VOICE
    output_path = get_temp_filepath(suffix=".mp3")

    # Truncate overly long scripts β€” slicing is a no-op for short text.
    tts_text = text[:TTS_MAX_CHARS]

    logger.info("Generating audio via Edge-TTS (voice: %s, %d chars)...", voice, len(tts_text))

    # Edge-TTS is async, so we need to run it in an event loop.
    async def _generate():
        communicate = edge_tts.Communicate(tts_text, voice)
        await communicate.save(output_path)

    # Handle event loop β€” works whether called from sync or async context.
    # asyncio.get_event_loop() is deprecated outside a running loop (3.10+),
    # so probe with get_running_loop(), which raises RuntimeError when no
    # loop is running in this thread.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No running loop: safe to run the coroutine directly.
        asyncio.run(_generate())
    else:
        # We're inside an existing event loop (e.g., Gradio). asyncio.run()
        # would raise here, so run it in a worker thread with its own loop.
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.submit(asyncio.run, _generate()).result(timeout=120)

    file_size = os.path.getsize(output_path)
    logger.info("Edge-TTS audio saved: %s (%d bytes)", output_path, file_size)

    if file_size == 0:
        raise RuntimeError("Edge-TTS generated an empty audio file")

    return output_path


# ── Unified Interface ────────────────────────────────────────────────────────

def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]:
    """Convert text to speech, preferring Qwen3-TTS with an Edge-TTS fallback.

    Args:
        text: The script text to convert to speech.
        voice_id: Optional voice identifier.

    Returns:
        Tuple of (audio_file_path, engine_used).

    Raises:
        ValueError: If no non-whitespace text was provided.
    """
    # Guard clause: nothing to synthesize.
    if not text or not text.strip():
        raise ValueError("No text provided for audio generation.")

    # Primary engine: expressive, emotional Qwen3-TTS.
    logger.info("Attempting Qwen3-TTS (primary)...")
    qwen_path = generate_audio_qwen(text, voice_id)
    if qwen_path and os.path.exists(qwen_path):
        return qwen_path, "Qwen3-TTS"

    # Fallback engine: reliable, CPU-only Edge-TTS.
    logger.info("Falling back to Edge-TTS...")
    return generate_audio_edge(text, voice_id), "Edge-TTS"