File size: 5,728 Bytes
8c369f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""

VoiceVerse AI β€” Voice Generation Module (TTS).



Converts generated scripts into emotionally expressive audio.



Primary:  Qwen3-TTS via HF Inference API (expressive, emotional)

Fallback: Edge-TTS (Microsoft neural voices, CPU-only, reliable)



Design decisions:

  - Qwen3-TTS is called through the Inference API (needs GPU, can't run locally on free tier)

  - Edge-TTS is the demo-safe fallback β€” runs on CPU, no API key needed

  - Architecture accepts a voice_id parameter for future multi-voice support

  - Audio is saved as WAV for maximum compatibility

"""

import os
import asyncio
import tempfile
from utils import logger, get_temp_filepath

# ── Configuration ────────────────────────────────────────────────────────────

QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"
EDGE_TTS_VOICE = "en-US-AriaNeural"  # Expressive female neural voice

# Chunk size for TTS (too-long text can cause issues)
TTS_MAX_CHARS = 3000


# ── Qwen TTS (Primary β€” via HF Inference API) ───────────────────────────────

def generate_audio_qwen(text: str, voice_id: str | None = None) -> str | None:
    """Generate audio using Qwen3-TTS via the HF Inference API.

    Args:
        text: The script text to convert to speech.
        voice_id: Reserved for future multi-voice support (currently unused).

    Returns:
        Path to the generated audio file, or None if generation failed
        (missing HF_TOKEN, API error, or empty response) so the caller can
        fall back to Edge-TTS.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        logger.warning("HF_TOKEN not set β€” skipping Qwen TTS")
        return None

    try:
        # Imported lazily so the module loads even without huggingface_hub.
        from huggingface_hub import InferenceClient

        client = InferenceClient(token=token)
        logger.info("Calling Qwen3-TTS API (%d chars)...", len(text))

        # Truncate overly long scripts β€” slicing is a no-op for short text.
        tts_text = text[:TTS_MAX_CHARS]

        # Call the TTS endpoint; returns raw audio bytes.
        audio_bytes = client.text_to_speech(
            text=tts_text,
            model=QWEN_TTS_MODEL,
        )

        if audio_bytes:
            output_path = get_temp_filepath(suffix=".wav")
            with open(output_path, "wb") as f:
                f.write(audio_bytes)
            logger.info("Qwen TTS audio saved: %s (%d bytes)", output_path, len(audio_bytes))
            return output_path

        logger.warning("Qwen TTS returned empty audio")
        return None

    except Exception as e:
        # Broad catch is deliberate: any API failure must not abort the
        # pipeline β€” the unified interface falls back to Edge-TTS.
        logger.warning("Qwen TTS failed: %s β€” will fall back to Edge-TTS", e)
        return None


# ── Edge TTS (Fallback β€” CPU-only, no API key) ──────────────────────────────

def generate_audio_edge(text: str, voice_id: str | None = None) -> str:
    """Generate audio using Edge-TTS (Microsoft neural voices).

    Runs entirely on CPU, no API key required.

    Args:
        text: The script text to convert to speech.
        voice_id: Edge-TTS voice name (default: en-US-AriaNeural).

    Returns:
        Path to the generated audio file (MP3).

    Raises:
        RuntimeError: If Edge-TTS produced an empty audio file.
    """
    import edge_tts

    voice = voice_id or EDGE_TTS_VOICE
    output_path = get_temp_filepath(suffix=".mp3")

    # Truncate overly long scripts β€” slicing is a no-op for short text.
    tts_text = text[:TTS_MAX_CHARS]

    logger.info("Generating audio via Edge-TTS (voice: %s, %d chars)...", voice, len(tts_text))

    # Edge-TTS is async, so we need to run it in an event loop.
    async def _generate():
        communicate = edge_tts.Communicate(tts_text, voice)
        await communicate.save(output_path)

    # Handle event loop β€” works whether called from sync or async context.
    # asyncio.get_event_loop() is deprecated outside a running loop (3.10+),
    # so probe with get_running_loop(), which raises RuntimeError when no
    # loop is running in this thread.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No running loop: safe to run the coroutine directly.
        asyncio.run(_generate())
    else:
        # We're inside an existing event loop (e.g., Gradio). asyncio.run()
        # would raise here, so run it in a worker thread with its own loop.
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.submit(asyncio.run, _generate()).result(timeout=120)

    file_size = os.path.getsize(output_path)
    logger.info("Edge-TTS audio saved: %s (%d bytes)", output_path, file_size)

    if file_size == 0:
        raise RuntimeError("Edge-TTS generated an empty audio file")

    return output_path


# ── Unified Interface ────────────────────────────────────────────────────────

def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]:
    """Convert text to speech, preferring Qwen3-TTS with an Edge-TTS fallback.

    Args:
        text: The script text to convert to speech.
        voice_id: Optional voice identifier.

    Returns:
        Tuple of (audio_file_path, engine_used).

    Raises:
        ValueError: If no non-whitespace text was provided.
    """
    # Guard clause: nothing to synthesize.
    if not text or not text.strip():
        raise ValueError("No text provided for audio generation.")

    # Primary engine: expressive, emotional Qwen3-TTS.
    logger.info("Attempting Qwen3-TTS (primary)...")
    qwen_path = generate_audio_qwen(text, voice_id)
    if qwen_path and os.path.exists(qwen_path):
        return qwen_path, "Qwen3-TTS"

    # Fallback engine: reliable, CPU-only Edge-TTS.
    logger.info("Falling back to Edge-TTS...")
    return generate_audio_edge(text, voice_id), "Edge-TTS"