File size: 12,470 Bytes
3828c7d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
"""
VoiceVerse AI β€” TTS Module.

Primary:  Qwen3-TTS via HF Inference API
Fallback: Edge-TTS (CPU, no key needed)

Voice + audio style per mode:
  Summary   β€” neutral female voice, normal rate
  Podcast   β€” HOST_1 female (AriaNeural) / HOST_2 male (GuyNeural)
  Rap       β€” male voice, faster rate (+40%), bass boost via pydub
  Song      β€” female voice, normal rate
  Debate    β€” DEBATER_A female (JennyNeural, +8%) / DEBATER_B male (DavisNeural, -5%)
  Story     β€” female voice, slow rate (-30%), long silence gaps between sentences
"""

import os
import re
import asyncio
from utils import logger, get_temp_filepath

# Hugging Face model id used by _qwen_tts for the primary TTS path.
QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"
# All TTS input is truncated to this many characters before synthesis.
TTS_MAX_CHARS  = 3000

# ── Voice assignments ─────────────────────────────────────────────────────────
# Summary / Song / Story β€” single female voice
EDGE_VOICE_FEMALE        = "en-US-AriaNeural"

# Podcast
EDGE_VOICE_HOST_FEMALE   = "en-US-AriaNeural"    # HOST_1 β€” female
EDGE_VOICE_HOST_MALE     = "en-US-GuyNeural"     # HOST_2 β€” male

# Rap β€” male voice reads the rap
EDGE_VOICE_RAP           = "en-US-GuyNeural"
RAP_RATE                 = "+40%"                 # fast delivery

# Debate
EDGE_VOICE_DEBATER_A     = "en-US-JennyNeural"   # female, pro β€” assertive
EDGE_VOICE_DEBATER_B     = "en-US-DavisNeural"   # male, con  β€” skeptical
DEBATE_RATE_A            = "+8%"                  # slightly faster
DEBATE_RATE_B            = "-5%"                  # slightly slower, deliberate

# Story β€” slow, warm delivery
EDGE_VOICE_STORY         = "en-US-AriaNeural"
STORY_RATE               = "-30%"                 # noticeably slower


# ══════════════════════════════════════════════════════════════════════════════
# Low-level TTS helpers
# ══════════════════════════════════════════════════════════════════════════════

def _qwen_tts(text: str) -> str | None:
    """
    Synthesize speech with Qwen3-TTS via the HF Inference API.

    Returns the path of the generated .wav file, or None when no HF_TOKEN
    is configured, the API returns no audio, or any error occurs — the
    caller then falls back to Edge-TTS.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        return None
    try:
        from huggingface_hub import InferenceClient

        audio = InferenceClient(token=hf_token).text_to_speech(
            text=text[:TTS_MAX_CHARS], model=QWEN_TTS_MODEL
        )
        if not audio:
            return None
        out_path = get_temp_filepath(suffix=".wav")
        with open(out_path, "wb") as fh:
            fh.write(audio)
        logger.info("Qwen TTS: %s (%d bytes)", out_path, len(audio))
        return out_path
    except Exception as err:
        logger.warning("Qwen TTS failed: %s", err)
        return None


def _edge_tts(text: str, voice: str = EDGE_VOICE_FEMALE, rate: str = "+0%") -> str:
    """
    Generate audio via Edge-TTS.

    Args:
        text:  Text to speak (truncated to TTS_MAX_CHARS).
        voice: Edge neural voice name, e.g. "en-US-AriaNeural".
        rate:  SSML prosody rate string, e.g. "+40%" faster, "-30%" slower.

    Returns:
        Path to the generated .mp3 file.

    Raises:
        RuntimeError: if Edge-TTS produced an empty file.
    """
    import edge_tts
    path = get_temp_filepath(suffix=".mp3")
    snippet = text[:TTS_MAX_CHARS]

    async def _run():
        communicate = edge_tts.Communicate(snippet, voice, rate=rate)
        await communicate.save(path)

    # asyncio.get_event_loop() is deprecated outside a running loop (3.10+);
    # detect a running loop explicitly instead.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop in this thread β€” safe to run the coroutine directly.
        asyncio.run(_run())
    else:
        # Already inside a running loop (e.g. async web framework) β€” run the
        # coroutine in a worker thread that owns its own fresh event loop.
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            pool.submit(asyncio.run, _run()).result(timeout=120)

    if os.path.getsize(path) == 0:
        raise RuntimeError("Edge-TTS produced an empty audio file.")
    logger.info("Edge-TTS: %s (voice=%s rate=%s)", path, voice, rate)
    return path


# ══════════════════════════════════════════════════════════════════════════════
# Audio post-processing
# ══════════════════════════════════════════════════════════════════════════════

def _apply_rap_fx(path: str) -> str:
    """
    Apply bass boost to a rap audio file using pydub.
    Low-frequency boost makes it sound punchier and more rap-like.

    Returns path to processed file (new file), or the original path if
    processing fails for any reason (best-effort effect).
    """
    try:
        from pydub import AudioSegment
        from pydub.effects import low_pass_filter, high_pass_filter

        audio = AudioSegment.from_file(path)

        # BUG FIX: the previous version computed the highs as
        # `audio - low_pass_filter(audio, 200)`, but pydub's AudioSegment
        # raises TypeError when subtracting one segment from another, so the
        # effect always failed silently and the boost was never applied.
        # Split the spectrum with complementary filters instead.
        bass  = low_pass_filter(audio, 200)    # frequencies below 200 Hz
        highs = high_pass_filter(audio, 200)   # frequencies above 200 Hz

        # Boost bass by 6 dB, keep highs as-is, mix the two back together.
        boosted = (bass + 6).overlay(highs)

        out = get_temp_filepath(suffix=".mp3")
        boosted.export(out, format="mp3")
        logger.info("Rap bass boost applied β†’ %s", out)
        return out
    except Exception as e:
        logger.warning("Rap FX failed (%s) β€” returning original audio", e)
        return path


def _concat(paths: list[str], silence_ms: int = 300) -> str:
    """
    Concatenate audio files with silence between each segment.

    Args:
        paths:      Audio file paths, in playback order (must be non-empty).
        silence_ms: Gap inserted BETWEEN consecutive segments.

    Returns:
        Path to the combined .mp3, or the first input path if pydub fails.

    Raises:
        ValueError: if paths is empty.
    """
    if not paths:
        # Previously an empty list crashed with an uncaught IndexError in the
        # fallback branch; fail loudly and clearly instead.
        raise ValueError("No audio segments to concatenate.")
    if len(paths) == 1:
        return paths[0]
    try:
        from pydub import AudioSegment
        silence  = AudioSegment.silent(duration=silence_ms)
        # Insert silence only between segments — the previous version also
        # appended a trailing gap after the final segment.
        combined = AudioSegment.from_file(paths[0])
        for p in paths[1:]:
            combined += silence + AudioSegment.from_file(p)
        out = get_temp_filepath(suffix=".mp3")
        combined.export(out, format="mp3")
        logger.info("Concatenated %d segments β†’ %s", len(paths), out)
        return out
    except Exception as e:
        logger.warning("pydub concat failed (%s) β€” returning first segment", e)
        return paths[0]


def _add_story_gaps(path: str) -> str:
    """
    Insert longer silence gaps between sentences in story audio.
    Gives the warm, unhurried feel of a storyteller.

    Returns the path of the re-rendered file, or the original path if
    processing fails (best-effort effect).
    """
    try:
        from pydub import AudioSegment

        source = AudioSegment.from_file(path)
        pause  = AudioSegment.silent(duration=600)   # 600 ms between sentences
        # Split on natural pauses (every ~5 seconds of audio) and re-join with gaps
        step    = 5000
        rebuilt = AudioSegment.empty()
        for start in range(0, len(source), step):
            rebuilt += source[start:start + step] + pause
        out_path = AudioSegment and get_temp_filepath(suffix=".mp3")
        rebuilt.export(out_path, format="mp3")
        logger.info("Story gaps applied β†’ %s", out_path)
        return out_path
    except Exception as exc:
        logger.warning("Story gap insertion failed (%s) β€” returning original", exc)
        return path


# ══════════════════════════════════════════════════════════════════════════════
# Dialogue script parser
# ══════════════════════════════════════════════════════════════════════════════

def _parse_dialogue(script: str, tag_a: str, tag_b: str) -> list[tuple[str, str]]:
    """Parse a HOST_X / DEBATER_X tagged script into (speaker, text) segments."""
    segments: list[tuple[str, str]] = []
    prefix_a = f"{tag_a}:"
    prefix_b = f"{tag_b}:"

    for line in script.splitlines():
        line = line.strip()
        if line.startswith(prefix_a):
            text = line[len(prefix_a):].strip()
            if text:
                if segments and segments[-1][0] == tag_a:
                    segments[-1] = (tag_a, segments[-1][1] + " " + text)
                else:
                    segments.append((tag_a, text))
        elif line.startswith(prefix_b):
            text = line[len(prefix_b):].strip()
            if text:
                if segments and segments[-1][0] == tag_b:
                    segments[-1] = (tag_b, segments[-1][1] + " " + text)
                else:
                    segments.append((tag_b, text))
    return segments


# ══════════════════════════════════════════════════════════════════════════════
# Per-mode audio generators
# ══════════════════════════════════════════════════════════════════════════════

def generate_audio_podcast(script: str) -> tuple[str, str]:
    """
    Podcast: HOST_1 = female (AriaNeural), HOST_2 = male (GuyNeural).
    Normal conversational rate, 300 ms silence between turns.
    Falls back to single-voice generate_audio() if the script has no tags.
    """
    turns = _parse_dialogue(script, "HOST_1", "HOST_2")
    if not turns:
        logger.warning("No HOST tags β€” falling back to single voice")
        return generate_audio(script)

    styling = {
        "HOST_1": (EDGE_VOICE_HOST_FEMALE, "+0%"),
        "HOST_2": (EDGE_VOICE_HOST_MALE,   "+0%"),
    }
    rendered: list[str] = []
    for who, line in turns:
        chosen_voice, chosen_rate = styling[who]
        try:
            rendered.append(_edge_tts(line, voice=chosen_voice, rate=chosen_rate))
        except Exception as exc:
            # Best-effort: skip a failed turn rather than abort the episode.
            logger.warning("Podcast segment failed %s: %s", who, exc)

    if not rendered:
        raise RuntimeError("All podcast segments failed.")
    return _concat(rendered, silence_ms=300), "Edge-TTS (Podcast)"


def generate_audio_debate(script: str) -> tuple[str, str]:
    """
    Debate: DEBATER_A = female (JennyNeural, assertive +8%),
            DEBATER_B = male   (DavisNeural, deliberate -5%).
    400 ms silence between turns for debate feel.
    Falls back to single-voice generate_audio() if the script has no tags.
    """
    turns = _parse_dialogue(script, "DEBATER_A", "DEBATER_B")
    if not turns:
        logger.warning("No DEBATER tags β€” falling back to single voice")
        return generate_audio(script)

    styling = {
        "DEBATER_A": (EDGE_VOICE_DEBATER_A, DEBATE_RATE_A),
        "DEBATER_B": (EDGE_VOICE_DEBATER_B, DEBATE_RATE_B),
    }
    rendered: list[str] = []
    for who, line in turns:
        chosen_voice, chosen_rate = styling[who]
        try:
            rendered.append(_edge_tts(line, voice=chosen_voice, rate=chosen_rate))
        except Exception as exc:
            # Best-effort: skip a failed turn rather than abort the debate.
            logger.warning("Debate segment failed %s: %s", who, exc)

    if not rendered:
        raise RuntimeError("All debate segments failed.")
    return _concat(rendered, silence_ms=400), "Edge-TTS (Debate)"


def generate_audio_rap(script: str) -> tuple[str, str]:
    """
    Rap: male voice, fast rate (+40%), then bass boost applied via pydub.
    """
    raw_path = _edge_tts(script, voice=EDGE_VOICE_RAP, rate=RAP_RATE)
    return _apply_rap_fx(raw_path), "Edge-TTS (Rap)"


def generate_audio_story(script: str) -> tuple[str, str]:
    """
    Story: female voice, slow rate (-30%), then sentence gaps widened via pydub.
    """
    raw_path = _edge_tts(script, voice=EDGE_VOICE_STORY, rate=STORY_RATE)
    return _add_story_gaps(raw_path), "Edge-TTS (Story)"


# ══════════════════════════════════════════════════════════════════════════════
# Unified public interface
# ══════════════════════════════════════════════════════════════════════════════

def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]:
    """Single-voice TTS for Summary and Song modes. Tries Qwen first."""
    if not (text and text.strip()):
        raise ValueError("No text provided for audio generation.")
    qwen_path = _qwen_tts(text)
    if qwen_path and os.path.exists(qwen_path):
        return qwen_path, "Qwen3-TTS"
    chosen_voice = voice_id if voice_id else EDGE_VOICE_FEMALE
    return _edge_tts(text, voice=chosen_voice), "Edge-TTS"