""" VoiceVerse AI — TTS Module. Primary: Qwen3-TTS via HF Inference API Fallback: Edge-TTS (CPU, no key needed) Voice + audio style per mode: Summary — neutral female voice, normal rate Podcast — HOST_1 female (AriaNeural) / HOST_2 male (GuyNeural) Rap — male voice, faster rate (+40%), bass boost via pydub Song — female voice, normal rate Debate — DEBATER_A female (JennyNeural, +5%) / DEBATER_B male (DavisNeural, -5%) Story — female voice, slow rate (-30%), long silence gaps between sentences """ import os import re import asyncio from utils import logger, get_temp_filepath QWEN_TTS_MODEL = "Qwen/Qwen3-TTS" TTS_MAX_CHARS = 3000 # ── Voice assignments ───────────────────────────────────────────────────────── # Summary / Song / Story — single female voice EDGE_VOICE_FEMALE = "en-US-AriaNeural" # Podcast EDGE_VOICE_HOST_FEMALE = "en-US-AriaNeural" # HOST_1 — female EDGE_VOICE_HOST_MALE = "en-US-GuyNeural" # HOST_2 — male # Rap — male voice reads the rap EDGE_VOICE_RAP = "en-US-GuyNeural" RAP_RATE = "+40%" # fast delivery # Debate EDGE_VOICE_DEBATER_A = "en-US-JennyNeural" # female, pro — assertive EDGE_VOICE_DEBATER_B = "en-US-DavisNeural" # male, con — skeptical DEBATE_RATE_A = "+8%" # slightly faster DEBATE_RATE_B = "-5%" # slightly slower, deliberate # Story — slow, warm delivery EDGE_VOICE_STORY = "en-US-AriaNeural" STORY_RATE = "-30%" # noticeably slower # ══════════════════════════════════════════════════════════════════════════════ # Low-level TTS helpers # ══════════════════════════════════════════════════════════════════════════════ def _qwen_tts(text: str) -> str | None: token = os.environ.get("HF_TOKEN") if not token: return None try: from huggingface_hub import InferenceClient client = InferenceClient(token=token) audio_bytes = client.text_to_speech(text=text[:TTS_MAX_CHARS], model=QWEN_TTS_MODEL) if not audio_bytes: return None path = get_temp_filepath(suffix=".wav") with open(path, "wb") as f: f.write(audio_bytes) logger.info("Qwen TTS: %s (%d bytes)", path, len(audio_bytes)) return path except Exception as e: logger.warning("Qwen TTS failed: %s", e) return None def _edge_tts(text: str, voice: str = EDGE_VOICE_FEMALE, rate: str = "+0%") -> str: """ Generate audio via Edge-TTS. rate: SSML prosody rate string, e.g. "+40%" faster, "-30%" slower. """ import edge_tts path = get_temp_filepath(suffix=".mp3") snippet = text[:TTS_MAX_CHARS] async def _run(): communicate = edge_tts.Communicate(snippet, voice, rate=rate) await communicate.save(path) try: loop = asyncio.get_event_loop() if loop.is_running(): import concurrent.futures with concurrent.futures.ThreadPoolExecutor() as pool: pool.submit(asyncio.run, _run()).result(timeout=120) else: loop.run_until_complete(_run()) except RuntimeError: asyncio.run(_run()) if os.path.getsize(path) == 0: raise RuntimeError("Edge-TTS produced an empty audio file.") logger.info("Edge-TTS: %s (voice=%s rate=%s)", path, voice, rate) return path # ══════════════════════════════════════════════════════════════════════════════ # Audio post-processing # ══════════════════════════════════════════════════════════════════════════════ def _apply_rap_fx(path: str) -> str: """ Apply bass boost to a rap audio file using pydub. Low-frequency boost makes it sound punchier and more rap-like. Returns path to processed file (new file). """ try: from pydub import AudioSegment from pydub.effects import low_pass_filter audio = AudioSegment.from_file(path) # Split into bass (low) and mid/high frequencies bass = low_pass_filter(audio, 200) # frequencies below 200 Hz highs = audio - low_pass_filter(audio, 200) # everything above # Boost bass by 6 dB, keep highs as-is, combine boosted = (bass + 6).overlay(highs) out = get_temp_filepath(suffix=".mp3") boosted.export(out, format="mp3") logger.info("Rap bass boost applied → %s", out) return out except Exception as e: logger.warning("Rap FX failed (%s) — returning original audio", e) return path def _concat(paths: list[str], silence_ms: int = 300) -> str: """Concatenate audio files with silence between each segment.""" if len(paths) == 1: return paths[0] try: from pydub import AudioSegment combined = AudioSegment.empty() silence = AudioSegment.silent(duration=silence_ms) for p in paths: combined += AudioSegment.from_file(p) + silence out = get_temp_filepath(suffix=".mp3") combined.export(out, format="mp3") logger.info("Concatenated %d segments → %s", len(paths), out) return out except Exception as e: logger.warning("pydub concat failed (%s) — returning first segment", e) return paths[0] def _add_story_gaps(path: str) -> str: """ Insert longer silence gaps between sentences in story audio. Gives the warm, unhurried feel of a storyteller. """ try: from pydub import AudioSegment audio = AudioSegment.from_file(path) gap = AudioSegment.silent(duration=600) # 600 ms between sentences # Split on natural pauses (every ~5 seconds of audio) and re-join with gaps chunk_ms = 5000 chunks = [audio[i:i + chunk_ms] for i in range(0, len(audio), chunk_ms)] combined = AudioSegment.empty() for chunk in chunks: combined += chunk + gap out = get_temp_filepath(suffix=".mp3") combined.export(out, format="mp3") logger.info("Story gaps applied → %s", out) return out except Exception as e: logger.warning("Story gap insertion failed (%s) — returning original", e) return path # ══════════════════════════════════════════════════════════════════════════════ # Dialogue script parser # ══════════════════════════════════════════════════════════════════════════════ def _parse_dialogue(script: str, tag_a: str, tag_b: str) -> list[tuple[str, str]]: """Parse a HOST_X / DEBATER_X tagged script into (speaker, text) segments.""" segments: list[tuple[str, str]] = [] prefix_a = f"{tag_a}:" prefix_b = f"{tag_b}:" for line in script.splitlines(): line = line.strip() if line.startswith(prefix_a): text = line[len(prefix_a):].strip() if text: if segments and segments[-1][0] == tag_a: segments[-1] = (tag_a, segments[-1][1] + " " + text) else: segments.append((tag_a, text)) elif line.startswith(prefix_b): text = line[len(prefix_b):].strip() if text: if segments and segments[-1][0] == tag_b: segments[-1] = (tag_b, segments[-1][1] + " " + text) else: segments.append((tag_b, text)) return segments # ══════════════════════════════════════════════════════════════════════════════ # Per-mode audio generators # ══════════════════════════════════════════════════════════════════════════════ def generate_audio_podcast(script: str) -> tuple[str, str]: """ Podcast: HOST_1 = female (AriaNeural), HOST_2 = male (GuyNeural). Normal conversational rate, 300 ms silence between turns. """ segments = _parse_dialogue(script, "HOST_1", "HOST_2") if not segments: logger.warning("No HOST tags — falling back to single voice") return generate_audio(script) voice_map = { "HOST_1": (EDGE_VOICE_HOST_FEMALE, "+0%"), "HOST_2": (EDGE_VOICE_HOST_MALE, "+0%"), } paths = [] for speaker, text in segments: voice, rate = voice_map[speaker] try: paths.append(_edge_tts(text, voice=voice, rate=rate)) except Exception as e: logger.warning("Podcast segment failed %s: %s", speaker, e) if not paths: raise RuntimeError("All podcast segments failed.") return _concat(paths, silence_ms=300), "Edge-TTS (Podcast)" def generate_audio_debate(script: str) -> tuple[str, str]: """ Debate: DEBATER_A = female (JennyNeural, assertive +8%), DEBATER_B = male (DavisNeural, deliberate -5%). 400 ms silence between turns for debate feel. """ segments = _parse_dialogue(script, "DEBATER_A", "DEBATER_B") if not segments: logger.warning("No DEBATER tags — falling back to single voice") return generate_audio(script) voice_map = { "DEBATER_A": (EDGE_VOICE_DEBATER_A, DEBATE_RATE_A), "DEBATER_B": (EDGE_VOICE_DEBATER_B, DEBATE_RATE_B), } paths = [] for speaker, text in segments: voice, rate = voice_map[speaker] try: paths.append(_edge_tts(text, voice=voice, rate=rate)) except Exception as e: logger.warning("Debate segment failed %s: %s", speaker, e) if not paths: raise RuntimeError("All debate segments failed.") return _concat(paths, silence_ms=400), "Edge-TTS (Debate)" def generate_audio_rap(script: str) -> tuple[str, str]: """ Rap: male voice, fast rate (+40%), then bass boost applied via pydub. """ path = _edge_tts(script, voice=EDGE_VOICE_RAP, rate=RAP_RATE) path = _apply_rap_fx(path) return path, "Edge-TTS (Rap)" def generate_audio_story(script: str) -> tuple[str, str]: """ Story: female voice, slow rate (-30%), then sentence gaps widened via pydub. """ path = _edge_tts(script, voice=EDGE_VOICE_STORY, rate=STORY_RATE) path = _add_story_gaps(path) return path, "Edge-TTS (Story)" # ══════════════════════════════════════════════════════════════════════════════ # Unified public interface # ══════════════════════════════════════════════════════════════════════════════ def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]: """Single-voice TTS for Summary and Song modes. Tries Qwen first.""" if not text or not text.strip(): raise ValueError("No text provided for audio generation.") path = _qwen_tts(text) if path and os.path.exists(path): return path, "Qwen3-TTS" return _edge_tts(text, voice=voice_id or EDGE_VOICE_FEMALE), "Edge-TTS"