"""
VoiceVerse AI – TTS Module.
Primary: Qwen3-TTS via HF Inference API
Fallback: Edge-TTS (CPU, no key needed)
Voice + audio style per mode:
Summary → neutral female voice, normal rate
Podcast → HOST_1 ("ALEX") female (AriaNeural) / HOST_2 ("SAM") male (GuyNeural)
Rap → male voice, faster rate (+40%), bass boost via pydub
Song → female voice, normal rate
Debate → DEBATER_A ("MAYA") female (AriaNeural, +8%) / DEBATER_B ("RYAN") male (GuyNeural, -5%)
Story → female voice, slow rate (-30%), long silence gaps between sentences
"""
import os
import re
import asyncio
from utils import logger, get_temp_filepath
QWEN_TTS_MODEL = "Qwen/Qwen3-TTS"
TTS_MAX_CHARS = 3000
# ── Voice assignments ─────────────────────────────────────────────────────────
# Summary / Song / Story β single female voice
EDGE_VOICE_FEMALE = "en-US-AriaNeural"
# Podcast
EDGE_VOICE_HOST_FEMALE = "en-US-AriaNeural" # HOST_1 β female
EDGE_VOICE_HOST_MALE = "en-US-GuyNeural" # HOST_2 β male
# Rap β male voice reads the rap
EDGE_VOICE_RAP = "en-US-GuyNeural"
RAP_RATE = "+40%" # fast delivery
# Debate β use same reliable voices as podcast, just different rates
EDGE_VOICE_DEBATER_A = "en-US-AriaNeural" # female, pro β assertive
EDGE_VOICE_DEBATER_B = "en-US-GuyNeural" # male, con β skeptical
DEBATE_RATE_A = "+8%" # slightly faster
DEBATE_RATE_B = "-5%" # slightly slower, deliberate
# Story β slow, warm delivery
EDGE_VOICE_STORY = "en-US-AriaNeural"
STORY_RATE = "-30%" # noticeably slower
# ──────────────────────────────────────────────────────────────────────────────
# Low-level TTS helpers
# ──────────────────────────────────────────────────────────────────────────────
def _qwen_tts(text: str) -> str | None:
token = os.environ.get("HF_TOKEN")
if not token:
return None
try:
from huggingface_hub import InferenceClient
client = InferenceClient(token=token)
audio_bytes = client.text_to_speech(text=text[:TTS_MAX_CHARS], model=QWEN_TTS_MODEL)
if not audio_bytes:
return None
path = get_temp_filepath(suffix=".wav")
with open(path, "wb") as f:
f.write(audio_bytes)
logger.info("Qwen TTS: %s (%d bytes)", path, len(audio_bytes))
return path
except Exception as e:
logger.warning("Qwen TTS failed: %s", e)
return None
def _edge_tts(text: str, voice: str = EDGE_VOICE_FEMALE, rate: str = "+0%", pitch: str = "+0Hz") -> str:
"""
Generate audio via Edge-TTS.
rate: SSML prosody rate string, e.g. "+40%" faster, "-30%" slower.
pitch: SSML prosody pitch string, e.g. "+50Hz" higher, "-50Hz" lower.
"""
import edge_tts
path = get_temp_filepath(suffix=".mp3")
snippet = text[:TTS_MAX_CHARS]
async def _run():
communicate = edge_tts.Communicate(snippet, voice, rate=rate, pitch=pitch)
await communicate.save(path)
try:
loop = asyncio.get_event_loop()
if loop.is_running():
# A loop is already running in this thread (e.g. inside an async web app),
# so asyncio.run() cannot be used here; run the coroutine in a worker thread instead.
import concurrent.futures
with concurrent.futures.ThreadPoolExecutor() as pool:
pool.submit(asyncio.run, _run()).result(timeout=120)
else:
loop.run_until_complete(_run())
except RuntimeError:
# No event loop exists in this thread; create one just for this call.
asyncio.run(_run())
if os.path.getsize(path) == 0:
raise RuntimeError("Edge-TTS produced an empty audio file.")
logger.info("Edge-TTS: %s (voice=%s rate=%s)", path, voice, rate)
return path
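# Illustrative call (the values here are made up, not one of the app's preset modes):
#   _edge_tts("Welcome back to the show.", voice=EDGE_VOICE_HOST_MALE, rate="+10%", pitch="-20Hz")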
# ──────────────────────────────────────────────────────────────────────────────
# Audio post-processing
# ──────────────────────────────────────────────────────────────────────────────
def _apply_rap_fx(path: str) -> str:
"""
Apply bass boost to a rap audio file using pydub.
Low-frequency boost makes it sound punchier and more rap-like.
Returns path to processed file (new file).
"""
try:
from pydub import AudioSegment
from pydub.effects import low_pass_filter, high_pass_filter
audio = AudioSegment.from_file(path)
# Split into bass (low) and mid/high frequencies
bass = low_pass_filter(audio, 200) # frequencies below 200 Hz
highs = high_pass_filter(audio, 200) # everything above 200 Hz
# Boost bass by 10 dB for a punchier feel, keep highs as-is, combine
boosted = (bass + 10).overlay(highs)
out = get_temp_filepath(suffix=".mp3")
boosted.export(out, format="mp3")
logger.info("Rap bass boost applied β %s", out)
return out
except Exception as e:
logger.warning("Rap FX failed (%s) β returning original audio", e)
return path
def _concat(paths: list[str], silence_ms: int = 300) -> str:
"""Concatenate audio files with silence between each segment."""
if len(paths) == 1:
return paths[0]
try:
from pydub import AudioSegment
combined = AudioSegment.empty()
silence = AudioSegment.silent(duration=silence_ms)
for p in paths:
combined += AudioSegment.from_file(p) + silence
out = get_temp_filepath(suffix=".mp3")
combined.export(out, format="mp3")
logger.info("Concatenated %d segments β %s", len(paths), out)
return out
except Exception as e:
logger.warning("pydub concat failed (%s) β trying ffmpeg fallback", e)
return _concat_ffmpeg(paths)
def _concat_ffmpeg(paths: list[str]) -> str:
"""Fallback: concatenate audio files using ffmpeg directly via subprocess."""
import subprocess
out = get_temp_filepath(suffix=".mp3")
# Write a concat list file for ffmpeg
list_path = get_temp_filepath(suffix=".txt")
with open(list_path, "w") as f:
for p in paths:
f.write(f"file '{p}'\n")
try:
subprocess.run(
["ffmpeg", "-y", "-f", "concat", "-safe", "0",
"-i", list_path, "-c", "copy", out],
check=True, capture_output=True, timeout=120,
)
logger.info("ffmpeg concat: %d segments β %s", len(paths), out)
return out
except Exception as e2:
logger.warning("ffmpeg concat also failed (%s) β returning first segment", e2)
return paths[0]
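# The list file written above uses ffmpeg's concat-demuxer format, e.g. (illustrative paths):
#   file '/tmp/voiceverse_seg1.mp3'
#   file '/tmp/voiceverse_seg2.mp3'
# "-safe 0" is needed because the listed entries are absolute temp paths.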
def _add_story_gaps(path: str) -> str:
"""
Insert longer silence gaps between sentences in story audio.
Gives the warm, unhurried feel of a storyteller.
"""
try:
from pydub import AudioSegment
audio = AudioSegment.from_file(path)
gap = AudioSegment.silent(duration=600) # 600 ms between sentences
# Split into fixed ~5-second chunks (a rough stand-in for sentence boundaries) and re-join with gaps
chunk_ms = 5000
chunks = [audio[i:i + chunk_ms] for i in range(0, len(audio), chunk_ms)]
combined = AudioSegment.empty()
for chunk in chunks:
combined += chunk + gap
out = get_temp_filepath(suffix=".mp3")
combined.export(out, format="mp3")
logger.info("Story gaps applied β %s", out)
return out
except Exception as e:
logger.warning("Story gap insertion failed (%s) β returning original", e)
return path
# ──────────────────────────────────────────────────────────────────────────────
# Dialogue script parser
# ──────────────────────────────────────────────────────────────────────────────
def _parse_dialogue(script: str, tag_a: str, tag_b: str) -> list[tuple[str, str]]:
"""Parse a HOST_X / DEBATER_X tagged script into (speaker, text) segments."""
segments: list[tuple[str, str]] = []
prefix_a = f"{tag_a}:"
prefix_b = f"{tag_b}:"
for line in script.splitlines():
line = line.strip()
if line.startswith(prefix_a):
text = line[len(prefix_a):].strip()
if text:
if segments and segments[-1][0] == tag_a:
segments[-1] = (tag_a, segments[-1][1] + " " + text)
else:
segments.append((tag_a, text))
elif line.startswith(prefix_b):
text = line[len(prefix_b):].strip()
if text:
if segments and segments[-1][0] == tag_b:
segments[-1] = (tag_b, segments[-1][1] + " " + text)
else:
segments.append((tag_b, text))
return segments
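# Example of the tag format _parse_dialogue expects (illustrative script, not generated output):
#   ALEX: Welcome to the show.
#   SAM: Thanks, glad to be here.
#   SAM: Let's get started.
# _parse_dialogue(script, "ALEX", "SAM") would return
#   [("ALEX", "Welcome to the show."), ("SAM", "Thanks, glad to be here. Let's get started.")]
# because consecutive lines from the same speaker are merged into one segment.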
# ──────────────────────────────────────────────────────────────────────────────
# Per-mode audio generators
# ──────────────────────────────────────────────────────────────────────────────
def generate_audio_podcast(script: str) -> tuple[str, str]:
"""
Podcast: ALEX = female (AriaNeural), SAM = male (GuyNeural).
Normal conversational rate, 300 ms silence between turns.
"""
segments = _parse_dialogue(script, "ALEX", "SAM")
if not segments:
logger.warning("No ALEX/SAM tags β falling back to single voice")
return generate_audio(script)
voice_map = {
"ALEX": (EDGE_VOICE_HOST_FEMALE, "+0%"),
"SAM": (EDGE_VOICE_HOST_MALE, "+0%"),
}
paths = []
for speaker, text in segments:
voice, rate = voice_map[speaker]
try:
paths.append(_edge_tts(text, voice=voice, rate=rate))
except Exception as e:
logger.warning("Podcast segment failed %s: %s", speaker, e)
if not paths:
raise RuntimeError("All podcast segments failed.")
return _concat(paths, silence_ms=300), "Edge-TTS (Podcast)"
def generate_audio_debate(script: str) -> tuple[str, str]:
"""
Debate: MAYA = female (AriaNeural, assertive +8%),
RYAN = male (GuyNeural, deliberate -5%).
400 ms silence between turns for debate feel.
"""
segments = _parse_dialogue(script, "MAYA", "RYAN")
if not segments:
logger.warning("No MAYA/RYAN tags β falling back to single voice")
return generate_audio(script)
voice_map = {
"MAYA": (EDGE_VOICE_DEBATER_A, DEBATE_RATE_A),
"RYAN": (EDGE_VOICE_DEBATER_B, DEBATE_RATE_B),
}
paths = []
for speaker, text in segments:
voice, rate = voice_map[speaker]
try:
paths.append(_edge_tts(text, voice=voice, rate=rate))
except Exception as e:
logger.warning("Debate segment failed %s: %s", speaker, e)
if not paths:
raise RuntimeError("All debate segments failed.")
return _concat(paths, silence_ms=400), "Edge-TTS (Debate)"
def generate_audio_rap(script: str) -> tuple[str, str]:
"""
Rap: TTS each line separately with short pauses for rhythm,
then concatenate and apply bass boost for a punchier sound.
"""
# Split into non-empty lines for line-by-line TTS
lines = [ln.strip() for ln in script.splitlines() if ln.strip()]
if len(lines) <= 1:
# Very short rap β just TTS the whole thing
path = _edge_tts(script, voice=EDGE_VOICE_RAP, rate=RAP_RATE)
path = _apply_rap_fx(path)
return path, "Edge-TTS (Rap)"
# TTS each line separately
paths = []
for line in lines:
try:
paths.append(_edge_tts(line, voice=EDGE_VOICE_RAP, rate=RAP_RATE))
except Exception as e:
logger.warning("Rap line TTS failed: %s", e)
if not paths:
raise RuntimeError("All rap line TTS failed.")
# Concatenate with short pauses (200ms between lines for rhythmic feel)
combined = _concat(paths, silence_ms=200)
# Apply bass boost
combined = _apply_rap_fx(combined)
return combined, "Edge-TTS (Rap)"
def generate_audio_story(script: str) -> tuple[str, str]:
"""
Story: female voice, slow rate (-30%), then sentence gaps widened via pydub.
"""
path = _edge_tts(script, voice=EDGE_VOICE_STORY, rate=STORY_RATE)
path = _add_story_gaps(path)
return path, "Edge-TTS (Story)"
# ──────────────────────────────────────────────────────────────────────────────
# Unified public interface
# ──────────────────────────────────────────────────────────────────────────────
def apply_pitch_shift(path: str, pitch_semitones: float) -> str:
"""
Shift pitch of an audio file by the given number of semitones using pydub.
Positive = higher pitch, negative = lower pitch. Note: this simple resample-based
method also speeds up or slows down the audio by the same factor.
Returns path to new file, or original if processing fails.
"""
if abs(pitch_semitones) < 0.1:
return path # no change needed
try:
from pydub import AudioSegment
audio = AudioSegment.from_file(path)
# Reinterpret the samples at a scaled frame rate to shift pitch (speed shifts with it)
factor = 2 ** (pitch_semitones / 12.0)
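# e.g. +12 semitones → factor 2.0 (one octave up), -12 semitones → factor 0.5 (one octave down)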
new_sample_rate = int(audio.frame_rate * factor)
shifted = audio._spawn(audio.raw_data, overrides={"frame_rate": new_sample_rate})
# Resample back to the original frame rate so players handle it normally (the speed change remains)
shifted = shifted.set_frame_rate(audio.frame_rate)
out = get_temp_filepath(suffix=".mp3")
shifted.export(out, format="mp3")
logger.info("Pitch shifted by %.1f semitones β %s", pitch_semitones, out)
return out
except Exception as e:
logger.warning("Pitch shift failed (%s) β returning original", e)
return path
def generate_audio(text: str, voice_id: str | None = None) -> tuple[str, str]:
"""Single-voice TTS for Summary and Song modes. Tries Qwen first."""
if not text or not text.strip():
raise ValueError("No text provided for audio generation.")
path = _qwen_tts(text)
if path and os.path.exists(path):
return path, "Qwen3-TTS"
return _edge_tts(text, voice=voice_id or EDGE_VOICE_FEMALE), "Edge-TTS"
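# Minimal smoke test, a sketch only: assumes edge-tts is installed and the network is reachable;
# the sample text and printed output are illustrative, not part of the app flow.
if __name__ == "__main__":
    demo_path, demo_engine = generate_audio("Hello from VoiceVerse.")
    print(f"{demo_engine} wrote {demo_path}")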