# Spaces: Running
# File size: 10,956 Bytes
# Revisions shown in the file viewer's blame gutter: 25e6afd, 54b5712, 45e9602
import asyncio
import contextlib
import logging
import os
import shutil
import subprocess
import tempfile
import time
from typing import List, Dict, Optional

import edge_tts
import pygame
# Configure root logging once at import time: INFO and above, timestamped.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
# Named logger shared by the code in this module.
logger = logging.getLogger("EdgeTTS")
class EdgeTextToSpeech:
    """Text-to-speech helper built on the ``edge_tts`` package.

    Speech can be played through the pygame mixer (when one could be
    initialised) or written to disk. When saving through Edge TTS fails, a
    local pyttsx3 engine is used instead; its WAV output is converted to MP3
    when an ``ffmpeg`` executable is found on PATH.

    Attributes are documented where they are assigned in ``__init__``.
    """

    def __init__(self):
        """Set the default voice and try to initialise the pygame mixer."""
        # Always set a default voice regardless of audio device availability
        self.current_voice = "en-US-AriaNeural"
        # True only when pygame.mixer.init() succeeded; gates local playback.
        self.mixer_available = False
        try:
            pygame.mixer.init(frequency=22050, size=-16, channels=2, buffer=512)
        except Exception as e:
            # In containers there is no audio device; this is expected. We can still save audio files.
            logger.warning(f"Pygame mixer not available (no audio device). File generation will still work. Details: {e}")
        else:
            self.mixer_available = True
            logger.info("Edge TTS engine initialized successfully (pygame mixer ready)")

    async def speak_async(self, text: str, voice: str = None):
        """Synthesise ``text`` with Edge TTS and play it if a mixer is available.

        Args:
            text: The text to speak.
            voice: Edge TTS voice short name; falls back to ``self.current_voice``.

        Returns:
            True when synthesis (and playback, if any) completed, False when
            any step raised. Errors are logged, not propagated.
        """
        temp_filename = None
        try:
            voice_to_use = voice or self.current_voice
            logger.info(f"Speaking with {voice_to_use}: {text}")
            # Create TTS communication
            communicate = edge_tts.Communicate(text, voice_to_use)
            # mkstemp guarantees a unique name; a name derived from hash(text)
            # could collide between different texts or concurrent calls.
            fd, temp_filename = tempfile.mkstemp(prefix="temp_edge_audio_", suffix=".mp3")
            os.close(fd)
            await communicate.save(temp_filename)
            # Play the audio file if a mixer is available (local/dev only)
            if self.mixer_available and pygame.mixer.get_init():
                pygame.mixer.music.load(temp_filename)
                try:
                    pygame.mixer.music.play()
                    # Wait for playback to finish without blocking the event loop
                    while pygame.mixer.music.get_busy():
                        await asyncio.sleep(0.1)
                finally:
                    # Release the file so it can be deleted; unload() only
                    # exists on pygame >= 2.0, hence the getattr guard.
                    unload = getattr(pygame.mixer.music, "unload", None)
                    if unload is not None:
                        unload()
            return True
        except Exception as e:
            logger.error(f"Error in Edge TTS: {e}")
            # In server/container contexts, skip local playback fallback.
            return False
        finally:
            # Clean up the temporary file on every path, including failures.
            if temp_filename is not None:
                with contextlib.suppress(OSError):
                    os.remove(temp_filename)

    def speak(self, text: str, voice: str = None):
        """Synchronous wrapper for :meth:`speak_async`.

        Uses ``asyncio.run``, so it must not be called from a thread that is
        already running an event loop.
        """
        return asyncio.run(self.speak_async(text, voice))

    async def save_audio_async(self, text: str, filename: str, voice: str = None) -> Optional[str]:
        """Save text-to-speech audio to a file and return the actual saved path.

        Primary: Edge TTS (mp3). Fallback: pyttsx3 -> wav, then convert to mp3
        if ffmpeg is available and the conversion succeeds.

        Args:
            text: The text to synthesise.
            filename: Requested output path. A non-``.mp3`` extension is
                replaced by ``.mp3`` (or ``.wav`` for an unconverted fallback).
            voice: Edge TTS voice short name; falls back to ``self.current_voice``.

        Returns:
            str: Path to the saved file (mp3 or wav), derived from ``filename``
                as given (it is not made absolute).
            None: on failure.
        """
        try:
            voice_to_use = voice or self.current_voice
            # Ensure target directory exists. A bare file name has an empty
            # dirname, and os.makedirs("") raises, which used to force every
            # such call into the fallback path.
            target_dir = os.path.dirname(filename)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            # Prefer mp3 when using Edge TTS
            target_path = filename
            if not target_path.lower().endswith('.mp3'):
                target_path = f"{os.path.splitext(filename)[0]}.mp3"
            # Try Edge TTS first
            communicate = edge_tts.Communicate(text, voice_to_use)
            await communicate.save(target_path)
            # Ensure file is fully written; polled in a worker thread so the
            # event loop is not blocked by time.sleep.
            await asyncio.to_thread(self._wait_for_file, target_path)
            logger.info(f"Audio saved to: {target_path}")
            return target_path
        except Exception as e:
            logger.error(f"Error saving audio with Edge TTS: {e}")

        # Fallback to local TTS: save WAV via pyttsx3
        try:
            base_path = os.path.splitext(filename)[0]
            wav_path = f"{base_path}.wav"
            await self._fallback_save_wav_async(text, wav_path)
            if not await asyncio.to_thread(self._wait_for_file, wav_path):
                logger.error(f"Fallback TTS save failed: no audio was written to {wav_path}")
                return None
            # Convert to mp3 if ffmpeg exists
            if self._has_ffmpeg():
                mp3_path = f"{base_path}.mp3"
                converted = await asyncio.to_thread(self._ffmpeg_wav_to_mp3, wav_path, mp3_path)
                if converted and await asyncio.to_thread(self._wait_for_file, mp3_path):
                    # Remove the intermediate wav only once the mp3 really exists
                    with contextlib.suppress(OSError):
                        os.remove(wav_path)
                    logger.info(f"Audio saved to: {mp3_path} (fallback via ffmpeg)")
                    return mp3_path
                # Conversion failed: keep the WAV rather than returning a
                # path to an mp3 that was never produced.
                logger.info(f"Audio saved to: {wav_path} (fallback WAV; ffmpeg conversion failed)")
                return wav_path
            # If no ffmpeg, keep WAV
            logger.info(f"Audio saved to: {wav_path} (fallback WAV; ffmpeg not found)")
            return wav_path
        except Exception as fe:
            logger.error(f"Fallback TTS save failed: {fe}")
            return None

    def save_audio(self, text: str, filename: str, voice: str = None) -> Optional[str]:
        """Synchronous wrapper for :meth:`save_audio_async`.

        Returns the actual path to the saved file or None. Uses
        ``asyncio.run``, so it must not be called from a running event loop.
        """
        return asyncio.run(self.save_audio_async(text, filename, voice))

    async def get_available_voices(self) -> List[Dict]:
        """Return the voices reported by ``edge_tts.list_voices()``.

        Returns an empty list (and logs the error) when the lookup fails.
        """
        try:
            return await edge_tts.list_voices()
        except Exception as e:
            logger.error(f"Error getting voices: {e}")
            return []

    def list_voices(self):
        """Print a short per-locale voice summary and return the full voice list.

        Synchronous; uses ``asyncio.run`` to fetch the voices.
        """
        voices = asyncio.run(self.get_available_voices())
        print("\nAvailable voices:")
        print("-" * 50)
        # Group by language
        lang_groups = {}
        for voice in voices:
            lang_groups.setdefault(voice['Locale'], []).append(voice)
        # Show popular languages first
        priority_langs = ['en-US', 'en-GB', 'en-AU', 'es-ES', 'fr-FR', 'de-DE', 'it-IT', 'ja-JP']
        for lang in priority_langs:
            if lang not in lang_groups:
                continue
            print(f"\n{lang}:")
            for voice in lang_groups[lang][:3]:  # Show first 3 voices per language
                gender = voice.get('Gender', 'Unknown')
                print(f" {voice['ShortName']} - {voice['FriendlyName']} ({gender})")
        print(f"\n... and {len(voices)} total voices available")
        return voices

    async def _fallback_save_wav_async(self, text: str, wav_path: str):
        """Save speech to a WAV file using pyttsx3 in a background thread."""
        def _save():
            # Imported lazily so pyttsx3 is only needed when the fallback runs.
            import pyttsx3
            engine = pyttsx3.init()
            # Rate/voice could be adjusted here via engine.setProperty if needed.
            engine.save_to_file(text, wav_path)
            engine.runAndWait()
        await asyncio.to_thread(_save)

    async def _fallback_play_async(self, text: str):
        """Intentional no-op placeholder for local pyttsx3 playback."""
        # In container/server context, do not attempt local playback
        return

    def _has_ffmpeg(self) -> bool:
        """Return True when an ``ffmpeg`` executable is found on PATH."""
        return shutil.which("ffmpeg") is not None

    def _ffmpeg_wav_to_mp3(self, wav_path: str, mp3_path: str) -> bool:
        """Convert ``wav_path`` to ``mp3_path`` with ffmpeg.

        Returns:
            True when ffmpeg exited successfully, False otherwise (the error
            is logged). Callers use this to decide whether the WAV may be
            deleted.
        """
        cmd = [
            "ffmpeg", "-y",
            "-i", wav_path,
            "-vn",
            "-ar", "22050",
            "-ac", "2",
            "-b:a", "128k",
            mp3_path,
        ]
        try:
            subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        except (OSError, subprocess.SubprocessError) as e:
            logger.error(f"ffmpeg conversion failed: {e}")
            return False
        return True

    def _wait_for_file(self, path: str, timeout: float = 3.0) -> bool:
        """Wait until a file exists and has non-zero size or timeout reached.

        The file is checked at least once, even when ``timeout`` is zero.

        Returns:
            True as soon as ``path`` exists with a non-zero size, False if the
            timeout elapsed first.
        """
        # monotonic() is immune to wall-clock adjustments during the wait.
        deadline = time.monotonic() + timeout
        while True:
            try:
                if os.path.getsize(path) > 0:
                    return True
            except (OSError, ValueError):
                # Missing or not-yet-readable file: keep polling.
                pass
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return False
            time.sleep(min(0.05, remaining))

    def run_interactive_mode(self):
        """Run interactive Edge TTS mode.

        Reads commands from stdin until 'quit', Ctrl-C, or end of input.
        """
        print("\n=== Microsoft Edge Text-to-Speech ===")
        print("Commands:")
        print(" Type text to speak it")
        print(" 'voice <name>' - Change voice (e.g., 'voice en-US-JennyNeural')")
        print(" 'voices' - List available voices")
        print(" 'save <filename>' - Save last text to file")
        print(" 'current' - Show current voice")
        print(" 'quit' - Exit program")
        print("=" * 38)
        last_text = ""
        while True:
            try:
                user_input = input(f"\n[{self.current_voice}] Enter text: ").strip()
                if not user_input:
                    continue
                command = user_input.lower()
                if command == 'quit':
                    print("Goodbye!")
                    break
                elif command == 'voices':
                    self.list_voices()
                elif command == 'current':
                    print(f"Current voice: {self.current_voice}")
                elif command.startswith('voice '):
                    new_voice = user_input[6:].strip()
                    if new_voice:
                        self.current_voice = new_voice
                        print(f"Voice changed to: {new_voice}")
                        # Test the new voice
                        self.speak("Voice changed successfully", new_voice)
                    else:
                        print("Please specify a voice name")
                elif command.startswith('save '):
                    filename = user_input[5:].strip()
                    if last_text and filename:
                        self.save_audio(last_text, filename)
                    else:
                        print("No text to save or filename not provided")
                else:
                    # Speak the entered text
                    last_text = user_input
                    self.speak(user_input)
            except (KeyboardInterrupt, EOFError):
                # EOFError means stdin was closed (e.g. no TTY in a container);
                # without this, input() would fail forever in a tight loop.
                print("\nGoodbye!")
                break
            except Exception as e:
                logger.error(f"Error in interactive mode: {e}")
if __name__ == "__main__":
    # Remind the user which packages must be installed, followed by a blank line.
    print(
        "Make sure to install required packages:",
        "pip install edge-tts pygame",
        "",
        sep="\n",
    )
    # Build the engine, run a one-sentence smoke test, then hand over to the REPL.
    speaker = EdgeTextToSpeech()
    speaker.speak("Hello! Microsoft Edge text to speech is working perfectly.")
    speaker.run_interactive_mode()