Spaces:

IAMCB
/

tts

Sleeping

App Files Files Community

tts / tts_demo.py

IAMCB

Add application file

6890329 about 1 year ago

raw

history blame contribute delete

6.2 kB

	import torch
	from typing import Optional, Tuple, List
	from models import build_model, generate_speech, list_available_voices
	from tqdm.auto import tqdm
	import soundfile as sf
	from pathlib import Path
	import numpy as np
	import time
	import os

	# Constants
	SAMPLE_RATE = 24000
	DEFAULT_MODEL_PATH = 'kokoro-v1_0.pth'
	DEFAULT_OUTPUT_FILE = 'output.wav'
	DEFAULT_LANGUAGE = 'a' # 'a' for American English, 'b' for British English
	DEFAULT_TEXT = "Hello, welcome to this text-to-speech test."

	# Configure tqdm for better Windows console support
	tqdm.monitor_interval = 0

	def print_menu():
	"""Print the main menu options."""
	print("\n=== Kokoro TTS Menu ===")
	print("1. List available voices")
	print("2. Generate speech")
	print("3. Exit")
	return input("Select an option (1-3): ").strip()

	def select_voice(voices: List[str]) -> str:
	"""Interactive voice selection."""
	print("\nAvailable voices:")
	for i, voice in enumerate(voices, 1):
	print(f"{i}. {voice}")

	while True:
	try:
	choice = input("\nSelect a voice number (or press Enter for default 'af_bella'): ").strip()
	if not choice:
	return "af_bella"
	choice = int(choice)
	if 1 <= choice <= len(voices):
	return voices[choice - 1]
	print("Invalid choice. Please try again.")
	except ValueError:
	print("Please enter a valid number.")

	def get_text_input() -> str:
	"""Get text input from user."""
	print("\nEnter the text you want to convert to speech")
	print("(or press Enter for default text)")
	text = input("> ").strip()
	return text if text else DEFAULT_TEXT

	def get_speed() -> float:
	"""Get speech speed from user."""
	while True:
	try:
	speed = input("\nEnter speech speed (0.5-2.0, default 1.0): ").strip()
	if not speed:
	return 1.0
	speed = float(speed)
	if 0.5 <= speed <= 2.0:
	return speed
	print("Speed must be between 0.5 and 2.0")
	except ValueError:
	print("Please enter a valid number.")

	def save_audio_with_retry(audio_data: np.ndarray, sample_rate: int, output_path: Path, max_retries: int = 3, retry_delay: float = 1.0) -> bool:
	"""
	Attempt to save audio data to file with retry logic.
	Returns True if successful, False otherwise.
	"""
	for attempt in range(max_retries):
	try:
	sf.write(output_path, audio_data, sample_rate)
	return True
	except Exception as e:
	if attempt < max_retries - 1:
	print(f"\nFailed to save audio (attempt {attempt + 1}/{max_retries})")
	print("The output file might be in use by another program (e.g., media player).")
	print(f"Please close any programs that might be using '{output_path}'")
	print(f"Retrying in {retry_delay} seconds...")
	time.sleep(retry_delay)
	else:
	print(f"\nError: Could not save audio after {max_retries} attempts.")
	print(f"Please ensure '{output_path}' is not open in any other program and try again.")
	return False
	return False

	def main() -> None:
	try:
	# Set up device
	device = 'cuda' if torch.cuda.is_available() else 'cpu'
	print(f"Using device: {device}")

	# Build model
	print("\nInitializing model...")
	with tqdm(total=1, desc="Building model") as pbar:
	model = build_model(DEFAULT_MODEL_PATH, device)
	pbar.update(1)

	while True:
	choice = print_menu()

	if choice == "1":
	# List voices
	voices = list_available_voices()
	print("\nAvailable voices:")
	for voice in voices:
	print(f"- {voice}")

	elif choice == "2":
	# Generate speech
	voices = list_available_voices()
	if not voices:
	print("No voices found! Please check the voices directory.")
	continue

	# Get user inputs
	voice = select_voice(voices)
	text = get_text_input()
	speed = get_speed()

	print(f"\nGenerating speech for: '{text}'")
	print(f"Using voice: {voice}")
	print(f"Speed: {speed}x")

	# Generate speech
	all_audio = []
	generator = model(text, voice=f"voices/{voice}.pt", speed=speed, split_pattern=r'\n+')

	with tqdm(desc="Generating speech") as pbar:
	for gs, ps, audio in generator:
	if audio is not None:
	if isinstance(audio, np.ndarray):
	audio = torch.from_numpy(audio).float()
	all_audio.append(audio)
	print(f"\nGenerated segment: {gs}")
	print(f"Phonemes: {ps}")
	pbar.update(1)

	# Save audio
	if all_audio:
	final_audio = torch.cat(all_audio, dim=0)
	output_path = Path(DEFAULT_OUTPUT_FILE)
	if save_audio_with_retry(final_audio.numpy(), SAMPLE_RATE, output_path):
	print(f"\nAudio saved to {output_path.absolute()}")
	else:
	print("Error: Failed to generate audio")

	elif choice == "3":
	print("\nGoodbye!")
	break

	else:
	print("\nInvalid choice. Please try again.")

	except Exception as e:
	print(f"Error in main: {e}")
	import traceback
	traceback.print_exc()
	finally:
	# Cleanup
	if 'model' in locals():
	del model
	torch.cuda.empty_cache()

	if __name__ == "__main__":
	main()