diff --git a/README.md b/README.md index 610b7416b30892fe81f2f368d02c95c272bfd67e..0ed03777d79a98bf0a42151b02f1428352fc46fd 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ colorFrom: purple colorTo: blue sdk: gradio sdk_version: 5.12.0 -python_version: "3.12" +python_version: "3.10" app_file: app.py pinned: false license: mit @@ -25,17 +25,17 @@ Outil web de **clonage vocal zero-shot** basé sur **Seed-VC** (Diffusion Transf ## Fonctionnalités -1. **Référence vocale** : Uploadez un court extrait de votre voix (3-30 sec) — pas d'entraînement nécessaire -2. **Séparation audio** : Séparation automatique voix/instruments via Demucs (Meta AI) +1. **Référence vocale** : Uploadez un court extrait de votre voix (3-30 sec) — pas d'entraînement nécessaire +2. **Séparation audio** : Séparation automatique voix/instruments via Demucs (Meta AI) 3. **Conversion vocale** : Remplacement de la voix originale par la vôtre (Seed-VC zero-shot) -4. **Mixage final** : Remixage automatique voix convertie + instruments originaux +4. **Mixage final** : Remixage automatique voix convertie + instruments originaux 5. **Export** : Téléchargement du résultat en WAV 44.1kHz 16-bit ## Comment utiliser -### Étape 1 : Enregistrer votre référence vocale 1. Onglet **"Ma voix"** -2. Uploadez un extrait de votre voix (WAV ou MP3, 3 à 30 secondes) +2. Uploadez un extrait de votre voix (WAV ou MP3, 3 à 30 secondes) 3. Donnez un nom (ex: `ma_voix`) 4. 
Cliquez **"Sauvegarder"** diff --git a/pipeline/rvc_training.py b/pipeline/rvc_training.py index c482fa0ad0c8aaaca9fba72b996157d657cba7e8..241055e104e34f82791e998ec963553d17518a04 100644 --- a/pipeline/rvc_training.py +++ b/pipeline/rvc_training.py @@ -17,25 +17,24 @@ except ImportError: return fn return decorator -# Configuration for Ultimate-RVC paths -# We set these environment variables BEFORE importing ultimate_rvc to ensure it uses our paths +# Configuration for paths os.environ["URVC_MODELS_DIR"] = os.path.abspath("rvc_models") os.environ["URVC_AUDIO_DIR"] = os.path.abspath("rvc_audio") os.environ["URVC_TEMP_DIR"] = os.path.abspath("rvc_temp") -# Now we can import the core functions from ultimate_rvc +# Import the core functions from our LOCAL rvc_logic try: - from ultimate_rvc.core.train import prepare, extract, train - from ultimate_rvc.typing_extra import TrainingSampleRate, F0Method, EmbedderModel - ULTIMATE_RVC_AVAILABLE = True + from rvc_logic.core_train import prepare, extract, train + from rvc_logic.typing_extra import TrainingSampleRate, F0Method, EmbedderModel + RVC_LOGIC_AVAILABLE = True except ImportError as e: - logger.error(f"Failed to import ultimate_rvc: {e}") - ULTIMATE_RVC_AVAILABLE = False + logger.error(f"Failed to import rvc_logic: {e}") + RVC_LOGIC_AVAILABLE = False -@spaces.GPU(duration=1000) # Training takes time, let's request more +@spaces.GPU(duration=1000) def train_rvc_model(audio_path, model_name, epochs=100, progress=None): - if not ULTIMATE_RVC_AVAILABLE: - return "Error: ultimate-rvc library not installed correctly.", None + if not RVC_LOGIC_AVAILABLE: + return "Error: rvc_logic module not found in the project.", None if not audio_path: return "Error: Please upload an audio file.", None @@ -71,13 +70,12 @@ def train_rvc_model(audio_path, model_name, epochs=100, progress=None): ) # 4. 
Train - p(0.6, f"Step 4/4: Training for {epochs} epochs (this may take several minutes)...") - # ultimate-rvc's run_training returns [pth_path, index_path] + p(0.6, f"Step 4/4: Training for {epochs} epochs...") result_paths = train.run_training( model_name=model_name, num_epochs=epochs, - batch_size=4, # Safe for ZeroGPU - save_interval=epochs # Only save at the end + batch_size=4, + save_interval=epochs ) if not result_paths or len(result_paths) < 2: @@ -107,7 +105,7 @@ def train_rvc_model(audio_path, model_name, epochs=100, progress=None): ) except Exception as e: logger.error(f"Upload to dataset failed: {e}") - return f"Model trained but failed to upload to HF: {e}. Files are at {pth_path}", pth_path + return f"Model trained but failed to upload to HF: {e}", pth_path return f"Successfully trained and uploaded to dataset {repo_id}!", pth_path diff --git a/requirements.txt b/requirements.txt index 6c835b85de43c8da0f0300f932c298cc838e306d..459d26deaa21786cfd14118b6d7ffb9616893751 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,9 +33,9 @@ bigvgan descript-audio-codec vocos -# RVC Training via Ultimate-RVC -ultimate-rvc==0.6.0 +# RVC Training dependencies (No fairseq, using transformers/faiss/crepe) torchcrepe torchfcpe +faiss-cpu tensorboardX wget diff --git a/rvc_logic/common.py b/rvc_logic/common.py new file mode 100644 index 0000000000000000000000000000000000000000..45b106e10f05bff728bbcaf98d0612285689fad0 --- /dev/null +++ b/rvc_logic/common.py @@ -0,0 +1,37 @@ +"""Common variables used in the Ultimate RVC project.""" + +from __future__ import annotations + +import os +import sys +from pathlib import Path + +BASE_DIR = Path.cwd() +VENV_DIR = Path(sys.prefix) +MODELS_DIR = Path(os.getenv("URVC_MODELS_DIR") or BASE_DIR / "models") +RVC_MODELS_DIR = MODELS_DIR / "rvc" +VOICE_MODELS_DIR = Path( + os.getenv("URVC_VOICE_MODELS_DIR") or RVC_MODELS_DIR / "voice_models", +) +EMBEDDER_MODELS_DIR = RVC_MODELS_DIR / "embedders" +CUSTOM_EMBEDDER_MODELS_DIR = 
def get_gpu_info() -> list[tuple[str, int]]:
    """
    Describe every CUDA device that torch reports on this machine.

    Returns
    -------
    list[tuple[str, int]]
        One ``(label, index)`` pair per device, in device-index order.
        The label is the device name followed by its total memory in
        GB. The list is empty when no CUDA device is reported.

    """
    # NOTE torch is imported inside the function because lazy importing
    # does not work with it
    import torch  # noqa: PLC0415

    device_total = torch.cuda.device_count()
    if not torch.cuda.is_available() and device_total == 0:
        return []

    def _label(index: int) -> str:
        name = torch.cuda.get_device_name(index)
        total_bytes = torch.cuda.get_device_properties(index).total_memory
        # Adding 0.4 before truncating rounds up any size whose
        # fractional part is at least 0.6.
        size_gb = int(total_bytes / 1024 / 1024 / 1024 + 0.4)
        return f"{name} ({size_gb} GB)"

    return [(_label(index), index) for index in range(device_total)]
def extract_features(
    model_name: str,
    f0_method: F0Method = F0Method.RMVPE,
    embedder_model: EmbedderModel = EmbedderModel.CONTENTVEC,
    custom_embedder_model: str | None = None,
    include_mutes: int = 2,
    cpu_cores: int = cpu_count(),
    hardware_acceleration: DeviceType = DeviceType.AUTOMATIC,
    gpu_ids: set[int] | None = None,
) -> None:
    """
    Extract features from the preprocessed dataset associated with a
    voice model to be trained.

    Pitch features and audio embeddings are extracted in that order,
    after which the training config and the training file list are
    generated for the model.

    Parameters
    ----------
    model_name : str
        The name of the voice model to be trained.
    f0_method : F0Method, default=F0Method.RMVPE
        The method to use for extracting pitch features.
    embedder_model : EmbedderModel, default=EmbedderModel.CONTENTVEC
        The model to use for extracting audio embeddings.
    custom_embedder_model : str, optional
        The name of the custom embedder model to use for extracting
        audio embeddings. Only consulted when `embedder_model` is
        `EmbedderModel.CUSTOM`.
    include_mutes : int, default=2
        The number of mute audio files to include in the generated
        training file list. Adding silent files enables the voice model
        to handle pure silence in inferred audio files. If the
        preprocessed audio dataset already contains segments of pure
        silence, set this to 0.
    cpu_cores : int, default=cpu_count()
        The number of CPU cores to use for feature extraction.
    hardware_acceleration : DeviceType, default=DeviceType.AUTOMATIC
        The type of hardware acceleration to use for feature extraction.
        `AUTOMATIC` will select the first available GPU and fall back to
        CPU if no GPUs are available.
    gpu_ids : set[int], optional
        Set of ids of the GPUs to use for feature extraction when `GPU`
        is selected for hardware acceleration.

    Raises
    ------
    ModelAsssociatedEntityNotFoundError
        If no preprocessed dataset audio files are associated with the
        voice model identified by the provided name.

    """
    model_path = validate_model(model_name, Entity.TRAINING_MODEL)

    # Feature extraction needs a non-empty directory of 16 kHz slices;
    # its absence is reported as a missing preprocessing step.
    sliced_audios16k_path = model_path / "sliced_audios_16k"
    if not sliced_audios16k_path.is_dir() or not any(sliced_audios16k_path.iterdir()):
        raise ModelAsssociatedEntityNotFoundError(
            Entity.PREPROCESSED_AUDIO_DATASET_FILES,
            model_name,
            Step.DATASET_PREPROCESSING,
        )

    custom_embedder_model_path, combined_file_hash = None, None
    chosen_embedder_model, embedder_model_id = [embedder_model] * 2
    if embedder_model == EmbedderModel.CUSTOM:
        custom_embedder_model_path = validate_model(
            custom_embedder_model,
            Entity.CUSTOM_EMBEDDER_MODEL,
        )
        json_file = custom_embedder_model_path / "config.json"
        bin_path = custom_embedder_model_path / "pytorch_model.bin"

        # A custom embedder is identified by the combined hash of its
        # config and weights files rather than by its name.
        combined_file_hash = get_combined_file_hash([json_file, bin_path])
        chosen_embedder_model = str(custom_embedder_model_path)
        embedder_model_id = f"custom_{combined_file_hash}"

    device_type, device_ids = validate_devices(hardware_acceleration, gpu_ids)

    # One "<type>:<id>" entry per selected device, or the bare device
    # type when no device ids were returned.
    devices = (
        [f"{device_type}:{device_id}" for device_id in device_ids]
        if device_ids
        else [device_type]
    )
    # NOTE The lazy_import function does not work with the package below
    # so we import it here manually
    from rvc_logic.rvc.train.extract import extract  # noqa: PLC0415

    file_infos = extract.initialize_extraction(
        str(model_path),
        f0_method,
        embedder_model_id,
    )
    extract.update_model_info(
        str(model_path),
        chosen_embedder_model,
        combined_file_hash,
    )
    display_progress("[~] Extracting pitch features...")
    extract.run_pitch_extraction(file_infos, devices, f0_method, cpu_cores)
    display_progress("[~] Extracting audio embeddings...")
    extract.run_embedding_extraction(
        file_infos,
        devices,
        embedder_model,
        (
            str(custom_embedder_model_path)
            if custom_embedder_model_path is not None
            else None
        ),
        cpu_cores,
    )
    # NOTE The lazy_import function does not work with the package below
    # so we import it here manually
    from rvc_logic.rvc.train.extract import preparing_files  # noqa: PLC0415

    preparing_files.generate_config(str(model_path))
    preparing_files.generate_filelist(
        str(model_path),
        include_mutes,
        f0_method,
        embedder_model_id,
    )
def preprocess_dataset(
    model_name: str,
    dataset: StrPath,
    sample_rate: TrainingSampleRate = TrainingSampleRate.HZ_40K,
    normalization_mode: AudioNormalizationMode = AudioNormalizationMode.POST,
    filter_audio: bool = True,
    clean_audio: bool = False,
    clean_strength: float = 0.7,
    split_method: AudioSplitMethod = AudioSplitMethod.AUTOMATIC,
    chunk_len: float = 3.0,
    overlap_len: float = 0.3,
    cpu_cores: int = cpu_count(),
) -> None:
    """
    Preprocess a dataset of audio files for training a voice model.

    Parameters
    ----------
    model_name : str
        The name of the voice model to train. If no voice model
        with the provided name exists for training, a new voice model
        for training will be created with the provided name. If a voice
        model with the provided name already exists for training, then
        its currently associated dataset will be replaced with the
        provided dataset. Surrounding whitespace is ignored.
    dataset : StrPath
        The path to the dataset to preprocess.
    sample_rate : TrainingSampleRate, default=TrainingSampleRate.HZ_40K
        The target sample rate for the audio files in the provided
        dataset.
    normalization_mode : AudioNormalizationMode, default=POST
        The audio normalization method to use for the audio files in
        the provided dataset.
    filter_audio : bool, default=True
        Whether to remove low-frequency sounds from the audio files in
        the provided dataset by applying a high-pass butterworth filter.
    clean_audio : bool, default=False
        Whether to clean the audio files in the provided dataset using
        noise reduction algorithms.
    clean_strength : float, default=0.7
        The intensity of the cleaning to apply to the audio files in the
        provided dataset.
    split_method : AudioSplitMethod, default=AudioSplitMethod.AUTOMATIC
        The method to use for splitting the audio files in the provided
        dataset. Use the `Skip` method to skip splitting if the audio
        files are already split. Use the `Simple` method if excessive
        silence has already been removed from the audio files.
        Use the `Automatic` method for automatic silence detection and
        splitting around it.
    chunk_len : float, default=3.0
        Length of split audio chunks when using the `Simple` split
        method.
    overlap_len : float, default=0.3
        Length of overlap between split audio chunks when using the
        `Simple` split method.
    cpu_cores : int, default=cpu_count()
        The number of CPU cores to use for preprocessing.

    Raises
    ------
    NotProvidedError
        If no model name is provided, including a name that consists
        only of whitespace.

    """
    # Strip before the emptiness check: a whitespace-only name would
    # otherwise pass the check and then resolve to TRAINING_MODELS_DIR
    # itself once stripped.
    stripped_name = model_name.strip() if model_name else ""
    if not stripped_name:
        raise NotProvidedError(Entity.MODEL_NAME)

    dataset_path = validate_audio_dir_exists(dataset, Entity.DATASET)

    model_path = TRAINING_MODELS_DIR / stripped_name
    model_path.mkdir(parents=True, exist_ok=True)

    # NOTE The lazy_import function does not work with the package below
    # so we import it here manually
    from rvc_logic.rvc.train.preprocess import (  # noqa: PLC0415
        preprocess as train_preprocess,
    )

    train_preprocess.preprocess_training_set(
        str(dataset_path),
        sample_rate,
        cpu_cores,
        str(model_path),
        split_method,
        filter_audio,
        clean_audio,
        clean_strength,
        chunk_len,
        overlap_len,
        normalization_mode,
    )
+""" + +from __future__ import annotations + +import logging +import os +import re +import signal + +from rvc_logic.common import PRETRAINED_MODELS_DIR +from rvc_logic.core.common import ( + TRAINING_MODELS_DIR, + VOICE_MODELS_DIR, + copy_files_to_new_dir, + json_dump, + json_load, + validate_model, +) +from rvc_logic.core.exceptions import ( + Entity, + ModelAsssociatedEntityNotFoundError, + ModelExistsError, + NotProvidedError, + PretrainedModelIncompatibleError, + PretrainedModelNotAvailableError, + Step, +) +from rvc_logic.core.train.common import validate_devices +from rvc_logic.core.train.typing_extra import ModelInfo, TrainingInfo +from rvc_logic.typing_extra import ( + DeviceType, + IndexAlgorithm, + PrecisionType, + PretrainedType, + TrainingSampleRate, + Vocoder, +) + +logger = logging.getLogger(__name__) + + +def _get_pretrained_model( + pretrained_type: PretrainedType, + vocoder: Vocoder, + sample_rate: TrainingSampleRate, + custom_pretrained: str | None = None, +) -> tuple[str, str]: + """ + Get the pretrained model to finetune a voice model on. + + Parameters + ---------- + pretrained_type : PretrainedType + The type of pretrained model to finetune the voice model on + vocoder : str + The vocoder to use for audio synthesis when training the voice + model. + sample_rate : int + The sample rate of the preprocessed dataset associated with the + voice model to be trained. + custom_pretrained : str, optional + The name of a custom pretrained model to finetune the voice + model on + + Returns + ------- + pg : str + The path to the generator of the pretrained model to finetune. + pd : str + The path to the discriminator of the pretrained model to + finetune. + + Raises + ------ + ModelAsssociatedEntityNotFoundError + If the voice model to be trained does not have an associated + dataset file list or if a custom pretrained + generator/discriminator model does not have an associated + generator or discriminator. 
+ PretrainedModelIncompatibleError + if a custom pretrained model is not compatible with the sample + rate of the preprocessed dataset associated with the voice model + to be trained. + PretrainedModelNotAvailableError + If no default pretrained model is available for the provided + vocoder and sample rate. + + """ + match pretrained_type: + case PretrainedType.NONE: + pg, pd = "", "" + case PretrainedType.DEFAULT: + base_path = PRETRAINED_MODELS_DIR / vocoder.lower() + pg = base_path / f"f0G{str(sample_rate)[:2]}k.pth" + pd = base_path / f"f0D{str(sample_rate)[:2]}k.pth" + if not pg.is_file() or not pd.is_file(): + raise PretrainedModelNotAvailableError( + name=vocoder, sample_rate=sample_rate, download=False + ) + pg, pd = str(pg), str(pd) + case PretrainedType.CUSTOM: + custom_pretrained_path = validate_model( + custom_pretrained, + Entity.CUSTOM_PRETRAINED_MODEL, + ) + # NOTE simply done to appease the type checker + custom_pretrained = custom_pretrained_path.name + + # TODO need to make this cleaner + custom_pretrained_sample_rate = int(custom_pretrained.split(" ")[-1]) + if not custom_pretrained_sample_rate == sample_rate: + raise PretrainedModelIncompatibleError(custom_pretrained, sample_rate) + + pg = next( + ( + str(path) + for path in custom_pretrained_path.iterdir() + if re.match(r"^(G|f0G).*\.pth$|.*G\.pth$", path.name) + ), + None, + ) + if pg is None: + raise ModelAsssociatedEntityNotFoundError( + Entity.GENERATOR, + custom_pretrained, + ) + pd = next( + ( + str(path) + for path in custom_pretrained_path.iterdir() + if re.match(r"^(D|f0D).*\.pth$|.*D\.pth$", path.name) + ), + None, + ) + if pd is None: + raise ModelAsssociatedEntityNotFoundError( + Entity.DISCRIMINATOR, + custom_pretrained, + ) + + return pg, pd + + +def run_training( + model_name: str, + num_epochs: int = 500, + batch_size: int = 8, + detect_overtraining: bool = False, + overtraining_threshold: int = 50, + vocoder: Vocoder = Vocoder.HIFI_GAN, + index_algorithm: IndexAlgorithm = 
IndexAlgorithm.AUTO, + pretrained_type: PretrainedType = PretrainedType.DEFAULT, + custom_pretrained: str | None = None, + save_interval: int = 10, + save_all_checkpoints: bool = False, + save_all_weights: bool = False, + clear_saved_data: bool = False, + upload_model: bool = False, + upload_name: str | None = None, + hardware_acceleration: DeviceType = DeviceType.AUTOMATIC, + gpu_ids: set[int] | None = None, + precision: PrecisionType = PrecisionType.FP32, + preload_dataset: bool = False, + reduce_memory_usage: bool = False, +) -> list[str] | None: + """ + + Train a voice model using its associated preprocessed dataset and + extracted features. + + Parameters + ---------- + model_name : str + The name of the voice model to train. + num_epochs : int, default=500 + The number of epochs to train the voice model. A higher number + can improve voice model performance but may lead to + overtraining. + batch_size : int, default=8 + The number of samples to include in each training batch. It is + advisable to align this value with the available VRAM of your + GPU. A setting of 4 offers improved accuracy but slower + processing, while 8 provides faster and standard results. + detect_overtraining : bool, default=False + Whether to detect overtraining to prevent the voice model from + learning the training data too well and losing the ability to + generalize to new data. + overtraining_threshold : int, default=50 + The maximum number of epochs to continue training without any + observed improvement in voice model performance. + vocoder : Vocoder, default=Vocoder.HIFI_GAN + The vocoder to use for audio synthesis during training. HiFi-GAN + provides basic audio fidelity, while RefineGAN provides the + highest audio fidelity. + index_algorithm : IndexAlgorithm, default=IndexAlgorithm.AUTO + The method to use for generating an index file for the trained + voice model. KMeans is particularly useful for large datasets. 
+ pretrained_type : PretrainedType, default=PretrainedType.DEFAULT + The type of pretrained model to finetune the voice model on. + "None" will train the voice model from scratch, while + "Default" will use a pretrained model tailored to the specific + voice model architecture. "Custom" will use a custom pretrained + model that you provide. + custom_pretrained: str, optional + The name of a custom pretrained model to finetune the voice + model on. + save_interval : int, default=10 + The epoch interval at which to to save voice model weights and + checkpoints. The best model weights are always saved regardless + of this setting. + save_all_checkpoints : bool, default=False + Whether to save a unique checkpoint at each save interval. If + not enabled, only the latest checkpoint will be saved at each + interval. + save_all_weights : bool, default=False + Whether to save unique voice model weights at each save + interval. If not enabled, only the best voice model weights will + be saved. + clear_saved_data : bool, default=False + Whether to delete any existing training data associated + with the voice model before training commences. Enable this + setting only if you are training a new voice model from scratch + or restarting training. + upload_model : bool, default=False + Whether to automatically upload the trained voice model so that + it can be used for audio generation tasks within the Ultimate + RVC app. + upload_name : str, optional + The name to give the uploaded voice model. + hardware_acceleration : DeviceType, default=DeviceType.AUTOMATIC + The type of hardware acceleration to use when training the voice + model. `AUTOMATIC` will select the first available GPU and fall + back to CPU if no GPUs are available. + gpu_ids : set[int], optional + Set of ids of the GPUs to use for training the voice model when + `GPU` is selected for hardware acceleration. 
+ precision : PrecisionType, default=PrecisionType.FP32 + The precision type to use when training the voice model. FP16 + and BF16 can reduce VRAM usage and speed up training on + supported hardware. + preload_dataset : bool, default=False + Whether to preload all training data into GPU memory. This can + improve training speed but requires a lot of VRAM. + reduce_memory_usage : bool, default=False + Whether to reduce VRAM usage at the cost of slower training + speed by enabling activation checkpointing. This is useful for + GPUs with limited memory (e.g., <6GB VRAM) or when training with + a batch size larger than what your GPU can normally accommodate. + + Returns + ------- + list[str] | None + A list containing the paths to the best weights file and the + index file for the trained voice model, if they exist. + Otherwise, None. + + Raises + ------ + ModelAsssociatedEntityNotFoundError + If the voice model to be trained does not have an associated + dataset file list. + NotProvidedError + If an upload name is not provided when the upload parameter is + set + ModelExistsError + If a voice with the provided upload name already exists when the + upload parameter is set + + + """ + model_path = validate_model(model_name, Entity.TRAINING_MODEL) + filelist_path = model_path / "filelist.txt" + if not filelist_path.is_file(): + raise ModelAsssociatedEntityNotFoundError( + Entity.DATASET_FILE_LIST, + model_name, + Step.FEATURE_EXTRACTION, + ) + upload_model_path = None + if upload_model: + if not upload_name: + raise NotProvidedError(Entity.UPLOAD_NAME) + upload_model_path = VOICE_MODELS_DIR / upload_name.strip() + if upload_model_path.is_dir(): + raise ModelExistsError(Entity.VOICE_MODEL, upload_name) + + model_info_dict = json_load(model_path / "model_info.json") + + model_info = ModelInfo.model_validate(model_info_dict) + sample_rate = model_info.sample_rate + + pg, pd = _get_pretrained_model( + pretrained_type, + vocoder, + sample_rate, + custom_pretrained, + ) + + 
from rvc_logic.rvc.train.train import main as train_main # noqa: PLC0415 + + device_type, device_ids = validate_devices(hardware_acceleration, gpu_ids) + + train_main( + model_name, + sample_rate, + vocoder, + num_epochs, + batch_size, + save_interval, + not save_all_checkpoints, + save_all_weights, + pg, + pd, + detect_overtraining, + overtraining_threshold, + clear_saved_data, + preload_dataset, + reduce_memory_usage, + device_type, + device_ids, + precision, + ) + + model_file = model_path / f"{model_name}_best.pth" + + if not model_file.is_file(): + return None + + from rvc_logic.rvc.train.process.extract_index import ( # noqa: PLC0415 + main as extract_index_main, + ) + + extract_index_main(str(model_path), index_algorithm) + + index_file = model_path / f"{model_name}.index" + + if not index_file.is_file(): + return None + if upload_model_path: + copy_files_to_new_dir([index_file, model_file], upload_model_path) + return [str(model_file), str(index_file)] + + +def stop_training(model_name: str) -> None: + """ + Stop the training of a voice model. + + Parameters + ---------- + model_name : str + The name of the voice model to stop training for. 
+ + """ + training_info_path = TRAINING_MODELS_DIR / model_name / "config.json" + try: + training_info_dict = json_load(training_info_path) + training_info = TrainingInfo.model_validate(training_info_dict) + process_ids = training_info.process_pids + for pid in process_ids: + os.kill(pid, signal.SIGTERM) + training_info.process_pids = [] + updated_training_info_dict = training_info.model_dump() + json_dump(updated_training_info_dict, training_info_path) + except Exception as e: # noqa: BLE001 + logger.error("Error stopping training: %s", e) # noqa: TRY400 +s", e) # noqa: TRY400 diff --git a/rvc_logic/core_train/typing_extra.py b/rvc_logic/core_train/typing_extra.py new file mode 100644 index 0000000000000000000000000000000000000000..5cf286043368c4346886461fa394195c92f36491 --- /dev/null +++ b/rvc_logic/core_train/typing_extra.py @@ -0,0 +1,43 @@ +""" +Module which defines extra types used by modules in the +rvc_logic.core.train package. +""" + +from __future__ import annotations + +from pydantic import BaseModel, ConfigDict + +from rvc_logic.typing_extra import TrainingSampleRate # noqa: TC002 + + +class ModelInfo(BaseModel): + """ + Information about a voice model to be trained. + + Attributes + ---------- + sample_rate : TrainingSampleRate + The sample rate of the post-processed audio to train the model + on. + + """ + + sample_rate: TrainingSampleRate + # TODO add more attributes later + + +class TrainingInfo(BaseModel): + """ + Information about the ongoing training of a voice model. + + Attributes + ---------- + process_pids : list[int], default = [] + The ids of the processes running the training. 
+ + """ + + process_pids: list[int] = [] + # TODO add more attributes later + model_config = ConfigDict(extra="allow") +ow") diff --git a/rvc_logic/rvc/__init__.py b/rvc_logic/rvc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bad38dab81b953f3f9d64e267fe9e3a65b2b5516 --- /dev/null +++ b/rvc_logic/rvc/__init__.py @@ -0,0 +1,4 @@ +""" +The rvc package is a collection of tools for voice cloning using the RVC +method. +""" diff --git a/rvc_logic/rvc/common.py b/rvc_logic/rvc/common.py new file mode 100644 index 0000000000000000000000000000000000000000..ffce659f74236acad9c1fbee2f25cdf65e5b1465 --- /dev/null +++ b/rvc_logic/rvc/common.py @@ -0,0 +1,9 @@ +"""Common constants and functions for the RVC package.""" + +from __future__ import annotations + +from pathlib import Path + +RVC_DIR = Path(__file__).resolve().parent +RVC_CONFIGS_DIR = RVC_DIR / "configs" +RVC_TRAINING_MODELS_DIR = RVC_DIR / "train" / "models" diff --git a/rvc_logic/rvc/configs/32000.json b/rvc_logic/rvc/configs/32000.json new file mode 100644 index 0000000000000000000000000000000000000000..0beacf7060f507416d5ae44e3b3098ec0c3241b6 --- /dev/null +++ b/rvc_logic/rvc/configs/32000.json @@ -0,0 +1,75 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "learning_rate": 1e-4, + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-9, + "lr_decay": 0.999875, + "segment_size": 12800, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 32000, + "filter_length": 1024, + "hop_length": 320, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + 
"upsample_rates": [ + 10, + 8, + 2, + 2 + ], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 20, + 16, + 4, + 4 + ], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} \ No newline at end of file diff --git a/rvc_logic/rvc/configs/40000.json b/rvc_logic/rvc/configs/40000.json new file mode 100644 index 0000000000000000000000000000000000000000..46c06427d8b21e328b58f0dccec341641e8b2f22 --- /dev/null +++ b/rvc_logic/rvc/configs/40000.json @@ -0,0 +1,75 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "learning_rate": 1e-4, + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-9, + "lr_decay": 0.999875, + "segment_size": 12800, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 40000, + "filter_length": 2048, + "hop_length": 400, + "win_length": 2048, + "n_mel_channels": 125, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "upsample_rates": [ + 10, + 10, + 2, + 2 + ], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} \ No newline at end of file diff --git a/rvc_logic/rvc/configs/48000.json b/rvc_logic/rvc/configs/48000.json new file mode 100644 index 0000000000000000000000000000000000000000..218bea43990b1208fc0fb912f9d2e25bdf5c46ea --- /dev/null +++ b/rvc_logic/rvc/configs/48000.json @@ -0,0 +1,75 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "learning_rate": 1e-4, + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-9, + "lr_decay": 0.999875, + "segment_size": 17280, + "c_mel": 45, + "c_kl": 1.0 + }, 
+ "data": { + "max_wav_value": 32768.0, + "sample_rate": 48000, + "filter_length": 2048, + "hop_length": 480, + "win_length": 2048, + "n_mel_channels": 128, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "upsample_rates": [ + 12, + 10, + 2, + 2 + ], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 24, + 20, + 4, + 4 + ], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} \ No newline at end of file diff --git a/rvc_logic/rvc/configs/config.py b/rvc_logic/rvc/configs/config.py new file mode 100644 index 0000000000000000000000000000000000000000..ed07b241f956c50c0b3593945f07d21f417f4784 --- /dev/null +++ b/rvc_logic/rvc/configs/config.py @@ -0,0 +1,105 @@ +import json +import os +import pathlib + +import torch + +from rvc_logic.rvc.common import RVC_CONFIGS_DIR + +version_config_paths = [ + os.path.join("48000.json"), + os.path.join("40000.json"), + os.path.join("32000.json"), +] + + +def singleton(cls): + instances = {} + + def get_instance(*args, **kwargs): + if cls not in instances: + instances[cls] = cls(*args, **kwargs) + return instances[cls] + + return get_instance + + +@singleton +class Config: + def __init__(self): + self.device = "cuda:0" if torch.cuda.is_available() else "cpu" + self.gpu_name = ( + torch.cuda.get_device_name(int(self.device.split(":")[-1])) + if self.device.startswith("cuda") + else None + ) + self.json_config = self.load_config_json() + self.gpu_mem = None + self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() + + def load_config_json(self) -> dict: + configs = {} + for config_file in version_config_paths: + 
config_path = os.path.join(str(RVC_CONFIGS_DIR), config_file) + with pathlib.Path(config_path).open() as f: + configs[config_file] = json.load(f) + return configs + + def device_config(self): + if self.device.startswith("cuda"): + self.set_cuda_config() + else: + self.device = "cpu" + + # Configuration for 6GB GPU memory + x_pad, x_query, x_center, x_max = (1, 6, 38, 41) + if self.gpu_mem is not None and self.gpu_mem <= 4: + # Configuration for 5GB GPU memory + x_pad, x_query, x_center, x_max = (1, 5, 30, 32) + + return x_pad, x_query, x_center, x_max + + def set_cuda_config(self): + i_device = int(self.device.split(":")[-1]) + self.gpu_name = torch.cuda.get_device_name(i_device) + + self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // ( + 1024**3 + ) + + +def max_vram_gpu(gpu): + if torch.cuda.is_available(): + gpu_properties = torch.cuda.get_device_properties(gpu) + total_memory_gb = round(gpu_properties.total_memory / 1024 / 1024 / 1024) + return total_memory_gb + return "8" + + +def get_gpu_info(): + ngpu = torch.cuda.device_count() + gpu_infos = [] + if torch.cuda.is_available() or ngpu != 0: + for i in range(ngpu): + gpu_name = torch.cuda.get_device_name(i) + mem = int( + torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024 + + 0.4, + ) + gpu_infos.append(f"{i}: {gpu_name} ({mem} GB)") + if len(gpu_infos) > 0: + gpu_info = "\n".join(gpu_infos) + else: + gpu_info = ( + "Unfortunately, there is no compatible GPU available to support your" + " training." 
+ ) + return gpu_info + + +def get_number_of_gpus(): + if torch.cuda.is_available(): + num_gpus = torch.cuda.device_count() + return "-".join(map(str, range(num_gpus))) + return "-" +" diff --git a/rvc_logic/rvc/infer/infer.py b/rvc_logic/rvc/infer/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..7a042d785d5af3dea93149328fd4c3f6660fcc41 --- /dev/null +++ b/rvc_logic/rvc/infer/infer.py @@ -0,0 +1,528 @@ +from typing import TYPE_CHECKING, Unpack + +import logging +import os +import pathlib +import sys +import time +import traceback + +import soxr + +import numpy as np + +import torch + +import librosa +import soundfile as sf +from pedalboard import ( + Bitcrush, + Chorus, + Clipping, + Compressor, + Delay, + Distortion, + Gain, + Limiter, + Pedalboard, + PitchShift, + Reverb, +) + +now_dir = pathlib.Path.cwd() +sys.path.append(str(now_dir)) +import lazy_loader as lazy + +from rvc_logic.rvc.configs.config import Config +from rvc_logic.rvc.infer.pipeline import Pipeline as VC +from rvc_logic.rvc.infer.typing_extra import ConvertAudioKwArgs +from rvc_logic.rvc.lib.algorithm.synthesizers import Synthesizer +from rvc_logic.rvc.lib.tools.split_audio import merge_audio, process_audio +from rvc_logic.rvc.lib.utils import load_audio_infer, load_embedding +from rvc_logic.typing_extra import F0Method + +if TYPE_CHECKING: + import noisereduce as nr +else: + nr = lazy.load("noisereduce") + +# logging.getLogger("httpx").setLevel(logging.WARNING) +# logging.getLogger("httpcore").setLevel(logging.WARNING) +# logging.getLogger("faiss").setLevel(logging.WARNING) +# logging.getLogger("faiss.loader").setLevel(logging.WARNING) +logger = logging.getLogger(__name__) + + +class VoiceConverter: + """ + A class for performing voice conversion using the Retrieval-Based Voice Conversion (RVC) method. + """ + + def __init__(self): + """ + Initializes the VoiceConverter with default configuration, and sets up models and parameters. 
+ """ + self.config = Config() # Load configuration + self.hubert_model = ( + None # Initialize the Hubert model (for embedding extraction) + ) + self.last_embedder_model = None # Last used embedder model + self.tgt_sr = None # Target sampling rate for the output audio + self.net_g = None # Generator network for voice conversion + self.vc = None # Voice conversion pipeline instance + self.cpt = None # Checkpoint for loading model weights + self.version = None # Model version + self.n_spk = None # Number of speakers in the model + self.use_f0 = None # Whether the model uses F0 + self.loaded_model = None + + def load_hubert(self, embedder_model: str, embedder_model_custom: str = None): + """ + Loads the HuBERT model for speaker embedding extraction. + + Args: + embedder_model (str): Path to the pre-trained HuBERT model. + embedder_model_custom (str): Path to the custom HuBERT model. + + """ + self.hubert_model = load_embedding(embedder_model, embedder_model_custom) + self.hubert_model = self.hubert_model.to(self.config.device).float() + self.hubert_model.eval() + + @staticmethod + def remove_audio_noise(data, sr, reduction_strength=0.7): + """ + Removes noise from an audio file using the NoiseReduce library. + + Args: + data (numpy.ndarray): The audio data as a NumPy array. + sr (int): The sample rate of the audio data. + reduction_strength (float): Strength of the noise reduction. Default is 0.7. + + """ + try: + + reduced_noise = nr.reduce_noise( + y=data, + sr=sr, + prop_decrease=reduction_strength, + ) + return reduced_noise + except Exception as error: + print(f"An error occurred removing audio noise: {error}") + return None + + @staticmethod + def convert_audio_format(input_path, output_path, output_format): + """ + Converts an audio file to a specified output format. + + Args: + input_path (str): Path to the input audio file. + output_path (str): Path to the output audio file. + output_format (str): Desired audio format (e.g., "WAV", "MP3"). 
+ + """ + try: + if output_format != "WAV": + print(f"Saving audio as {output_format}...") + audio, sample_rate = librosa.load(input_path, sr=None) + common_sample_rates = [ + 8000, + 11025, + 12000, + 16000, + 22050, + 24000, + 32000, + 44100, + 48000, + ] + target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate)) + audio = librosa.resample( + audio, + orig_sr=sample_rate, + target_sr=target_sr, + res_type="soxr_vhq", + ) + sf.write(output_path, audio, target_sr, format=output_format.lower()) + return output_path + except Exception as error: + print(f"An error occurred converting the audio format: {error}") + + @staticmethod + def post_process_audio( + audio_input, + sample_rate, + **kwargs, + ): + board = Pedalboard() + if kwargs.get("reverb"): + reverb = Reverb( + room_size=kwargs.get("reverb_room_size", 0.5), + damping=kwargs.get("reverb_damping", 0.5), + wet_level=kwargs.get("reverb_wet_level", 0.33), + dry_level=kwargs.get("reverb_dry_level", 0.4), + width=kwargs.get("reverb_width", 1.0), + freeze_mode=kwargs.get("reverb_freeze_mode", 0), + ) + board.append(reverb) + if kwargs.get("pitch_shift"): + pitch_shift = PitchShift(semitones=kwargs.get("pitch_shift_semitones", 0)) + board.append(pitch_shift) + if kwargs.get("limiter"): + limiter = Limiter( + threshold_db=kwargs.get("limiter_threshold", -6), + release_ms=kwargs.get("limiter_release", 0.05), + ) + board.append(limiter) + if kwargs.get("gain"): + gain = Gain(gain_db=kwargs.get("gain_db", 0)) + board.append(gain) + if kwargs.get("distortion"): + distortion = Distortion(drive_db=kwargs.get("distortion_gain", 25)) + board.append(distortion) + if kwargs.get("chorus"): + chorus = Chorus( + rate_hz=kwargs.get("chorus_rate", 1.0), + depth=kwargs.get("chorus_depth", 0.25), + centre_delay_ms=kwargs.get("chorus_delay", 7), + feedback=kwargs.get("chorus_feedback", 0.0), + mix=kwargs.get("chorus_mix", 0.5), + ) + board.append(chorus) + if kwargs.get("bitcrush"): + bitcrush = 
Bitcrush(bit_depth=kwargs.get("bitcrush_bit_depth", 8)) + board.append(bitcrush) + if kwargs.get("clipping"): + clipping = Clipping(threshold_db=kwargs.get("clipping_threshold", 0)) + board.append(clipping) + if kwargs.get("compressor"): + compressor = Compressor( + threshold_db=kwargs.get("compressor_threshold", 0), + ratio=kwargs.get("compressor_ratio", 1), + attack_ms=kwargs.get("compressor_attack", 1.0), + release_ms=kwargs.get("compressor_release", 100), + ) + board.append(compressor) + if kwargs.get("delay"): + delay = Delay( + delay_seconds=kwargs.get("delay_seconds", 0.5), + feedback=kwargs.get("delay_feedback", 0.0), + mix=kwargs.get("delay_mix", 0.5), + ) + board.append(delay) + return board(audio_input, sample_rate) + + def convert_audio( + self, + audio_input_path: str, + audio_output_path: str, + model_path: str, + index_path: str, + pitch: int = 0, + f0_method: F0Method = "rmvpe", + index_rate: float = 0.75, + volume_envelope: float = 1, + protect: float = 0.5, + split_audio: bool = False, + f0_autotune: bool = False, + f0_autotune_strength: float = 1, + embedder_model: str = "contentvec", + embedder_model_custom: str | None = None, + clean_audio: bool = False, + clean_strength: float = 0.5, + export_format: str = "WAV", + post_process: bool = False, + resample_sr: int = 0, + sid: int = 0, + proposed_pitch: bool = False, + proposed_pitch_threshold: float = 155.0, + **kwargs: Unpack[ConvertAudioKwArgs], + ): + """ + Performs voice conversion on the input audio. + + Args: + pitch (int): Key for F0 up-sampling. + index_rate (float): Rate for index matching. + volume_envelope (int): RMS mix rate. + protect (float): Protection rate for certain audio segments. + f0_method (str): Method for F0 extraction. + audio_input_path (str): Path to the input audio file. + audio_output_path (str): Path to the output audio file. + model_path (str): Path to the voice conversion model. + index_path (str): Path to the index file. 
+ split_audio (bool): Whether to split the audio for processing. + f0_autotune (bool): Whether to use F0 autotune. + clean_audio (bool): Whether to clean the audio. + clean_strength (float): Strength of the audio cleaning. + export_format (str): Format for exporting the audio. + embedder_model (str): Path to the embedder model. + embedder_model_custom (str): Path to the custom embedder model. + resample_sr (int, optional): Resample sampling rate. Default is 0. + sid (int, optional): Speaker ID. Default is 0. + **kwargs: Additional keyword arguments. + + """ + if not model_path: + logger.info("No model path provided. Aborting conversion.") + return + + self.get_vc(model_path, sid) + start_time = time.time() + logger.info("Converting audio '%s'...", audio_input_path) + + audio = load_audio_infer( + audio_input_path, + 16000, + **kwargs, + ) + audio_max = np.abs(audio).max() / 0.95 + + if audio_max > 1: + audio /= audio_max + + if not self.hubert_model or embedder_model != self.last_embedder_model: + self.load_hubert(embedder_model, embedder_model_custom) + self.last_embedder_model = embedder_model + + file_index = ( + index_path.strip() + .strip('"') + .strip("\n") + .strip('"') + .strip() + .replace("trained", "added") + ) + + if self.tgt_sr != resample_sr >= 16000: + self.tgt_sr = resample_sr + + if split_audio: + chunks, intervals = process_audio(audio, 16000) + logger.info("Audio split into %d chunks for processing.", len(chunks)) + else: + chunks = [] + chunks.append(audio) + + converted_chunks = [] + for c in chunks: + audio_opt = self.vc.pipeline( + model=self.hubert_model, + net_g=self.net_g, + sid=sid, + audio=c, + pitch=pitch, + f0_method=f0_method or F0Method.RMVPE, + file_index=file_index, + index_rate=index_rate, + pitch_guidance=self.use_f0, + volume_envelope=volume_envelope, + version=self.version, + protect=protect, + f0_autotune=f0_autotune, + f0_autotune_strength=f0_autotune_strength, + proposed_pitch=proposed_pitch, + 
proposed_pitch_threshold=proposed_pitch_threshold, + ) + converted_chunks.append(audio_opt) + if split_audio: + logger.info("Converted audio chunk %d", len(converted_chunks)) + + if split_audio: + audio_opt = merge_audio( + chunks, + converted_chunks, + intervals, + 16000, + self.tgt_sr, + ) + else: + audio_opt = converted_chunks[0] + + if clean_audio: + cleaned_audio = self.remove_audio_noise( + audio_opt, + self.tgt_sr, + clean_strength, + ) + if cleaned_audio is not None: + audio_opt = cleaned_audio + + if post_process: + audio_opt = self.post_process_audio( + audio_input=audio_opt, + sample_rate=self.tgt_sr, + **kwargs, + ) + + sf.write(audio_output_path, audio_opt, self.tgt_sr, format="WAV") + output_path_format = audio_output_path.replace( + ".wav", + f".{export_format.lower()}", + ) + audio_output_path = self.convert_audio_format( + audio_output_path, + output_path_format, + export_format, + ) + + elapsed_time = time.time() - start_time + logger.info( + "Conversion completed at '%s' in %.2f seconds.", + audio_output_path, + elapsed_time, + ) + + def convert_audio_batch( + self, + audio_input_paths: str, + audio_output_path: str, + **kwargs, + ): + """ + Performs voice conversion on a batch of input audio files. + + Args: + audio_input_paths (str): List of paths to the input audio files. + audio_output_path (str): Path to the output audio file. + resample_sr (int, optional): Resample sampling rate. Default is 0. + sid (int, optional): Speaker ID. Default is 0. + **kwargs: Additional keyword arguments. 
+ + """ + pid = os.getpid() + try: + with pathlib.Path(os.path.join(now_dir, "assets", "infer_pid.txt")).open( + "w", + ) as pid_file: + pid_file.write(str(pid)) + start_time = time.time() + print(f"Converting audio batch '{audio_input_paths}'...") + audio_files = [ + f + for f in os.listdir(audio_input_paths) + if f.lower().endswith( + ( + "wav", + "mp3", + "flac", + "ogg", + "opus", + "m4a", + "mp4", + "aac", + "alac", + "wma", + "aiff", + "webm", + "ac3", + ), + ) + ] + print(f"Detected {len(audio_files)} audio files for inference.") + for a in audio_files: + new_input = os.path.join(audio_input_paths, a) + new_output = os.path.splitext(a)[0] + "_output.wav" + new_output = os.path.join(audio_output_path, new_output) + if pathlib.Path(new_output).exists(): + continue + self.convert_audio( + audio_input_path=new_input, + audio_output_path=new_output, + **kwargs, + ) + print(f"Conversion completed at '{audio_input_paths}'.") + elapsed_time = time.time() - start_time + print(f"Batch conversion completed in {elapsed_time:.2f} seconds.") + except Exception as error: + print(f"An error occurred during audio batch conversion: {error}") + print(traceback.format_exc()) + finally: + pathlib.Path(os.path.join(now_dir, "assets", "infer_pid.txt")).unlink() + + def get_vc(self, weight_root, sid): + """ + Loads the voice conversion model and sets up the pipeline. + + Args: + weight_root (str): Path to the model weights. + sid (int): Speaker ID. + + """ + if sid == "" or sid == []: + self.cleanup_model() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + if not self.loaded_model or self.loaded_model != weight_root: + self.load_model(weight_root) + if self.cpt is not None: + self.setup_network() + self.setup_vc_instance() + self.loaded_model = weight_root + else: + self.vc = None + self.loaded_model = None + + def cleanup_model(self): + """ + Cleans up the model and releases resources. 
+ """ + if self.hubert_model is not None: + del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr + self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + del self.net_g, self.cpt + if torch.cuda.is_available(): + torch.cuda.empty_cache() + self.cpt = None + + def load_model(self, weight_root): + """ + Loads the model weights from the specified path. + + Args: + weight_root (str): Path to the model weights. + + """ + self.cpt = ( + torch.load(weight_root, map_location="cpu", weights_only=False) + if pathlib.Path(weight_root).is_file() + else None + ) + + def setup_network(self): + """ + Sets up the network configuration based on the loaded checkpoint. + """ + if self.cpt is not None: + self.tgt_sr = self.cpt["config"][-1] + self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0] + self.use_f0 = self.cpt.get("f0", 1) + + self.version = self.cpt.get("version", "v1") + self.text_enc_hidden_dim = 768 if self.version == "v2" else 256 + self.vocoder = self.cpt.get("vocoder", "HiFi-GAN") + self.net_g = Synthesizer( + *self.cpt["config"], + use_f0=self.use_f0, + text_enc_hidden_dim=self.text_enc_hidden_dim, + vocoder=self.vocoder, + ) + del self.net_g.enc_q + self.net_g.load_state_dict(self.cpt["weight"], strict=False) + self.net_g = self.net_g.to(self.config.device).float() + self.net_g.eval() + + def setup_vc_instance(self): + """ + Sets up the voice conversion pipeline instance based on the target sampling rate and configuration. 
+ """ + if self.cpt is not None: + self.vc = VC(self.tgt_sr, self.config) + self.n_spk = self.cpt["config"][-3] +f.cpt["config"][-3] diff --git a/rvc_logic/rvc/infer/pipeline.py b/rvc_logic/rvc/infer/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..a04732c6b4043587f51dff21eccfd2e5ed25f6b0 --- /dev/null +++ b/rvc_logic/rvc/infer/pipeline.py @@ -0,0 +1,581 @@ +import pathlib +import sys + +import numpy as np +from scipy import signal + +import faiss +import torch +import torch.nn.functional as F + +import librosa + +now_dir = pathlib.Path.cwd() +sys.path.append(str(now_dir)) + +import logging + +from rvc_logic.rvc.lib.predictors.f0 import CREPE, FCPE, RMVPE + +# logging.getLogger("faiss").setLevel(logging.WARNING) +logger = logging.getLogger(__name__) + +# Constants for high-pass filter +FILTER_ORDER = 5 +CUTOFF_FREQUENCY = 48 # Hz +SAMPLE_RATE = 16000 # Hz +bh, ah = signal.butter( + N=FILTER_ORDER, + Wn=CUTOFF_FREQUENCY, + btype="high", + fs=SAMPLE_RATE, +) + + +class AudioProcessor: + """ + A class for processing audio signals, specifically for adjusting RMS levels. + """ + + def change_rms( + source_audio: np.ndarray, + source_rate: int, + target_audio: np.ndarray, + target_rate: int, + rate: float, + ) -> np.ndarray: + """ + Adjust the RMS level of target_audio to match the RMS of source_audio, with a given blending rate. + + Args: + source_audio: The source audio signal as a NumPy array. + source_rate: The sampling rate of the source audio. + target_audio: The target audio signal to adjust. + target_rate: The sampling rate of the target audio. + rate: The blending rate between the source and target RMS levels. 
+ + """ + # Calculate RMS of both audio data + rms1 = librosa.feature.rms( + y=source_audio, + frame_length=source_rate // 2 * 2, + hop_length=source_rate // 2, + ) + rms2 = librosa.feature.rms( + y=target_audio, + frame_length=target_rate // 2 * 2, + hop_length=target_rate // 2, + ) + + # Interpolate RMS to match target audio length + rms1 = F.interpolate( + torch.from_numpy(rms1).float().unsqueeze(0), + size=target_audio.shape[0], + mode="linear", + ).squeeze() + rms2 = F.interpolate( + torch.from_numpy(rms2).float().unsqueeze(0), + size=target_audio.shape[0], + mode="linear", + ).squeeze() + rms2 = torch.maximum(rms2, torch.zeros_like(rms2) + 1e-6) + + # Adjust target audio RMS based on the source audio RMS + adjusted_audio = ( + target_audio + * (torch.pow(rms1, 1 - rate) * torch.pow(rms2, rate - 1)).numpy() + ) + return adjusted_audio + + +class Autotune: + """ + A class for applying autotune to a given fundamental frequency (F0) contour. + """ + + def __init__(self): + """ + Initializes the Autotune class with a set of reference frequencies. 
+ """ + self.note_dict = [ + 49.00, # G1 + 51.91, # G#1 / Ab1 + 55.00, # A1 + 58.27, # A#1 / Bb1 + 61.74, # B1 + 65.41, # C2 + 69.30, # C#2 / Db2 + 73.42, # D2 + 77.78, # D#2 / Eb2 + 82.41, # E2 + 87.31, # F2 + 92.50, # F#2 / Gb2 + 98.00, # G2 + 103.83, # G#2 / Ab2 + 110.00, # A2 + 116.54, # A#2 / Bb2 + 123.47, # B2 + 130.81, # C3 + 138.59, # C#3 / Db3 + 146.83, # D3 + 155.56, # D#3 / Eb3 + 164.81, # E3 + 174.61, # F3 + 185.00, # F#3 / Gb3 + 196.00, # G3 + 207.65, # G#3 / Ab3 + 220.00, # A3 + 233.08, # A#3 / Bb3 + 246.94, # B3 + 261.63, # C4 + 277.18, # C#4 / Db4 + 293.66, # D4 + 311.13, # D#4 / Eb4 + 329.63, # E4 + 349.23, # F4 + 369.99, # F#4 / Gb4 + 392.00, # G4 + 415.30, # G#4 / Ab4 + 440.00, # A4 + 466.16, # A#4 / Bb4 + 493.88, # B4 + 523.25, # C5 + 554.37, # C#5 / Db5 + 587.33, # D5 + 622.25, # D#5 / Eb5 + 659.25, # E5 + 698.46, # F5 + 739.99, # F#5 / Gb5 + 783.99, # G5 + 830.61, # G#5 / Ab5 + 880.00, # A5 + 932.33, # A#5 / Bb5 + 987.77, # B5 + 1046.50, # C6 + ] + + def autotune_f0(self, f0, f0_autotune_strength): + """ + Autotunes a given F0 contour by snapping each frequency to the closest reference frequency. + + Args: + f0: The input F0 contour as a NumPy array. + + """ + autotuned_f0 = np.zeros_like(f0) + for i, freq in enumerate(f0): + closest_note = min(self.note_dict, key=lambda x: abs(x - freq)) + autotuned_f0[i] = freq + (closest_note - freq) * f0_autotune_strength + return autotuned_f0 + + +class Pipeline: + """ + The main pipeline class for performing voice conversion, including preprocessing, F0 estimation, + voice conversion using a model, and post-processing. + """ + + def __init__(self, tgt_sr, config): + """ + Initializes the Pipeline class with target sampling rate and configuration parameters. + + Args: + tgt_sr: The target sampling rate for the output audio. + config: A configuration object containing various parameters for the pipeline. 
+ + """ + self.x_pad = config.x_pad + self.x_query = config.x_query + self.x_center = config.x_center + self.x_max = config.x_max + self.sample_rate = 16000 + self.tgt_sr = tgt_sr + self.window = 160 + self.t_pad = self.sample_rate * self.x_pad + self.t_pad_tgt = tgt_sr * self.x_pad + self.t_pad2 = self.t_pad * 2 + self.t_query = self.sample_rate * self.x_query + self.t_center = self.sample_rate * self.x_center + self.t_max = self.sample_rate * self.x_max + self.time_step = self.window / self.sample_rate * 1000 + self.f0_min = 50 + self.f0_max = 1100 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + self.device = config.device + self.autotune = Autotune() + + def get_f0( + self, + x, + p_len, + f0_method: str = "rmvpe", + pitch: int = 0, + f0_autotune: bool = False, + f0_autotune_strength: float = 1.0, + proposed_pitch: bool = False, + proposed_pitch_threshold: float = 155.0, + ): + """ + Estimates the fundamental frequency (F0) of a given audio signal using various methods. + + Args: + x: The input audio signal as a NumPy array. + p_len: Desired length of the F0 output. + pitch: Key to adjust the pitch of the F0 contour. + f0_method: Method to use for F0 estimation (e.g., "crepe"). + f0_autotune: Whether to apply autotune to the F0 contour. 
+ proposed_pitch: whether to apply proposed pitch adjustment + proposed_pitch_threshold: target frequency, 155.0 for male, 255.0 for female + + """ + if f0_method == "crepe": + model = CREPE( + device=self.device, sample_rate=self.sample_rate, hop_size=self.window + ) + f0 = model.get_f0(x, self.f0_min, self.f0_max, p_len, "full") + del model + elif f0_method == "crepe-tiny": + model = CREPE( + device=self.device, sample_rate=self.sample_rate, hop_size=self.window + ) + f0 = model.get_f0(x, self.f0_min, self.f0_max, p_len, "tiny") + del model + elif f0_method == "rmvpe": + model = RMVPE( + device=self.device, sample_rate=self.sample_rate, hop_size=self.window + ) + f0 = model.get_f0(x, filter_radius=0.03) + del model + elif f0_method == "fcpe": + model = FCPE( + device=self.device, sample_rate=self.sample_rate, hop_size=self.window + ) + f0 = model.get_f0(x, p_len, filter_radius=0.006) + del model + + # f0 adjustments + if f0_autotune is True: + f0 = self.autotune.autotune_f0(f0, f0_autotune_strength) + elif proposed_pitch is True: + limit = 12 + # calculate median f0 of the audio + valid_f0 = np.where(f0 > 0)[0] + if len(valid_f0) < 2: + # no valid f0 detected + up_key = 0 + else: + median_f0 = float( + np.median(np.interp(np.arange(len(f0)), valid_f0, f0[valid_f0])) + ) + if median_f0 <= 0 or np.isnan(median_f0): + up_key = 0 + else: + # calculate proposed shift + up_key = max( + -limit, + min( + limit, + int( + np.round( + 12 * np.log2(proposed_pitch_threshold / median_f0) + ) + ), + ), + ) + logger.info("calculated pitch offset: %d", up_key) + f0 *= pow(2, (pitch + up_key) / 12) + else: + f0 *= pow(2, pitch / 12) + # quantizing f0 to 255 buckets to make coarse f0 + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / ( + self.f0_mel_max - self.f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(int) + + return f0_coarse, f0bak + + 
def voice_conversion(
    self,
    model,
    net_g,
    sid,
    audio0,
    pitch,
    pitchf,
    index,
    big_npy,
    index_rate,
    version,
    protect,
):
    """
    Performs voice conversion on a given audio segment.

    Args:
        model: The feature extractor model; called on the waveform and
            indexed with ``"last_hidden_state"``.
        net_g: The generative model; its ``infer`` method synthesizes audio.
        sid: Speaker ID for the target voice.
        audio0: The input audio segment (NumPy array, mono or 2-D).
        pitch: Quantized F0 contour, or None to disable pitch guidance.
        pitchf: Continuous F0 contour, or None to disable pitch guidance.
        index: FAISS index for speaker embedding retrieval (falsy to skip).
        big_npy: Speaker embeddings stored in a NumPy array.
        index_rate: Blending rate for speaker embedding retrieval.
        version: Model version ("v1" applies ``model.final_proj``).
        protect: Blend weight given to the converted features on frames where
            ``pitchf < 1``; only applied when ``protect < 0.5``.

    Returns:
        The converted audio segment as a float NumPy array.

    """
    with torch.no_grad():
        # Identity checks: comparing a tensor with `!= None` only works through
        # a comparison fallback and hides the intent.
        pitch_guidance = pitch is not None and pitchf is not None
        # prepare source audio
        feats = torch.from_numpy(audio0).float()
        feats = feats.mean(-1) if feats.dim() == 2 else feats
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1).to(self.device)
        # extract features
        feats = model(feats)["last_hidden_state"]
        feats = (
            model.final_proj(feats[0]).unsqueeze(0) if version == "v1" else feats
        )
        # make a copy for pitch guidance and protection
        feats0 = feats.clone() if pitch_guidance else None
        if (
            index
        ):  # set by parent function, only true if index is available, loaded, and index rate > 0
            feats = self._retrieve_speaker_embeddings(
                feats,
                index,
                big_npy,
                index_rate,
            )
        # feature upsampling
        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(
            0,
            2,
            1,
        )
        # adjust the length if the audio is short
        p_len = min(audio0.shape[0] // self.window, feats.shape[1])
        if pitch_guidance:
            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                0,
                2,
                1,
            )
            pitch, pitchf = pitch[:, :p_len], pitchf[:, :p_len]
            # Pitch protection blending
            if protect < 0.5:
                pitchff = pitchf.clone()
                pitchff[pitchf > 0] = 1
                pitchff[pitchf < 1] = protect
                feats = feats * pitchff.unsqueeze(-1) + feats0 * (
                    1 - pitchff.unsqueeze(-1)
                )
                feats = feats.to(feats0.dtype)
        else:
            pitch, pitchf = None, None
        p_len = torch.tensor([p_len], device=self.device).long()
        # Without pitch guidance `pitchf` is None; calling `.float()` on it
        # would raise AttributeError, so only cast when a contour is present.
        pitchf_in = pitchf.float() if pitchf is not None else None
        audio1 = (
            (net_g.infer(feats.float(), p_len, pitch, pitchf_in, sid)[0][0, 0])
            .data.cpu()
            .float()
            .numpy()
        )
        # clean up
        del feats, feats0, p_len
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return audio1
+ hop_length: Hop length for F0 estimation methods. + f0_autotune: Whether to apply autotune to the F0 contour. + + """ + if file_index != "" and pathlib.Path(file_index).exists() and index_rate > 0: + try: + index = faiss.read_index(file_index) + big_npy = index.reconstruct_n(0, index.ntotal) + except Exception as error: + print(f"An error occurred reading the FAISS index: {error}") + index = big_npy = None + else: + index = big_npy = None + audio = signal.filtfilt(bh, ah, audio) + audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") + opt_ts = [] + if audio_pad.shape[0] > self.t_max: + audio_sum = np.zeros_like(audio) + for i in range(self.window): + audio_sum += audio_pad[i : i - self.window] + for t in range(self.t_center, audio.shape[0], self.t_center): + opt_ts.append( + t + - self.t_query + + np.where( + np.abs(audio_sum[t - self.t_query : t + self.t_query]) + == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min(), + )[0][0], + ) + s = 0 + audio_opt = [] + t = None + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") + p_len = audio_pad.shape[0] // self.window + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + if pitch_guidance: + pitch, pitchf = self.get_f0( + audio_pad, + p_len, + f0_method, + pitch, + f0_autotune, + f0_autotune_strength, + proposed_pitch, + proposed_pitch_threshold, + ) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if self.device == "mps": + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() + for t in opt_ts: + t = t // self.window * self.window + if pitch_guidance: + audio_opt.append( + self.voice_conversion( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + pitch[:, s // self.window : (t + self.t_pad2) // self.window], + pitchf[:, s // self.window : (t + self.t_pad2) // self.window], + index, + big_npy, + 
class ConvertAudioKwArgs(TypedDict, total=False):
    """
    Keyword arguments for the `convert_audio` function.

    Declared with ``total=False``, so every key is optional. Keys are grouped
    by processing stage: each ``bool`` key (``reverb``, ``limiter``, ...) is
    followed by the parameters of that stage.
    NOTE(review): units and valid ranges of the numeric keys are not visible
    here — confirm against `convert_audio` before relying on them.
    """

    # pre-processing arguments
    formant_shifting: bool
    formant_qfrency: float
    formant_timbre: float
    # reverb post-processing arguments
    reverb: bool
    reverb_room_size: float
    reverb_damping: float
    reverb_wet_level: float
    reverb_dry_level: float
    reverb_width: float
    reverb_freeze_mode: int
    # pitch shift post-processing arguments
    pitch_shift: bool
    pitch_shift_semitones: int
    # limiter post-processing arguments
    limiter: bool
    limiter_threshold: float
    limiter_release: float
    # gain post-processing arguments
    gain: bool
    gain_db: int
    # distortion post-processing arguments
    distortion: bool
    distortion_gain: int
    # chorus post-processing arguments
    chorus: bool
    chorus_rate: float
    chorus_depth: float
    chorus_delay: int
    chorus_feedback: float
    chorus_mix: float
    # bitcrush post-processing arguments
    bitcrush: bool
    bitcrush_bit_depth: int
    # clipping post-processing arguments
    clipping: bool
    clipping_threshold: int
    # compressor post-processing arguments
    compressor: bool
    compressor_threshold: int
    compressor_ratio: int
    compressor_attack: float
    compressor_release: int
    # delay post-processing arguments
    delay: bool
    delay_seconds: float
    delay_feedback: float
    delay_mix: float
+ window_size (int, optional): Window size for relative positional encoding. Defaults to None. + heads_share (bool, optional): Whether to share relative positional embeddings across heads. Defaults to True. + block_length (int, optional): Block length for local attention. Defaults to None. + proximal_bias (bool, optional): Whether to use proximal bias in self-attention. Defaults to False. + proximal_init (bool, optional): Whether to initialize the key projection weights the same as query projection weights. Defaults to False. + + """ + + def __init__( + self, + channels: int, + out_channels: int, + n_heads: int, + p_dropout: float = 0.0, + window_size: int = None, + heads_share: bool = True, + block_length: int = None, + proximal_bias: bool = False, + proximal_init: bool = False, + ): + super().__init__() + assert ( + channels % n_heads == 0 + ), "Channels must be divisible by the number of heads." + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.k_channels = channels // n_heads + self.window_size = window_size + self.block_length = block_length + self.proximal_bias = proximal_bias + + # Define projections + self.conv_q = torch.nn.Conv1d(channels, channels, 1) + self.conv_k = torch.nn.Conv1d(channels, channels, 1) + self.conv_v = torch.nn.Conv1d(channels, channels, 1) + self.conv_o = torch.nn.Conv1d(channels, out_channels, 1) + + self.drop = torch.nn.Dropout(p_dropout) + + # Relative positional encodings + if window_size: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = torch.nn.Parameter( + torch.randn(n_heads_rel, 2 * window_size + 1, self.k_channels) + * rel_stddev, + ) + self.emb_rel_v = torch.nn.Parameter( + torch.randn(n_heads_rel, 2 * window_size + 1, self.k_channels) + * rel_stddev, + ) + + # Initialize weights + torch.nn.init.xavier_uniform_(self.conv_q.weight) + torch.nn.init.xavier_uniform_(self.conv_k.weight) + 
torch.nn.init.xavier_uniform_(self.conv_v.weight) + torch.nn.init.xavier_uniform_(self.conv_o.weight) + + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward(self, x, c, attn_mask=None): + # Compute query, key, value projections + q, k, v = self.conv_q(x), self.conv_k(c), self.conv_v(c) + + # Compute attention + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + # Final output projection + return self.conv_o(x) + + def attention(self, query, key, value, mask=None): + # Reshape and compute scaled dot-product attention + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + + if self.window_size: + assert t_s == t_t, "Relative attention only supports self-attention." + scores += self._compute_relative_scores(query, t_s) + + if self.proximal_bias: + assert t_s == t_t, "Proximal bias only supports self-attention." 
+ scores += self._attention_bias_proximal(t_s).to(scores.device, scores.dtype) + + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length: + block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + + # Apply softmax and dropout + p_attn = self.drop(torch.nn.functional.softmax(scores, dim=-1)) + + # Compute attention output + output = torch.matmul(p_attn, value) + + if self.window_size: + output += self._apply_relative_values(p_attn, t_s) + + return output.transpose(2, 3).contiguous().view(b, d, t_t), p_attn + + def _compute_relative_scores(self, query, length): + rel_emb = self._get_relative_embeddings(self.emb_rel_k, length) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), + rel_emb, + ) + return self._relative_position_to_absolute_position(rel_logits) + + def _apply_relative_values(self, p_attn, length): + rel_weights = self._absolute_position_to_relative_position(p_attn) + rel_emb = self._get_relative_embeddings(self.emb_rel_v, length) + return self._matmul_with_relative_values(rel_weights, rel_emb) + + # Helper methods + def _matmul_with_relative_values(self, x, y): + return torch.matmul(x, y.unsqueeze(0)) + + def _matmul_with_relative_keys(self, x, y): + return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + + def _get_relative_embeddings(self, embeddings, length): + pad_length = max(length - (self.window_size + 1), 0) + start = max((self.window_size + 1) - length, 0) + end = start + 2 * length - 1 + + if pad_length > 0: + embeddings = torch.nn.functional.pad( + embeddings, + convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + return embeddings[:, start:end] + + def _relative_position_to_absolute_position(self, x): + batch, heads, length, _ = x.size() + x = torch.nn.functional.pad( + x, + convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]), + ) + x_flat = x.view(batch, 
class FFN(torch.nn.Module):
    """
    Feed-forward network module.

    Two 1-D convolutions with an activation and dropout in between. The input
    is padded before each convolution so the time dimension is preserved,
    either causally (left only) or symmetrically ("same").

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        filter_channels (int): Number of filter channels in the convolution layers.
        kernel_size (int): Kernel size of the convolution layers.
        p_dropout (float, optional): Dropout probability. Defaults to 0.0.
        activation (str, optional): "gelu" selects a sigmoid-based GELU
            approximation; any other value (including None) uses ReLU.
            Defaults to None.
        causal (bool, optional): Whether to use causal padding in the convolution layers. Defaults to False.

    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        filter_channels: int,
        kernel_size: int,
        p_dropout: float = 0.0,
        activation: str = None,
        causal: bool = False,
    ):
        super().__init__()
        self.padding_fn = self._causal_padding if causal else self._same_padding

        self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = torch.nn.Dropout(p_dropout)

        self.activation = activation

    def forward(self, x, x_mask):
        """Apply the network to ``x``, masking before each conv and on output."""
        x = self.conv_1(self.padding_fn(x * x_mask))
        x = self._apply_activation(x)
        x = self.drop(x)
        x = self.conv_2(self.padding_fn(x * x_mask))
        return x * x_mask

    def _apply_activation(self, x):
        if self.activation == "gelu":
            # sigmoid approximation of GELU
            return x * torch.sigmoid(1.702 * x)
        return torch.relu(x)

    def _causal_padding(self, x):
        # All padding on the left so an output frame never sees future frames.
        pad_l, pad_r = self.conv_1.kernel_size[0] - 1, 0
        return torch.nn.functional.pad(x, (pad_l, pad_r))

    def _same_padding(self, x):
        # Total padding must equal kernel_size - 1 to keep the length. Using
        # (k - 1) // 2 on both sides loses one frame for even kernels, so the
        # extra frame goes on the right. Odd kernels are unaffected.
        kernel = self.conv_1.kernel_size[0]
        pad_l = (kernel - 1) // 2
        pad_r = kernel // 2
        return torch.nn.functional.pad(x, (pad_l, pad_r))
def slice_segments(
    x: torch.Tensor,
    ids_str: torch.Tensor,
    segment_size: int = 4,
    dim: int = 2,
):
    """
    Slice one fixed-size segment per batch item from a tensor.

    Args:
        x (torch.Tensor): The tensor to slice; batch is the first dimension.
        ids_str (torch.Tensor): The starting index of the segment for each batch item.
        segment_size (int, optional): The size of each segment. Defaults to 4.
        dim (int, optional): Number of dimensions of ``x``: 2 slices along
            dimension 1, 3 slices along dimension 2. Defaults to 2.

    Returns:
        torch.Tensor: A tensor like ``x`` whose sliced dimension has length
        ``segment_size``.

    Raises:
        ValueError: If ``dim`` is neither 2 nor 3.

    """
    # Fail early with a clear message; previously an unsupported `dim` left
    # the output buffer undefined and surfaced as an UnboundLocalError.
    if dim not in (2, 3):
        raise ValueError(f"dim must be 2 or 3, got {dim!r}")

    if dim == 2:
        ret = torch.zeros_like(x[:, :segment_size])
    else:
        ret = torch.zeros_like(x[:, :, :segment_size])

    for i in range(x.size(0)):
        idx_str = ids_str[i].item()
        idx_end = idx_str + segment_size
        if dim == 2:
            ret[i] = x[i, idx_str:idx_end]
        else:
            ret[i] = x[i, :, idx_str:idx_end]

    return ret
class MultiPeriodDiscriminator(torch.nn.Module):
    """
    Multi-period discriminator.

    This class implements a multi-period discriminator, which is used to
    discriminate between real and fake audio signals. It bundles one
    `DiscriminatorS`, one `DiscriminatorP` per period and one
    `DiscriminatorR` per resolution; the set of periods and resolutions is
    selected by ``version``.

    Args:
        use_spectral_norm (bool): Whether to use spectral normalization.
            Defaults to False.
        checkpointing (bool): When True and the module is in training mode,
            each sub-discriminator is run through
            `torch.utils.checkpoint.checkpoint`. Defaults to False.
        version (str): One of "v1", "v2" or "v3". Defaults to "v2".

    Raises:
        ValueError: If ``version`` is not one of the supported values.

    """

    def __init__(
        self,
        use_spectral_norm: bool = False,
        checkpointing: bool = False,
        version: str = "v2",
    ):
        super().__init__()

        if version == "v1":
            periods = [2, 3, 5, 7, 11, 17]
            resolutions = []
        elif version == "v2":
            periods = [2, 3, 5, 7, 11, 17, 23, 37]
            resolutions = []
        elif version == "v3":
            periods = [2, 3, 5, 7, 11]
            resolutions = [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]]
        else:
            # Previously an unknown version left `periods` unbound and failed
            # later with an UnboundLocalError that hid the real cause.
            raise ValueError(
                f"Unsupported discriminator version {version!r}; "
                "expected 'v1', 'v2' or 'v3'."
            )

        self.checkpointing = checkpointing
        self.discriminators = torch.nn.ModuleList(
            [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
            + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods]
            + [
                DiscriminatorR(r, use_spectral_norm=use_spectral_norm)
                for r in resolutions
            ],
        )

    def forward(self, y, y_hat):
        """
        Run every sub-discriminator on the real and generated signals.

        Args:
            y: Real audio batch.
            y_hat: Generated audio batch.

        Returns:
            Four lists with one entry per sub-discriminator: outputs for
            ``y``, outputs for ``y_hat``, feature maps for ``y`` and feature
            maps for ``y_hat``.

        """
        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
        for d in self.discriminators:
            if self.training and self.checkpointing:
                y_d_r, fmap_r = checkpoint(d, y, use_reentrant=False)
                y_d_g, fmap_g = checkpoint(d, y_hat, use_reentrant=False)
            else:
                y_d_r, fmap_r = d(y)
                y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorP(torch.nn.Module):
    """
    Discriminator for the long-term component.

    This class implements a discriminator for the long-term component
    of the audio signal. The input is padded to a multiple of ``period``,
    folded into a 2-D map of shape ``(frames, period)`` and passed through a
    stack of 2-D convolutions.

    Args:
        period (int): Period of the discriminator.
        kernel_size (int): Kernel size of the convolutional layers. Defaults to 5.
        stride (int): Stride of the first four convolutional layers (the last
            one always uses stride 1). Defaults to 3.
        use_spectral_norm (bool): Whether to use spectral normalization. Defaults to False.

    """

    def __init__(
        self,
        period: int,
        kernel_size: int = 5,
        stride: int = 3,
        use_spectral_norm: bool = False,
    ):
        super().__init__()
        self.period = period
        norm_f = spectral_norm if use_spectral_norm else weight_norm

        in_channels = [1, 32, 128, 512, 1024]
        out_channels = [32, 128, 512, 1024, 1024]
        # Honour the `stride` argument; it used to be accepted but ignored
        # because the strides were hard-coded to 3. The default is unchanged.
        strides = [stride, stride, stride, stride, 1]

        self.convs = torch.nn.ModuleList(
            [
                norm_f(
                    torch.nn.Conv2d(
                        in_ch,
                        out_ch,
                        (kernel_size, 1),
                        (s, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    ),
                )
                for in_ch, out_ch, s in zip(
                    in_channels, out_channels, strides, strict=False
                )
            ],
        )

        self.conv_post = norm_f(torch.nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
        self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)

    def forward(self, x):
        """
        Score a batch of waveforms.

        Args:
            x: Tensor unpacked as ``(batch, channels, time)``.

        Returns:
            A tuple ``(scores, fmap)``: the flattened output of the final
            convolution and the list of intermediate feature maps (including
            the final one).

        """
        fmap = []
        b, c, t = x.shape
        if t % self.period != 0:
            # pad the tail so the time axis can be folded into whole periods
            n_pad = self.period - (t % self.period)
            x = torch.nn.functional.pad(x, (0, n_pad), "reflect")
        x = x.view(b, c, -1, self.period)

        for conv in self.convs:
            x = self.lrelu(conv(x))
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)
        return x, fmap
class Encoder(torch.nn.Module):
    """
    Stack of self-attention + feed-forward layers with residual LayerNorm.

    Args:
        hidden_channels (int): Channel width used throughout the stack.
        filter_channels (int): Inner channel width of each feed-forward block.
        n_heads (int): Number of attention heads per layer.
        n_layers (int): How many attention/feed-forward pairs to stack.
        kernel_size (int, optional): Convolution kernel size inside the feed-forward blocks. Defaults to 1.
        p_dropout (float, optional): Dropout probability. Defaults to 0.0.
        window_size (int, optional): Window size for relative positional encoding. Defaults to 10.

    """

    def __init__(
        self,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int = 1,
        p_dropout: float = 0.0,
        window_size: int = 10,
    ):
        super().__init__()

        self.hidden_channels = hidden_channels
        self.n_layers = n_layers
        self.drop = torch.nn.Dropout(p_dropout)

        # The four lists are built one after another (not interleaved) so the
        # order in which parameters are created stays the same.
        depth = range(n_layers)
        self.attn_layers = torch.nn.ModuleList(
            MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                window_size=window_size,
            )
            for _ in depth
        )
        self.norm_layers_1 = torch.nn.ModuleList(
            LayerNorm(hidden_channels) for _ in depth
        )
        self.ffn_layers = torch.nn.ModuleList(
            FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
            )
            for _ in depth
        )
        self.norm_layers_2 = torch.nn.ModuleList(
            LayerNorm(hidden_channels) for _ in depth
        )

    def forward(self, x, x_mask):
        """Encode ``x`` under ``x_mask`` and return the masked result."""
        # pairwise mask: position i may attend to j only if both are valid
        pair_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        hidden = x * x_mask

        for layer in range(self.n_layers):
            attended = self.drop(self.attn_layers[layer](hidden, hidden, pair_mask))
            hidden = self.norm_layers_1[layer](hidden + attended)

            projected = self.drop(self.ffn_layers[layer](hidden, x_mask))
            hidden = self.norm_layers_2[layer](hidden + projected)

        return hidden * x_mask
class PosteriorEncoder(torch.nn.Module):
    """
    Posterior Encoder for inferring latent representation.

    Projects the input with a 1x1 convolution, runs it through a `WaveNet`
    stack, projects to a mean and a log-scale, and draws a sample from them.

    Args:
        in_channels (int): Number of channels in the input.
        out_channels (int): Number of channels in the output.
        hidden_channels (int): Number of hidden channels in the encoder.
        kernel_size (int): Kernel size of the convolutional layers.
        dilation_rate (int): Dilation rate of the convolutional layers.
        n_layers (int): Number of layers in the encoder.
        gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.

    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        hidden_channels: int,
        kernel_size: int,
        dilation_rate: int,
        n_layers: int,
        gin_channels: int = 0,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.pre = torch.nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = WaveNet(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        # twice the output channels: one half for the mean, one for the log-scale
        self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(
        self,
        x: torch.Tensor,
        x_lengths: torch.Tensor,
        g: torch.Tensor | None = None,
    ):
        """
        Encode ``x`` and sample a latent.

        Args:
            x: Input tensor; its size along dimension 2 is used as the mask length.
            x_lengths: Valid length of each batch item, used to build the mask.
            g: Optional conditioning tensor forwarded to the WaveNet stack.

        Returns:
            A tuple ``(z, m, logs, x_mask)``: the masked sample, the mean, the
            log-scale and the mask.

        """
        x_mask = sequence_mask(x_lengths, x.size(2)).unsqueeze(1).to(x.dtype)

        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)

        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)

        # sample = mean + noise * exp(log-scale); output is non-deterministic
        z = m + torch.randn_like(m) * torch.exp(logs)
        z *= x_mask

        return z, m, logs, x_mask

    def remove_weight_norm(self):
        """Delegate weight-norm removal to the WaveNet stack."""
        self.enc.remove_weight_norm()

    def __prepare_scriptable__(self):
        """
        Strip weight norm from ``self.enc`` when a weight-norm pre-hook is found.

        NOTE(review): presumably invoked by TorchScript before scripting —
        confirm. The check matches hooks by module path and class name.
        """
        for hook in self.enc._forward_pre_hooks.values():
            if (
                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                and hook.__class__.__name__ == "WeightNorm"
            ):
                torch.nn.utils.remove_weight_norm(self.enc)
        return self
rvc_logic.rvc.lib.algorithm.commons import init_weights +from rvc_logic.rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock + + +class HiFiGANGenerator(torch.nn.Module): + """ + HiFi-GAN Generator module for audio synthesis. + + This module implements the generator part of the HiFi-GAN architecture, + which uses transposed convolutions for upsampling and residual blocks for + refining the audio output. It can also incorporate global conditioning. + + Args: + initial_channel (int): Number of input channels to the initial convolutional layer. + resblock_kernel_sizes (list): List of kernel sizes for the residual blocks. + resblock_dilation_sizes (list): List of lists of dilation rates for the residual blocks, corresponding to each kernel size. + upsample_rates (list): List of upsampling factors for each upsampling layer. + upsample_initial_channel (int): Number of output channels from the initial convolutional layer, which is also the input to the first upsampling layer. + upsample_kernel_sizes (list): List of kernel sizes for the transposed convolutional layers used for upsampling. + gin_channels (int, optional): Number of input channels for the global conditioning. If 0, no global conditioning is used. Defaults to 0. 
+ + """ + + def __init__( + self, + initial_channel: int, + resblock_kernel_sizes: list, + resblock_dilation_sizes: list, + upsample_rates: list, + upsample_initial_channel: int, + upsample_kernel_sizes: list, + gin_channels: int = 0, + ): + super().__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = torch.nn.Conv1d( + initial_channel, + upsample_initial_channel, + 7, + 1, + padding=3, + ) + + self.ups = torch.nn.ModuleList() + self.resblocks = torch.nn.ModuleList() + + for i, (u, k) in enumerate( + zip(upsample_rates, upsample_kernel_sizes, strict=False), + ): + self.ups.append( + weight_norm( + torch.nn.ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ), + ), + ) + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes, strict=False), + ): + self.resblocks.append(ResBlock(ch, k, d)) + + self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x: torch.Tensor, g: torch.Tensor | None = None): + # new tensor + x = self.conv_pre(x) + + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + # in-place call + x = torch.nn.functional.leaky_relu(x) + x = self.conv_post(x) + # in-place call + x = torch.tanh(x) + + return x + + def __prepare_scriptable__(self): + for l in self.ups_and_resblocks: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == 
"torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGenerator(torch.nn.Module): + """ + Sine wave generator with optional harmonic overtones and noise. + + This module generates sine waves for a fundamental frequency and its harmonics. + It can also add Gaussian noise and apply a voiced/unvoiced mask. + + Args: + sampling_rate (int): The sampling rate of the audio in Hz. + num_harmonics (int, optional): The number of harmonic overtones to generate. Defaults to 0. + sine_amplitude (float, optional): The amplitude of the sine wave components. Defaults to 0.1. + noise_stddev (float, optional): The standard deviation of the additive Gaussian noise. Defaults to 0.003. + voiced_threshold (float, optional): The threshold for the fundamental frequency (F0) to determine if a frame is voiced. Defaults to 0.0. + + """ + + def __init__( + self, + sampling_rate: int, + num_harmonics: int = 0, + sine_amplitude: float = 0.1, + noise_stddev: float = 0.003, + voiced_threshold: float = 0.0, + ): + super().__init__() + self.sampling_rate = sampling_rate + self.num_harmonics = num_harmonics + self.sine_amplitude = sine_amplitude + self.noise_stddev = noise_stddev + self.voiced_threshold = voiced_threshold + self.waveform_dim = self.num_harmonics + 1 # fundamental + harmonics + + def _compute_voiced_unvoiced(self, f0: torch.Tensor): + """ + Generates a binary mask indicating voiced/unvoiced frames based on the fundamental frequency. + + Args: + f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length). + + """ + uv_mask = (f0 > self.voiced_threshold).float() + return uv_mask + + def _generate_sine_wave(self, f0: torch.Tensor, upsampling_factor: int): + """ + Generates sine waves for the fundamental frequency and its harmonics. 
+ + Args: + f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length, 1). + upsampling_factor (int): The factor by which to upsample the sine wave. + + """ + batch_size, length, _ = f0.shape + + # Create an upsampling grid + upsampling_grid = torch.arange( + 1, + upsampling_factor + 1, + dtype=f0.dtype, + device=f0.device, + ) + + # Calculate phase increments + phase_increments = (f0 / self.sampling_rate) * upsampling_grid + phase_remainder = torch.fmod(phase_increments[:, :-1, -1:] + 0.5, 1.0) - 0.5 + cumulative_phase = phase_remainder.cumsum(dim=1).fmod(1.0).to(f0.dtype) + phase_increments += torch.nn.functional.pad( + cumulative_phase, + (0, 0, 1, 0), + mode="constant", + ) + + # Reshape to match the sine wave shape + phase_increments = phase_increments.reshape(batch_size, -1, 1) + + # Scale for harmonics + harmonic_scale = torch.arange( + 1, + self.waveform_dim + 1, + dtype=f0.dtype, + device=f0.device, + ).reshape(1, 1, -1) + phase_increments *= harmonic_scale + + # Add random phase offset (except for the fundamental) + random_phase = torch.rand(1, 1, self.waveform_dim, device=f0.device) + random_phase[..., 0] = 0 # Fundamental frequency has no random offset + phase_increments += random_phase + + # Generate sine waves + sine_waves = torch.sin(2 * np.pi * phase_increments) + return sine_waves + + def forward(self, f0: torch.Tensor, upsampling_factor: int): + with torch.no_grad(): + # Expand `f0` to include waveform dimensions + f0 = f0.unsqueeze(-1) + + # Generate sine waves + sine_waves = ( + self._generate_sine_wave(f0, upsampling_factor) * self.sine_amplitude + ) + + # Compute voiced/unvoiced mask + voiced_mask = self._compute_voiced_unvoiced(f0) + + # Upsample voiced/unvoiced mask + voiced_mask = torch.nn.functional.interpolate( + voiced_mask.transpose(2, 1), + scale_factor=float(upsampling_factor), + mode="nearest", + ).transpose(2, 1) + + # Compute noise amplitude + noise_amplitude = voiced_mask * self.noise_stddev + (1 - voiced_mask) 
* ( + self.sine_amplitude / 3 + ) + + # Add Gaussian noise + noise = noise_amplitude * torch.randn_like(sine_waves) + + # Combine sine waves and noise + sine_waveforms = sine_waves * voiced_mask + noise + + return sine_waveforms, voiced_mask, noise +oise diff --git a/rvc_logic/rvc/lib/algorithm/generators/hifigan_mrf.py b/rvc_logic/rvc/lib/algorithm/generators/hifigan_mrf.py new file mode 100644 index 0000000000000000000000000000000000000000..1841a9339f0e9f007281c2fda49f705cb1ac367a --- /dev/null +++ b/rvc_logic/rvc/lib/algorithm/generators/hifigan_mrf.py @@ -0,0 +1,411 @@ +from typing import Optional + +import math + +import numpy as np + +import torch +from torch.nn.utils import remove_weight_norm +from torch.nn.utils.parametrizations import weight_norm +from torch.utils.checkpoint import checkpoint + +LRELU_SLOPE = 0.1 + + +class MRFLayer(torch.nn.Module): + """ + A single layer of the Multi-Receptive Field (MRF) block. + + This layer consists of two 1D convolutional layers with weight normalization + and Leaky ReLU activation in between. The first convolution has a dilation, + while the second has a dilation of 1. A skip connection is added from the input + to the output. + + Args: + channels (int): The number of input and output channels. + kernel_size (int): The kernel size of the convolutional layers. + dilation (int): The dilation rate for the first convolutional layer. 
+ + """ + + def __init__(self, channels, kernel_size, dilation): + super().__init__() + self.conv1 = weight_norm( + torch.nn.Conv1d( + channels, + channels, + kernel_size, + padding=(kernel_size * dilation - dilation) // 2, + dilation=dilation, + ), + ) + self.conv2 = weight_norm( + torch.nn.Conv1d( + channels, + channels, + kernel_size, + padding=kernel_size // 2, + dilation=1, + ), + ) + + def forward(self, x: torch.Tensor): + y = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) + y = self.conv1(y) + y = torch.nn.functional.leaky_relu(y, LRELU_SLOPE) + y = self.conv2(y) + return x + y + + def remove_weight_norm(self): + remove_weight_norm(self.conv1) + remove_weight_norm(self.conv2) + + +class MRFBlock(torch.nn.Module): + """ + A Multi-Receptive Field (MRF) block. + + This block consists of multiple MRFLayers with different dilation rates. + It applies each layer sequentially to the input. + + Args: + channels (int): The number of input and output channels for the MRFLayers. + kernel_size (int): The kernel size for the convolutional layers in the MRFLayers. + dilations (list[int]): A list of dilation rates for the MRFLayers. + + """ + + def __init__(self, channels, kernel_size, dilations): + super().__init__() + self.layers = torch.nn.ModuleList() + for dilation in dilations: + self.layers.append(MRFLayer(channels, kernel_size, dilation)) + + def forward(self, x: torch.Tensor): + for layer in self.layers: + x = layer(x) + return x + + def remove_weight_norm(self): + for layer in self.layers: + layer.remove_weight_norm() + + +class SineGenerator(torch.nn.Module): + """ + Definition of sine generator + + Generates sine waveforms with optional harmonics and additive noise. + Can be used to create harmonic noise source for neural vocoders. + + Args: + samp_rate (int): Sampling rate in Hz. + harmonic_num (int): Number of harmonic overtones (default 0). + sine_amp (float): Amplitude of sine-waveform (default 0.1). 
+ noise_std (float): Standard deviation of Gaussian noise (default 0.003). + voiced_threshold (float): F0 threshold for voiced/unvoiced classification (default 0). + + """ + + def __init__( + self, + samp_rate: int, + harmonic_num: int = 0, + sine_amp: float = 0.1, + noise_std: float = 0.003, + voiced_threshold: float = 0, + ): + super().__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0: torch.Tensor): + """ + Generates voiced/unvoiced (UV) signal based on the fundamental frequency (F0). + + Args: + f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length, 1). + + """ + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def _f02sine(self, f0_values: torch.Tensor): + """ + Generates sine waveforms based on the fundamental frequency (F0) and its harmonics. + + Args: + f0_values (torch.Tensor): Tensor of fundamental frequency and its harmonics, + shape (batch_size, length, dim), where dim indicates + the fundamental tone and overtones. + + """ + # convert to F0 in rad. 
The integer part n can be ignored + # because 2 * np.pi * n doesn't affect phase + rad_values = (f0_values / self.sampling_rate) % 1 + + # initial phase noise (no noise for fundamental component) + rand_ini = torch.rand( + f0_values.shape[0], + f0_values.shape[2], + device=f0_values.device, + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + + # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) + tmp_over_one = torch.cumsum(rad_values, 1) % 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + + sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi) + + return sines + + def forward(self, f0: torch.Tensor): + with torch.no_grad(): + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2) + + sine_waves = self._f02sine(f0_buf) * self.sine_amp + + uv = self._f02uv(f0) + + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """ + Generates harmonic and noise source features. + + This module uses the SineGenerator to create harmonic signals based on the + fundamental frequency (F0) and merges them into a single excitation signal. + + Args: + sample_rate (int): Sampling rate in Hz. + harmonic_num (int, optional): Number of harmonics above F0. Defaults to 0. + sine_amp (float, optional): Amplitude of sine source signal. Defaults to 0.1. + add_noise_std (float, optional): Standard deviation of additive Gaussian noise. Defaults to 0.003. + voiced_threshod (float, optional): Threshold to set voiced/unvoiced given F0. Defaults to 0. 
+ + """ + + def __init__( + self, + sampling_rate: int, + harmonic_num: int = 0, + sine_amp: float = 0.1, + add_noise_std: float = 0.003, + voiced_threshold: float = 0, + ): + super().__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + + # to produce sine waveforms + self.l_sin_gen = SineGenerator( + sampling_rate, + harmonic_num, + sine_amp, + add_noise_std, + voiced_threshold, + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x: torch.Tensor): + sine_wavs, uv, _ = self.l_sin_gen(x) + sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + + return sine_merge, None, None + + +class HiFiGANMRFGenerator(torch.nn.Module): + """ + HiFi-GAN generator with Multi-Receptive Field (MRF) blocks. + + This generator takes an input feature sequence and fundamental frequency (F0) + as input and generates an audio waveform. It utilizes transposed convolutions + for upsampling and MRF blocks for feature refinement. It can also condition + on global conditioning features. + + Args: + in_channel (int): Number of input channels. + upsample_initial_channel (int): Number of channels after the initial convolution. + upsample_rates (list[int]): List of upsampling rates for the transposed convolutions. + upsample_kernel_sizes (list[int]): List of kernel sizes for the transposed convolutions. + resblock_kernel_sizes (list[int]): List of kernel sizes for the convolutional layers in the MRF blocks. + resblock_dilations (list[list[int]]): List of lists of dilation rates for the MRF blocks. + gin_channels (int): Number of global conditioning input channels (0 if no global conditioning). + sample_rate (int): Sampling rate of the audio. + harmonic_num (int): Number of harmonics to generate. + checkpointing (bool): Whether to use checkpointing to save memory during training (default: False). 
+ + """ + + def __init__( + self, + in_channel: int, + upsample_initial_channel: int, + upsample_rates: list[int], + upsample_kernel_sizes: list[int], + resblock_kernel_sizes: list[int], + resblock_dilations: list[list[int]], + gin_channels: int, + sample_rate: int, + harmonic_num: int, + checkpointing: bool = False, + ): + super().__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.checkpointing = checkpointing + + self.f0_upsample = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF(sample_rate, harmonic_num) + + self.conv_pre = weight_norm( + torch.nn.Conv1d( + in_channel, + upsample_initial_channel, + kernel_size=7, + stride=1, + padding=3, + ), + ) + self.upsamples = torch.nn.ModuleList() + self.noise_convs = torch.nn.ModuleList() + + stride_f0s = [ + math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1 + for i in range(len(upsample_rates)) + ] + + for i, (u, k) in enumerate( + zip(upsample_rates, upsample_kernel_sizes, strict=False), + ): + # handling odd upsampling rates + if u % 2 == 0: + # old method + padding = (k - u) // 2 + else: + padding = u // 2 + u % 2 + + self.upsamples.append( + weight_norm( + torch.nn.ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + kernel_size=k, + stride=u, + padding=padding, + output_padding=u % 2, + ), + ), + ) + """ handling odd upsampling rates + # s k p + # 40 80 20 + # 32 64 16 + # 4 8 2 + # 2 3 1 + # 63 125 31 + # 9 17 4 + # 3 5 1 + # 1 1 0 + """ + stride = stride_f0s[i] + kernel = 1 if stride == 1 else stride * 2 - stride % 2 + padding = 0 if stride == 1 else (kernel - stride) // 2 + + self.noise_convs.append( + torch.nn.Conv1d( + 1, + upsample_initial_channel // (2 ** (i + 1)), + kernel_size=kernel, + stride=stride, + padding=padding, + ), + ) + self.mrfs = torch.nn.ModuleList() + for i in range(len(self.upsamples)): + channel = upsample_initial_channel // (2 ** (i + 1)) + self.mrfs.append( + 
torch.nn.ModuleList( + [ + MRFBlock(channel, kernel_size=k, dilations=d) + for k, d in zip( + resblock_kernel_sizes, + resblock_dilations, + strict=False, + ) + ], + ), + ) + self.conv_post = weight_norm( + torch.nn.Conv1d(channel, 1, kernel_size=7, stride=1, padding=3), + ) + if gin_channels != 0: + self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward( + self, + x: torch.Tensor, + f0: torch.Tensor, + g: torch.Tensor | None = None, + ): + f0 = self.f0_upsample(f0[:, None, :]).transpose(-1, -2) + har_source, _, _ = self.m_source(f0) + har_source = har_source.transpose(-1, -2) + x = self.conv_pre(x) + + if g is not None: + x = x + self.cond(g) + + for ups, mrf, noise_conv in zip( + self.upsamples, + self.mrfs, + self.noise_convs, + strict=False, + ): + x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) + + if self.training and self.checkpointing: + x = checkpoint(ups, x, use_reentrant=False) + x = x + noise_conv(har_source) + xs = sum([checkpoint(layer, x, use_reentrant=False) for layer in mrf]) + else: + x = ups(x) + x = x + noise_conv(har_source) + xs = sum([layer(x) for layer in mrf]) + x = xs / self.num_kernels + + x = torch.nn.functional.leaky_relu(x) + x = torch.tanh(self.conv_post(x)) + + return x + + def remove_weight_norm(self): + remove_weight_norm(self.conv_pre) + for up in self.upsamples: + remove_weight_norm(up) + for mrf in self.mrfs: + mrf.remove_weight_norm() + remove_weight_norm(self.conv_post) diff --git a/rvc_logic/rvc/lib/algorithm/generators/hifigan_nsf.py b/rvc_logic/rvc/lib/algorithm/generators/hifigan_nsf.py new file mode 100644 index 0000000000000000000000000000000000000000..b7cf7231ad9323120eb0ff2c8cf7f38aae0fe4f6 --- /dev/null +++ b/rvc_logic/rvc/lib/algorithm/generators/hifigan_nsf.py @@ -0,0 +1,258 @@ +from typing import Optional + +import math + +import torch +from torch.nn.utils import remove_weight_norm +from torch.nn.utils.parametrizations import weight_norm +from torch.utils.checkpoint import 
checkpoint + +from rvc_logic.rvc.lib.algorithm.commons import init_weights +from rvc_logic.rvc.lib.algorithm.generators.hifigan import SineGenerator +from rvc_logic.rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock + + +class SourceModuleHnNSF(torch.nn.Module): + """ + Source Module for generating harmonic and noise components for audio synthesis. + + This module generates a harmonic source signal using sine waves and adds + optional noise. It's often used in neural vocoders as a source of excitation. + + Args: + sample_rate (int): Sampling rate of the audio in Hz. + harmonic_num (int, optional): Number of harmonic overtones to generate above the fundamental frequency (F0). Defaults to 0. + sine_amp (float, optional): Amplitude of the sine wave components. Defaults to 0.1. + add_noise_std (float, optional): Standard deviation of the additive white Gaussian noise. Defaults to 0.003. + voiced_threshod (float, optional): Threshold for the fundamental frequency (F0) to determine if a frame is voiced. If F0 is below this threshold, it's considered unvoiced. Defaults to 0. + + """ + + def __init__( + self, + sample_rate: int, + harmonic_num: int = 0, + sine_amp: float = 0.1, + add_noise_std: float = 0.003, + voiced_threshod: float = 0, + ): + super().__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + + self.l_sin_gen = SineGenerator( + sample_rate, + harmonic_num, + sine_amp, + add_noise_std, + voiced_threshod, + ) + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x: torch.Tensor, upsample_factor: int = 1): + sine_wavs, uv, _ = self.l_sin_gen(x, upsample_factor) + sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None + + +class HiFiGANNSFGenerator(torch.nn.Module): + """ + Generator module based on the Neural Source Filter (NSF) architecture. 
+ + This generator synthesizes audio by first generating a source excitation signal + (harmonic and noise) and then filtering it through a series of upsampling and + residual blocks. Global conditioning can be applied to influence the generation. + + Args: + initial_channel (int): Number of input channels to the initial convolutional layer. + resblock_kernel_sizes (list): List of kernel sizes for the residual blocks. + resblock_dilation_sizes (list): List of lists of dilation rates for the residual blocks, corresponding to each kernel size. + upsample_rates (list): List of upsampling factors for each upsampling layer. + upsample_initial_channel (int): Number of output channels from the initial convolutional layer, which is also the input to the first upsampling layer. + upsample_kernel_sizes (list): List of kernel sizes for the transposed convolutional layers used for upsampling. + gin_channels (int): Number of input channels for the global conditioning. If 0, no global conditioning is used. + sr (int): Sampling rate of the audio. + checkpointing (bool, optional): Whether to use gradient checkpointing to save memory during training. Defaults to False. 
+ + """ + + def __init__( + self, + initial_channel: int, + resblock_kernel_sizes: list, + resblock_dilation_sizes: list, + upsample_rates: list, + upsample_initial_channel: int, + upsample_kernel_sizes: list, + gin_channels: int, + sr: int, + checkpointing: bool = False, + ): + super().__init__() + + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.checkpointing = checkpointing + self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF(sample_rate=sr, harmonic_num=0) + + self.conv_pre = torch.nn.Conv1d( + initial_channel, + upsample_initial_channel, + 7, + 1, + padding=3, + ) + + self.ups = torch.nn.ModuleList() + self.noise_convs = torch.nn.ModuleList() + + channels = [ + upsample_initial_channel // (2 ** (i + 1)) + for i in range(len(upsample_rates)) + ] + stride_f0s = [ + math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1 + for i in range(len(upsample_rates)) + ] + + for i, (u, k) in enumerate( + zip(upsample_rates, upsample_kernel_sizes, strict=False), + ): + # handling odd upsampling rates + if u % 2 == 0: + # old method + padding = (k - u) // 2 + else: + padding = u // 2 + u % 2 + + self.ups.append( + weight_norm( + torch.nn.ConvTranspose1d( + upsample_initial_channel // (2**i), + channels[i], + k, + u, + padding=padding, + output_padding=u % 2, + ), + ), + ) + """ handling odd upsampling rates + # s k p + # 40 80 20 + # 32 64 16 + # 4 8 2 + # 2 3 1 + # 63 125 31 + # 9 17 4 + # 3 5 1 + # 1 1 0 + """ + stride = stride_f0s[i] + kernel = 1 if stride == 1 else stride * 2 - stride % 2 + padding = 0 if stride == 1 else (kernel - stride) // 2 + + self.noise_convs.append( + torch.nn.Conv1d( + 1, + channels[i], + kernel_size=kernel, + stride=stride, + padding=padding, + ), + ) + + self.resblocks = torch.nn.ModuleList( + [ + ResBlock(channels[i], k, d) + for i in range(len(self.ups)) + for k, d in zip( + resblock_kernel_sizes, + 
resblock_dilation_sizes, + strict=False, + ) + ], + ) + + self.conv_post = torch.nn.Conv1d(channels[-1], 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = math.prod(upsample_rates) + self.lrelu_slope = LRELU_SLOPE + + def forward( + self, + x: torch.Tensor, + f0: torch.Tensor, + g: torch.Tensor | None = None, + ): + har_source, _, _ = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + # new tensor + x = self.conv_pre(x) + + if g is not None: + x = x + self.cond(g) + + for i, (ups, noise_convs) in enumerate( + zip(self.ups, self.noise_convs, strict=False), + ): + x = torch.nn.functional.leaky_relu(x, self.lrelu_slope) + # Apply upsampling layer + if self.training and self.checkpointing: + x = checkpoint(ups, x, use_reentrant=False) + x = x + noise_convs(har_source) + xs = sum( + [ + checkpoint(resblock, x, use_reentrant=False) + for j, resblock in enumerate(self.resblocks) + if j in range(i * self.num_kernels, (i + 1) * self.num_kernels) + ], + ) + else: + x = ups(x) + x = x + noise_convs(har_source) + xs = sum( + [ + resblock(x) + for j, resblock in enumerate(self.resblocks) + if j in range(i * self.num_kernels, (i + 1) * self.num_kernels) + ], + ) + x = xs / self.num_kernels + + x = torch.nn.functional.leaky_relu(x) + x = torch.tanh(self.conv_post(x)) + + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + def __prepare_scriptable__(self): + for l in self.ups: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + remove_weight_norm(l) + for l in self.resblocks: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == 
"WeightNorm" + ): + remove_weight_norm(l) + return self +rn self diff --git a/rvc_logic/rvc/lib/algorithm/generators/refinegan.py b/rvc_logic/rvc/lib/algorithm/generators/refinegan.py new file mode 100644 index 0000000000000000000000000000000000000000..b49343cfd042cc1edfad54875542583381137526 --- /dev/null +++ b/rvc_logic/rvc/lib/algorithm/generators/refinegan.py @@ -0,0 +1,462 @@ +import numpy as np + +import torch +import torchaudio +from torch import nn +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm +from torch.nn.utils.parametrizations import weight_norm +from torch.utils.checkpoint import checkpoint + +from rvc_logic.rvc.lib.algorithm.commons import get_padding, init_weights + + +class ResBlock(nn.Module): + """ + Residual block with multiple dilated convolutions. + + This block applies a sequence of dilated convolutional layers with Leaky ReLU activation. + It's designed to capture information at different scales due to the varying dilation rates. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int, optional): Kernel size for the convolutional layers. Defaults to 7. + dilation (tuple[int], optional): Tuple of dilation rates for the convolutional layers. Defaults to (1, 3, 5). + leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation. Defaults to 0.2. 
+ + """ + + def __init__( + self, + channels: int, + kernel_size: int = 7, + dilation: tuple[int] = (1, 3, 5), + leaky_relu_slope: float = 0.2, + ): + super().__init__() + + self.leaky_relu_slope = leaky_relu_slope + + self.convs1 = nn.ModuleList( + [ + weight_norm( + nn.Conv1d( + channels, + channels, + kernel_size, + stride=1, + dilation=d, + padding=get_padding(kernel_size, d), + ), + ) + for d in dilation + ], + ) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + nn.Conv1d( + channels, + channels, + kernel_size, + stride=1, + dilation=1, + padding=get_padding(kernel_size, 1), + ), + ) + for d in dilation + ], + ) + self.convs2.apply(init_weights) + + def forward(self, x: torch.Tensor): + for c1, c2 in zip(self.convs1, self.convs2, strict=False): + xt = F.leaky_relu(x, self.leaky_relu_slope) + xt = c1(xt) + xt = F.leaky_relu(xt, self.leaky_relu_slope) + xt = c2(xt) + x = xt + x + + return x + + def remove_weight_norm(self): + for c1, c2 in zip(self.convs1, self.convs2, strict=False): + remove_weight_norm(c1) + remove_weight_norm(c2) + + +class AdaIN(nn.Module): + """ + Adaptive Instance Normalization layer. + + This layer applies a scaling factor to the input based on a learnable weight. + + Args: + channels (int): Number of input channels. + leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation applied after scaling. Defaults to 0.2. 
+ + """ + + def __init__( + self, + *, + channels: int, + leaky_relu_slope: float = 0.2, + ): + super().__init__() + + self.weight = nn.Parameter(torch.ones(channels) * 1e-4) + # safe to use in-place as it is used on a new x+gaussian tensor + self.activation = nn.LeakyReLU(leaky_relu_slope) + + def forward(self, x: torch.Tensor): + gaussian = torch.randn_like(x) * self.weight[None, :, None] + + return self.activation(x + gaussian) + + +class ParallelResBlock(nn.Module): + """ + Parallel residual block that applies multiple residual blocks with different kernel sizes in parallel. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_sizes (tuple[int], optional): Tuple of kernel sizes for the parallel residual blocks. Defaults to (3, 7, 11). + dilation (tuple[int], optional): Tuple of dilation rates for the convolutional layers within the residual blocks. Defaults to (1, 3, 5). + leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation. Defaults to 0.2. 
class SineGenerator(nn.Module):
    """
    Sine-based excitation generator.

    Builds a sine wave for the fundamental and ``harmonic_num`` overtones from
    an F0 track, adds Gaussian noise, and merges all components into a single
    channel with a bias-free linear layer followed by ``tanh``.

    Args:
        samp_rate (int): Sampling rate in Hz.
        harmonic_num (int): Number of harmonic overtones (default 0).
        sine_amp (float): Amplitude of the sine waveform (default 0.1).
        noise_std (float): Standard deviation of the noise added to voiced frames (default 0.003).
        voiced_threshold (float): F0 values above this are treated as voiced (default 0).

    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
    ):
        super().__init__()
        self.sampling_rate = samp_rate
        self.harmonic_num = harmonic_num
        # One component for the fundamental plus one per overtone.
        self.dim = self.harmonic_num + 1
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.voiced_threshold = voiced_threshold

        # Learnable mix of all components down to one channel.
        self.merge = nn.Sequential(
            nn.Linear(self.dim, 1, bias=False),
            nn.Tanh(),
        )

    def _f02uv(self, f0):
        """Return a mask shaped like ``f0``: 1 where ``f0 > voiced_threshold``, else 0."""
        is_voiced = f0 > self.voiced_threshold
        return torch.ones_like(f0) * is_voiced

    def _f02sine(self, f0_values):
        """
        Turn per-component frequencies into sine waves.

        f0_values: (batchsize, length, dim)
            where dim indicates fundamental tone and overtones
        """
        n_batch, _, n_components = f0_values.shape

        # Phase increment per sample in cycles; the integer part is dropped
        # because whole cycles do not change the phase.
        phase_step = (f0_values / self.sampling_rate) % 1

        # Random initial phase for every overtone, none for the fundamental.
        initial_phase = torch.rand(n_batch, n_components, device=f0_values.device)
        initial_phase[:, 0] = 0
        phase_step[:, 0, :] += initial_phase

        # Find the samples where the accumulated phase wraps past one cycle
        # and subtract that cycle, so the running sum below stays small.
        wrapped_phase = torch.cumsum(phase_step, 1) % 1
        has_wrapped = torch.diff(wrapped_phase, dim=1) < 0
        wrap_correction = torch.zeros_like(phase_step)
        wrap_correction[:, 1:, :] = has_wrapped * -1.0

        accumulated = torch.cumsum(phase_step + wrap_correction, dim=1)
        return torch.sin(accumulated * 2 * np.pi)

    def forward(self, f0):
        """
        Generate the merged excitation signal.

        ``f0`` is indexed as ``f0[:, :, 0]``, i.e. (batch, length, 1). The
        result of the merge layer has shape (batch, length, 1). Only the merge
        layer runs with gradients enabled.
        """
        with torch.no_grad():
            n_batch, n_frames = f0.shape[0], f0.shape[1]
            components = torch.zeros(n_batch, n_frames, self.dim, device=f0.device)

            # Column 0 is the fundamental; column k holds (k + 1) * F0.
            fundamental = f0[:, :, 0]
            components[:, :, 0] = fundamental
            for column in range(1, self.harmonic_num + 1):
                components[:, :, column] = components[:, :, 0] * (column + 1)

            sine_waves = self._f02sine(components) * self.sine_amp

            voiced = self._f02uv(f0)

            # Voiced frames get low-level noise; unvoiced frames get louder
            # noise (sine_amp / 3) and, below, no sine at all.
            noise_scale = voiced * self.noise_std + (1 - voiced) * self.sine_amp / 3
            noise = noise_scale * torch.randn_like(sine_waves)

            sine_waves = sine_waves * voiced + noise

        # merge with grad
        return self.merge(sine_waves)
+ + """ + + def __init__( + self, + *, + sample_rate: int = 44100, + downsample_rates: tuple[int] = (2, 2, 8, 8), # unused + upsample_rates: tuple[int] = (8, 8, 2, 2), + leaky_relu_slope: float = 0.2, + num_mels: int = 128, + start_channels: int = 16, # unused + gin_channels: int = 256, + checkpointing: bool = False, + upsample_initial_channel=512, + ): + super().__init__() + self.upsample_rates = upsample_rates + self.leaky_relu_slope = leaky_relu_slope + self.checkpointing = checkpointing + + self.upp = np.prod(upsample_rates) + self.m_source = SineGenerator(sample_rate) + + # expanded f0 sinegen -> match mel_conv + # (8, 1, 17280) -> (8, 16, 17280) + self.pre_conv = weight_norm( + nn.Conv1d(1, 16, 7, 1, padding=3), + ) + + # (8, 16, 17280) = 4th upscale + # (8, 32, 8640) = 3rd upscale + # (8, 64, 4320) = 2nd upscale + # (8, 128, 432) = 1st upscale + # (8, 256, 36) merged to mel + + # f0 downsampling and upchanneling + channels = start_channels + size = self.upp + self.downsample_blocks = nn.ModuleList([]) + self.df0 = [] + for i, u in enumerate(upsample_rates): + + new_size = int(size / upsample_rates[-i - 1]) + # T dimension factors for torchaudio.functional.resample + self.df0.append([size, new_size]) + size = new_size + + new_channels = channels * 2 + self.downsample_blocks.append( + weight_norm( + nn.Conv1d( + channels, + new_channels, + 7, + 1, + padding=3, + ), + ), + ) + channels = new_channels + + # mel handling + channels = upsample_initial_channel + + self.mel_conv = weight_norm( + nn.Conv1d(num_mels, channels // 2, 7, 1, padding=3), + ) + + self.mel_conv.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(256, channels // 2, 1) + + self.upsample_blocks = nn.ModuleList([]) + self.upsample_conv_blocks = nn.ModuleList([]) + + for rate in upsample_rates: + new_channels = channels // 2 + + self.upsample_blocks.append(nn.Upsample(scale_factor=rate, mode="linear")) + + self.upsample_conv_blocks.append( + ParallelResBlock( + 
in_channels=channels + channels // 4, + out_channels=new_channels, + kernel_sizes=(3, 7, 11), + dilation=(1, 3, 5), + leaky_relu_slope=leaky_relu_slope, + ), + ) + + channels = new_channels + + self.conv_post = weight_norm( + nn.Conv1d(channels, 1, 7, 1, padding=3, bias=False), + ) + self.conv_post.apply(init_weights) + + def forward(self, mel: torch.Tensor, f0: torch.Tensor, g: torch.Tensor = None): + f0_size = mel.shape[-1] + # change f0 helper to full size + f0 = F.interpolate( + f0.unsqueeze(1), + size=f0_size * self.upp, + mode="linear", + ) + # get f0 turned into sines harmonics + har_source = self.m_source(f0.transpose(1, 2)).transpose(1, 2) + # prepare for fusion to mel + x = self.pre_conv(har_source) + # downsampled/upchanneled versions for each upscale + downs = [] + for block, (old_size, new_size) in zip(self.downsample_blocks, self.df0): + x = F.leaky_relu(x, self.leaky_relu_slope) + downs.append(x) + # attempt to cancel spectral aliasing + x = torchaudio.functional.resample( + x.contiguous(), + orig_freq=int(f0_size * old_size), + new_freq=int(f0_size * new_size), + lowpass_filter_width=64, + rolloff=0.9475937167399596, + resampling_method="sinc_interp_kaiser", + beta=14.769656459379492, + ) + x = block(x) + + # expanding spectrogram from 192 to 256 channels + mel = self.mel_conv(mel) + + if g is not None: + # adding expanded speaker embedding + mel = mel + self.cond(g) + x = torch.cat([mel, x], dim=1) + + for ups, res, down in zip( + self.upsample_blocks, + self.upsample_conv_blocks, + reversed(downs), + strict=False, + ): + x = F.leaky_relu(x, self.leaky_relu_slope) + + if self.training and self.checkpointing: + x = checkpoint(ups, x, use_reentrant=False) + x = torch.cat([x, down], dim=1) + x = checkpoint(res, x, use_reentrant=False) + else: + x = ups(x) + x = torch.cat([x, down], dim=1) + x = res(x) + + x = F.leaky_relu(x, self.leaky_relu_slope) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + 
remove_weight_norm(self.pre_conv) + remove_weight_norm(self.mel_conv) + remove_weight_norm(self.conv_post) + + for block in self.downsample_blocks: + block.remove_weight_norm() + + for block in self.upsample_conv_blocks: + block.remove_weight_norm() +) diff --git a/rvc_logic/rvc/lib/algorithm/modules.py b/rvc_logic/rvc/lib/algorithm/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..06b0f50dc324935e86e18c6b3dfa4b9cf9962d37 --- /dev/null +++ b/rvc_logic/rvc/lib/algorithm/modules.py @@ -0,0 +1,120 @@ +import torch + +from rvc_logic.rvc.lib.algorithm.commons import fused_add_tanh_sigmoid_multiply + + +class WaveNet(torch.nn.Module): + """ + WaveNet residual blocks as used in WaveGlow. + + Args: + hidden_channels (int): Number of hidden channels. + kernel_size (int): Size of the convolutional kernel. + dilation_rate (int): Dilation rate of the convolution. + n_layers (int): Number of convolutional layers. + gin_channels (int, optional): Number of conditioning channels. Defaults to 0. + p_dropout (float, optional): Dropout probability. Defaults to 0. + + """ + + def __init__( + self, + hidden_channels: int, + kernel_size: int, + dilation_rate, + n_layers: int, + gin_channels: int = 0, + p_dropout: int = 0, + ): + super().__init__() + assert kernel_size % 2 == 1, "Kernel size must be odd for proper padding." 
+ + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + self.n_channels_tensor = torch.IntTensor([hidden_channels]) # Static tensor + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = torch.nn.Dropout(p_dropout) + + # Conditional layer for global conditioning + if gin_channels: + self.cond_layer = torch.nn.utils.parametrizations.weight_norm( + torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1), + name="weight", + ) + + # Precompute dilations and paddings + dilations = [dilation_rate**i for i in range(n_layers)] + paddings = [(kernel_size * d - d) // 2 for d in dilations] + + # Initialize layers + for i in range(n_layers): + self.in_layers.append( + torch.nn.utils.parametrizations.weight_norm( + torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilations[i], + padding=paddings[i], + ), + name="weight", + ), + ) + + res_skip_channels = ( + hidden_channels if i == n_layers - 1 else 2 * hidden_channels + ) + self.res_skip_layers.append( + torch.nn.utils.parametrizations.weight_norm( + torch.nn.Conv1d(hidden_channels, res_skip_channels, 1), + name="weight", + ), + ) + + def forward(self, x, x_mask, g=None): + output = x.clone().zero_() + + # Apply conditional layer if global conditioning is provided + g = self.cond_layer(g) if g is not None else None + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + g_l = ( + g[ + :, + i * 2 * self.hidden_channels : (i + 1) * 2 * self.hidden_channels, + :, + ] + if g is not None + else 0 + ) + + # Activation with fused Tanh-Sigmoid + acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, self.n_channels_tensor) + acts = self.drop(acts) + + # Residual and skip connections + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, 
: self.hidden_channels, :] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for layer in self.in_layers: + torch.nn.utils.remove_weight_norm(layer) + for layer in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(layer) +) diff --git a/rvc_logic/rvc/lib/algorithm/normalization.py b/rvc_logic/rvc/lib/algorithm/normalization.py new file mode 100644 index 0000000000000000000000000000000000000000..c95f4ef76551d94a1a540797f65edd39d6c989de --- /dev/null +++ b/rvc_logic/rvc/lib/algorithm/normalization.py @@ -0,0 +1,31 @@ +import torch + + +class LayerNorm(torch.nn.Module): + """ + Layer normalization module. + + Args: + channels (int): Number of channels. + eps (float, optional): Epsilon value for numerical stability. Defaults to 1e-5. + + """ + + def __init__(self, channels: int, eps: float = 1e-5): + super().__init__() + self.eps = eps + self.gamma = torch.nn.Parameter(torch.ones(channels)) + self.beta = torch.nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + # Transpose to (batch_size, time_steps, channels) for layer_norm + x = x.transpose(1, -1) + x = torch.nn.functional.layer_norm( + x, + (x.size(-1),), + self.gamma, + self.beta, + self.eps, + ) + # Transpose back to (batch_size, channels, time_steps) + return x.transpose(1, -1) diff --git a/rvc_logic/rvc/lib/algorithm/residuals.py b/rvc_logic/rvc/lib/algorithm/residuals.py new file mode 100644 index 0000000000000000000000000000000000000000..aa7c66bebd13bdfff97ee55f010b37cfb6853ca5 --- /dev/null +++ b/rvc_logic/rvc/lib/algorithm/residuals.py @@ -0,0 +1,271 @@ +from typing import Optional, Tuple + +from itertools import chain + +import torch +from torch.nn.utils import remove_weight_norm +from torch.nn.utils.parametrizations import weight_norm + +from 
rvc_logic.rvc.lib.algorithm.commons import get_padding, init_weights +from rvc_logic.rvc.lib.algorithm.modules import WaveNet + +LRELU_SLOPE = 0.1 + + +def create_conv1d_layer(channels, kernel_size, dilation): + return weight_norm( + torch.nn.Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation, + padding=get_padding(kernel_size, dilation), + ), + ) + + +def apply_mask(tensor: torch.Tensor, mask: torch.Tensor | None): + return tensor * mask if mask else tensor + + +def apply_mask_(tensor: torch.Tensor, mask: torch.Tensor | None): + return tensor.mul_(mask) if mask else tensor + + +class ResBlock(torch.nn.Module): + """ + A residual block module that applies a series of 1D convolutional layers with residual connections. + """ + + def __init__( + self, + channels: int, + kernel_size: int = 3, + dilations: tuple[int] = (1, 3, 5), + ): + """ + Initializes the ResBlock. + + Args: + channels (int): Number of input and output channels for the convolution layers. + kernel_size (int): Size of the convolution kernel. Defaults to 3. + dilations (Tuple[int]): Tuple of dilation rates for the convolution layers in the first set. + + """ + super().__init__() + # Create convolutional layers with specified dilations and initialize weights + self.convs1 = self._create_convs(channels, kernel_size, dilations) + self.convs2 = self._create_convs(channels, kernel_size, [1] * len(dilations)) + + @staticmethod + def _create_convs(channels: int, kernel_size: int, dilations: tuple[int]): + """ + Creates a list of 1D convolutional layers with specified dilations. + + Args: + channels (int): Number of input and output channels for the convolution layers. + kernel_size (int): Size of the convolution kernel. + dilations (Tuple[int]): Tuple of dilation rates for each convolution layer. 
class Flip(torch.nn.Module):
    """
    Flip module for flow-based models.

    Reverses the order of the entries along dim 1 of the input. The operation
    only permutes entries, so the log-determinant it reports is zero.
    """

    def forward(self, x, *args, reverse=False, **kwargs):
        """
        Flip ``x`` along dim 1.

        Extra positional and keyword arguments are accepted and ignored.
        Returns ``(flipped, logdet)`` when ``reverse`` is false, where
        ``logdet`` is a zero vector with one entry per batch element, and
        only ``flipped`` when ``reverse`` is true.
        """
        flipped = x.flip([1])
        if reverse:
            return flipped
        # new_zeros inherits dtype and device from the flipped tensor.
        return flipped, flipped.new_zeros((flipped.size(0),))
class ResidualCouplingLayer(torch.nn.Module):
    """
    Residual coupling layer for flow-based models.

    The input is split in two halves along dim 1. The first half passes
    through unchanged and drives a WaveNet that predicts a shift (and, unless
    ``mean_only``, a log-scale) applied to the second half.

    Args:
        channels (int): Number of channels; must be even.
        hidden_channels (int): Number of hidden channels.
        kernel_size (int): Size of the convolutional kernel.
        dilation_rate (int): Dilation rate of the convolution.
        n_layers (int): Number of convolutional layers.
        p_dropout (float, optional): Dropout probability. Defaults to 0.
        gin_channels (int, optional): Number of conditioning channels. Defaults to 0.
        mean_only (bool, optional): Predict only the shift and use a log-scale of zero. Defaults to False.

    """

    def __init__(
        self,
        channels: int,
        hidden_channels: int,
        kernel_size: int,
        dilation_rate: int,
        n_layers: int,
        p_dropout: float = 0,
        gin_channels: int = 0,
        mean_only: bool = False,
    ):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        half = channels // 2

        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = half
        self.mean_only = mean_only

        self.pre = torch.nn.Conv1d(half, hidden_channels, 1)
        self.enc = WaveNet(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            p_dropout=p_dropout,
            gin_channels=gin_channels,
        )

        # One output group (shift) when mean_only, two (shift and log-scale) otherwise.
        stats_channels = half if mean_only else 2 * half
        self.post = torch.nn.Conv1d(hidden_channels, stats_channels, 1)
        # A zeroed projection makes the freshly built layer an identity map.
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        g: torch.Tensor | None = None,
        reverse: bool = False,
    ):
        """
        Apply the coupling transform, or its inverse when ``reverse`` is true.

        Returns ``(x, logdet)`` in the forward direction, where ``logdet`` is
        the log-scale summed over dims 1 and 2, and only ``x`` in the reverse
        direction.
        """
        half = self.half_channels
        x0, x1 = torch.split(x, [half, half], 1)

        hidden = self.enc(self.pre(x0) * x_mask, x_mask, g=g)
        stats = self.post(hidden) * x_mask

        if self.mean_only:
            m, logs = stats, torch.zeros_like(stats)
        else:
            m, logs = torch.split(stats, [half, half], 1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            return torch.cat([x0, x1], 1)

        x1 = m + x1 * torch.exp(logs) * x_mask
        return torch.cat([x0, x1], 1), torch.sum(logs, [1, 2])

    def remove_weight_norm(self):
        """Remove weight normalisation from the inner WaveNet."""
        self.enc.remove_weight_norm()
class Synthesizer(torch.nn.Module):
    """
    Base Synthesizer model.

    Combines a text/phone encoder (``enc_p``), a posterior encoder
    (``enc_q``), a residual-coupling flow (``flow``), a speaker embedding
    (``emb_g``) and a vocoder/decoder (``dec``) selected by ``vocoder`` and
    ``use_f0``.

    Args:
        spec_channels (int): Number of channels in the spectrogram.
        segment_size (int): Size of the audio segment sliced during training.
        inter_channels (int): Number of channels in the intermediate layers.
        hidden_channels (int): Number of channels in the hidden layers.
        filter_channels (int): Number of channels in the filter layers.
        n_heads (int): Number of attention heads.
        n_layers (int): Number of layers in the encoder.
        kernel_size (int): Size of the convolution kernel.
        p_dropout (float): Dropout probability.
        resblock (str): Type of residual block (not read by this constructor).
        resblock_kernel_sizes (list): Kernel sizes for the residual blocks.
        resblock_dilation_sizes (list): Dilation sizes for the residual blocks.
        upsample_rates (list): Upsampling rates for the decoder.
        upsample_initial_channel (int): Number of channels in the initial upsampling layer.
        upsample_kernel_sizes (list): Kernel sizes for the upsampling layers.
        spk_embed_dim (int): Number of entries in the speaker embedding table.
        gin_channels (int): Number of channels in the global conditioning vector.
        sr (int): Sampling rate of the audio.
        use_f0 (bool): Whether to use F0 information.
        text_enc_hidden_dim (int): Hidden dimension for the text encoder.
        vocoder (str): ``"MRF HiFi-GAN"``, ``"RefineGAN"``, or anything else for
            the HiFi-GAN decoders. The first two require ``use_f0``; without
            it ``dec`` is set to ``None``.
        randomized (bool): Train on random slices (True) or on full sequences.
        checkpointing (bool): Forwarded to the decoders that accept it.
        kwargs: Additional keyword arguments (ignored).

    """

    def __init__(
        self,
        spec_channels: int,
        segment_size: int,
        inter_channels: int,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int,
        p_dropout: float,
        resblock: str,
        resblock_kernel_sizes: list,
        resblock_dilation_sizes: list,
        upsample_rates: list,
        upsample_initial_channel: int,
        upsample_kernel_sizes: list,
        spk_embed_dim: int,
        gin_channels: int,
        sr: int,
        use_f0: bool,
        text_enc_hidden_dim: int = 768,
        vocoder: str = "HiFi-GAN",
        randomized: bool = True,
        checkpointing: bool = False,
        **kwargs,
    ):
        super().__init__()
        self.segment_size = segment_size
        self.use_f0 = use_f0
        self.randomized = randomized

        self.enc_p = TextEncoder(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            text_enc_hidden_dim,
            f0=use_f0,
        )
        logger.info("Using %s vocoder", vocoder)
        if use_f0:
            if vocoder == "MRF HiFi-GAN":
                self.dec = HiFiGANMRFGenerator(
                    in_channel=inter_channels,
                    upsample_initial_channel=upsample_initial_channel,
                    upsample_rates=upsample_rates,
                    upsample_kernel_sizes=upsample_kernel_sizes,
                    resblock_kernel_sizes=resblock_kernel_sizes,
                    resblock_dilations=resblock_dilation_sizes,
                    gin_channels=gin_channels,
                    sample_rate=sr,
                    harmonic_num=8,
                    checkpointing=checkpointing,
                )
            elif vocoder == "RefineGAN":
                self.dec = RefineGANGenerator(
                    sample_rate=sr,
                    downsample_rates=upsample_rates[::-1],
                    upsample_rates=upsample_rates,
                    start_channels=16,
                    num_mels=inter_channels,
                    checkpointing=checkpointing,
                )
            else:
                self.dec = HiFiGANNSFGenerator(
                    inter_channels,
                    resblock_kernel_sizes,
                    resblock_dilation_sizes,
                    upsample_rates,
                    upsample_initial_channel,
                    upsample_kernel_sizes,
                    gin_channels=gin_channels,
                    sr=sr,
                    checkpointing=checkpointing,
                )
        elif vocoder == "MRF HiFi-GAN":
            # Reported through the module logger, like the message above.
            logger.warning(
                "MRF HiFi-GAN does not support training without pitch guidance.",
            )
            self.dec = None
        elif vocoder == "RefineGAN":
            logger.warning(
                "RefineGAN does not support training without pitch guidance.",
            )
            self.dec = None
        else:
            self.dec = HiFiGANGenerator(
                inter_channels,
                resblock_kernel_sizes,
                resblock_dilation_sizes,
                upsample_rates,
                upsample_initial_channel,
                upsample_kernel_sizes,
                gin_channels=gin_channels,
            )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels,
            hidden_channels,
            5,
            1,
            3,
            gin_channels=gin_channels,
        )
        self.emb_g = torch.nn.Embedding(spk_embed_dim, gin_channels)

    def _remove_weight_norm_from(self, module):
        """
        Remove a hook-based ``WeightNorm`` from ``module`` if one is registered.

        ``module`` may be ``None`` (``dec`` is ``None`` for the unsupported
        vocoder/F0 combinations); that case is a no-op. Only the module's own
        forward pre-hooks are inspected.
        """
        if module is None:
            return
        # Iterate over a snapshot: removing the weight norm deletes the hook
        # from the very dict being scanned.
        for hook in list(module._forward_pre_hooks.values()):
            if type(hook).__name__ == "WeightNorm":
                torch.nn.utils.remove_weight_norm(module)
                break

    def remove_weight_norm(self):
        """Remove weight normalisation from the decoder, flow and posterior encoder."""
        for module in [self.dec, self.flow, self.enc_q]:
            self._remove_weight_norm_from(module)

    def __prepare_scriptable__(self):
        """Strip weight normalisation and return ``self``."""
        self.remove_weight_norm()
        return self

    def forward(
        self,
        phone: torch.Tensor,
        phone_lengths: torch.Tensor,
        pitch: torch.Tensor | None = None,
        pitchf: torch.Tensor | None = None,
        y: torch.Tensor | None = None,
        y_lengths: torch.Tensor | None = None,
        ds: torch.Tensor | None = None,
    ):
        """
        Training forward pass.

        Args:
            phone: Input to the text encoder.
            phone_lengths: Lengths passed to the text encoder.
            pitch: Pitch input passed to the text encoder.
            pitchf: Pitch input passed to the decoder when ``use_f0`` is set.
            y: Input to the posterior encoder; when ``None`` only the text
                encoder runs.
            y_lengths: Lengths passed to the posterior encoder.
            ds: Speaker ids looked up in ``emb_g``.

        Returns:
            ``(o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q))``.
            ``ids_slice`` is ``None`` when ``randomized`` is false; when ``y``
            is ``None`` every entry that depends on it is ``None``.

        """
        g = self.emb_g(ds).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)

        if y is not None:
            z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
            z_p = self.flow(z, y_mask, g=g)
            # regular old training method using random slices
            if self.randomized:
                z_slice, ids_slice = rand_slice_segments(
                    z,
                    y_lengths,
                    self.segment_size,
                )
                if self.use_f0:
                    pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2)
                    o = self.dec(z_slice, pitchf, g=g)
                else:
                    o = self.dec(z_slice, g=g)
                return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
            # future use for finetuning using the entire dataset each pass
            if self.use_f0:
                o = self.dec(z, pitchf, g=g)
            else:
                o = self.dec(z, g=g)
            return o, None, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
        return None, None, x_mask, None, (None, None, m_p, logs_p, None, None)

    @torch.jit.export
    def infer(
        self,
        phone: torch.Tensor,
        phone_lengths: torch.Tensor,
        pitch: torch.Tensor | None = None,
        nsff0: torch.Tensor | None = None,
        sid: torch.Tensor = None,
        rate: torch.Tensor | None = None,
    ):
        """
        Inference of the model.

        Args:
            phone (torch.Tensor): Phoneme sequence.
            phone_lengths (torch.Tensor): Lengths of the phoneme sequences.
            pitch (torch.Tensor, optional): Pitch sequence.
            nsff0 (torch.Tensor, optional): Fine-grained pitch sequence.
            sid (torch.Tensor): Speaker ids looked up in ``emb_g``.
            rate (torch.Tensor, optional): When given, the leading
                ``1 - rate`` fraction of frames is dropped before decoding.

        Returns:
            ``(o, x_mask, (z, z_p, m_p, logs_p))``.

        """
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        # Sample from the prior; the noise is scaled down by a fixed 0.66666.
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask

        if rate is not None:
            head = int(z_p.shape[2] * (1.0 - rate.item()))
            z_p, x_mask = z_p[:, :, head:], x_mask[:, :, head:]
            if self.use_f0 and nsff0 is not None:
                nsff0 = nsff0[:, head:]

        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = (
            self.dec(z * x_mask, nsff0, g=g)
            if self.use_f0
            else self.dec(z * x_mask, g=g)
        )

        return o, x_mask, (z, z_p, m_p, logs_p)
RMVPE +from rvc_logic.rvc.lib.predictors.RMVPE import RMVPE0Predictor + +config = Config() + + +@dataclasses.dataclass +class F0Extractor: + wav_path: pathlib.Path + sample_rate: int = 44100 + hop_length: int = 512 + f0_min: int = 50 + f0_max: int = 1600 + method: str = "rmvpe" + x: np.ndarray = dataclasses.field(init=False) + + def __post_init__(self): + self.x, self.sample_rate = librosa.load(self.wav_path, sr=self.sample_rate) + + @property + def hop_size(self): + return self.hop_length / self.sample_rate + + @property + def wav16k(self): + return resampy.resample(self.x, self.sample_rate, 16000) + + def extract_f0(self): + f0 = None + method = self.method + if method == "crepe": + wav16k_torch = torch.FloatTensor(self.wav16k).unsqueeze(0).to(config.device) + f0 = torchcrepe.predict( + wav16k_torch, + sample_rate=16000, + hop_length=160, + batch_size=512, + fmin=self.f0_min, + fmax=self.f0_max, + device=config.device, + ) + f0 = f0[0].cpu().numpy() + elif method == "fcpe": + audio = librosa.to_mono(self.x) + audio_length = len(audio) + f0_target_length = (audio_length // self.hop_length) + 1 + audio = ( + torch.from_numpy(audio) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .to(config.device) + ) + model = torchfcpe.spawn_bundled_infer_model(device=config.device) + + f0 = model.infer( + audio, + sr=self.sample_rate, + decoder_mode="local_argmax", + threshold=0.006, + f0_min=self.f0_min, + f0_max=self.f0_max, + interp_uv=False, + output_interp_target_length=f0_target_length, + ) + f0 = f0.squeeze().cpu().numpy() + elif method == "rmvpe": + model_rmvpe = RMVPE0Predictor( + os.path.join(str(RVC_MODELS_DIR), "predictors", "rmvpe.pt"), + device=config.device, + ) + f0 = model_rmvpe.infer_from_audio(self.wav16k, thred=0.03) + + else: + raise ValueError(f"Unknown method: {self.method}") + return self.hz_to_cents(f0, librosa.midi_to_hz(0)) + + def plot_f0(self, f0): + from matplotlib import pyplot as plt + + plt.figure(figsize=(10, 4)) + plt.plot(f0) + 
def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
    """
    Load an audio file into a normalised 1-D float tensor.

    Only the first channel is kept. Samples are divided by a magnitude chosen
    from the data (see below) and optionally resampled to ``target_sr``.

    Args:
        full_path: Path of the file to read.
        target_sr: Desired sampling rate; ``None`` keeps the file's own rate.
        return_empty_on_exception: When true, a read failure or non-finite
            samples yield ``([], sample_rate)`` instead of raising / returning
            bad data.

    Returns:
        ``(data, sample_rate)``. On the empty fallback ``data`` is ``[]`` and
        the rate is the file's rate if it was read, else ``target_sr``, else
        48000.

    Raises:
        Exception: Whatever the reader raised, when
            ``return_empty_on_exception`` is false.

    """
    try:
        data, sample_rate = sf.read(full_path, always_2d=True)
    except Exception as error:
        print(f"An error occurred loading {full_path}: {error}")
        if return_empty_on_exception:
            # The read failed, so no sample rate was ever obtained from the
            # file; fall back to the requested rate (or 48 kHz).
            return [], target_sr or 48000
        raise

    data = data[:, 0] if len(data.shape) > 1 else data
    assert len(data) > 2

    # Normalize data: integer input uses the dtype's full range, float input
    # its largest absolute sample.
    max_mag = (
        -np.iinfo(data.dtype).min
        if np.issubdtype(data.dtype, np.integer)
        else max(np.amax(data), -np.amin(data))
    )
    # Snap the magnitude to a 32-bit scale, a 16-bit scale, or 1.0 (already
    # within roughly [-1, 1]).
    max_mag = (
        (2**31) + 1 if max_mag > (2**15) else ((2**15) + 1 if max_mag > 1.01 else 1.0)
    )
    data = torch.FloatTensor(data.astype(np.float32)) / max_mag

    # Handle exceptions and resample
    if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:
        return [], sample_rate or target_sr or 48000
    if target_sr is not None and sample_rate != target_sr:
        data = torch.from_numpy(
            librosa.core.resample(
                data.numpy(),
                orig_sr=sample_rate,
                target_sr=target_sr,
            ),
        )
        sample_rate = target_sr

    return data, sample_rate
torch.hann_window(win_size_new).to(y.device) + + # Padding and STFT + pad_left = (win_size_new - hop_length_new) // 2 + pad_right = max( + (win_size_new - hop_length_new + 1) // 2, + win_size_new - y.size(-1) - pad_left, + ) + mode = "reflect" if pad_right < y.size(-1) else "constant" + y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode) + y = y.squeeze(1) + + spec = torch.stft( + y, + n_fft=n_fft_new, + hop_length=hop_length_new, + win_length=win_size_new, + window=hann_window[keyshift_key], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9)) + + # Handle keyshift and mel conversion + if keyshift != 0: + size = n_fft // 2 + 1 + resize = spec.size(1) + spec = ( + F.pad(spec, (0, 0, 0, size - resize)) + if resize < size + else spec[:, :size, :] + ) + spec = spec * win_size / win_size_new + spec = torch.matmul(mel_basis[mel_basis_key], spec) + spec = dynamic_range_compression_torch(spec, clip_val=clip_val) + return spec + + def __call__(self, audiopath): + audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr) + spect = self.get_mel(audio.unsqueeze(0)).squeeze(0) + return spect + + +stft = STFT() + + +def softmax_kernel( + data, + *, + projection_matrix, + is_query, + normalize_data=True, + eps=1e-4, + device=None, +): + b, h, *_ = data.shape + + # Normalize data + data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0 + + # Project data + ratio = projection_matrix.shape[0] ** -0.5 + projection = repeat(projection_matrix, "j d -> b h j d", b=b, h=h) + projection = projection.type_as(data) + data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), projection) + + # Calculate diagonal data + diag_data = data**2 + diag_data = torch.sum(diag_data, dim=-1) + diag_data = (diag_data / 2.0) * (data_normalizer**2) + diag_data = diag_data.unsqueeze(dim=-1) + + # Apply softmax + if is_query: + 
def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None):
    """Draw a random ``cols x cols`` matrix with orthonormal rows.

    A Gaussian matrix is sampled on ``device``, QR-decomposed on the CPU, and
    the factors are moved back to ``device``. With ``qr_uniform_q`` the
    columns of Q are multiplied by the signs of R's diagonal before the
    transpose is returned.
    """
    gaussian = torch.randn((cols, cols), device=device)
    ortho, upper = torch.linalg.qr(gaussian.cpu(), mode="reduced")
    ortho = ortho.to(device)
    upper = upper.to(device)

    if not qr_uniform_q:
        return ortho.t()

    diag_signs = torch.diag(upper, 0).sign()
    ortho *= diag_signs
    return ortho.t()
def linear_attention(q, k, v):
    """Non-causal linear attention built from einsum contractions.

    When ``v`` is ``None`` the raw query/key similarity is returned.
    Otherwise the keys are summed over the sequence axis to form the
    normalizer, the key/value context is accumulated once, and the queries
    read from that context.
    """
    if v is None:
        return torch.einsum("...ed,...nd->...ne", k, q)

    key_totals = k.sum(dim=-2).type_as(q)
    # 1e-8 keeps the division finite when a query has no key mass.
    normalizer = 1.0 / (torch.einsum("...nd,...d->...n", q, key_totals) + 1e-8)
    kv_context = torch.einsum("...nd,...ne->...de", k, v)
    return torch.einsum("...de,...nd,...n->...ne", kv_context, q, normalizer)
class FastAttention(nn.Module):
    """Linear attention over random-feature (or plain softmax) queries and keys.

    Args:
        dim_heads: Width of one attention head; also the column count of the
            random projection matrix.
        nb_features: Row count of the projection matrix. Defaults to
            ``int(dim_heads * log(dim_heads))``.
        ortho_scaling: Passed as ``scaling`` to
            ``gaussian_orthogonal_random_matrix``.
        causal: Stored flag. This module ships no causal kernel, so
            ``forward`` raises ``NotImplementedError`` when it is set.
        generalized_attention: Stored on the instance; ``forward`` does not
            read it.
        kernel_fn: Stored on the instance; ``forward`` does not read it.
            NOTE(review): the default is a single ``nn.ReLU()`` instance
            created at definition time and shared by every default-built
            module.
        qr_uniform_q: Passed through to the projection builder.
        no_projection: When true, ``forward`` skips the random projection and
            applies softmax to ``q`` (last axis) and ``k`` (sequence axis).
    """

    def __init__(
        self,
        dim_heads,
        nb_features=None,
        ortho_scaling=0,
        causal=False,
        generalized_attention=False,
        kernel_fn=nn.ReLU(),
        qr_uniform_q=False,
        no_projection=False,
    ):
        super().__init__()
        nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))

        self.dim_heads = dim_heads
        self.nb_features = nb_features
        self.ortho_scaling = ortho_scaling

        # Kept as a partial so redraw_projection_matrix can resample with the
        # same settings later.
        self.create_projection = partial(
            gaussian_orthogonal_random_matrix,
            nb_rows=self.nb_features,
            nb_columns=dim_heads,
            scaling=ortho_scaling,
            qr_uniform_q=qr_uniform_q,
        )
        projection_matrix = self.create_projection()
        self.register_buffer("projection_matrix", projection_matrix)

        self.generalized_attention = generalized_attention
        self.kernel_fn = kernel_fn
        self.no_projection = no_projection
        self.causal = causal

    @torch.no_grad()
    def redraw_projection_matrix(self):
        """Resample the random projection and copy it into the buffer in place."""
        projections = self.create_projection()
        self.projection_matrix.copy_(projections)
        del projections

    def forward(self, q, k, v):
        """Apply linear attention to ``q``, ``k`` and ``v``.

        ``v`` may be ``None``, in which case ``linear_attention`` returns the
        query/key similarity instead of attended values.

        Raises:
            NotImplementedError: If the module was built with ``causal=True``.
        """
        if self.causal:
            # The previous code looked up self.causal_linear_fn, which is not
            # defined anywhere on this class, and died with AttributeError.
            raise NotImplementedError(
                "FastAttention: causal linear attention is not implemented",
            )

        if self.no_projection:
            q = q.softmax(dim=-1)
            k = k.softmax(dim=-2)
        else:
            create_kernel = partial(
                softmax_kernel,
                projection_matrix=self.projection_matrix,
                device=q.device,
            )
            q = create_kernel(q, is_query=True)
            k = create_kernel(k, is_query=False)

        # linear_attention handles v=None itself, so no separate branch is needed.
        return linear_attention(q, k, v)
def l2_regularization(model, l2_alpha):
    """Return ``l2_alpha`` times the summed half squared weights of Conv2d layers.

    Only modules whose exact type is ``nn.Conv2d`` contribute (subclasses are
    not matched). With no such module the sum is the integer ``0``.
    """
    penalties = [
        (layer.weight**2).sum() / 2.0
        for layer in model.modules()
        if type(layer) is nn.Conv2d
    ]
    return l2_alpha * sum(penalties)
(loss_grad1_mse is not None) else False + self.loss_grad1_mse_scale = ( + loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1 + ) + self.f0_max = f0_max if (f0_max is not None) else 1975.5 + self.f0_min = f0_min if (f0_min is not None) else 32.70 + self.confidence = confidence if (confidence is not None) else False + self.threshold = threshold if (threshold is not None) else 0.05 + self.use_input_conv = use_input_conv if (use_input_conv is not None) else True + + self.cent_table_b = torch.Tensor( + np.linspace( + self.f0_to_cent(torch.Tensor([f0_min]))[0], + self.f0_to_cent(torch.Tensor([f0_max]))[0], + out_dims, + ), + ) + self.register_buffer("cent_table", self.cent_table_b) + + # conv in stack + _leaky = nn.LeakyReLU() + self.stack = nn.Sequential( + nn.Conv1d(input_channel, n_chans, 3, 1, 1), + nn.GroupNorm(4, n_chans), + _leaky, + nn.Conv1d(n_chans, n_chans, 3, 1, 1), + ) + + # transformer + self.decoder = PCmer( + num_layers=n_layers, + num_heads=8, + dim_model=n_chans, + dim_keys=n_chans, + dim_values=n_chans, + residual_dropout=0.1, + attention_dropout=0.1, + ) + self.norm = nn.LayerNorm(n_chans) + + # out + self.n_out = out_dims + self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out)) + + def forward( + self, + mel, + infer=True, + gt_f0=None, + return_hz_f0=False, + cdecoder="local_argmax", + ): + if cdecoder == "argmax": + self.cdecoder = self.cents_decoder + elif cdecoder == "local_argmax": + self.cdecoder = self.cents_local_decoder + + x = ( + self.stack(mel.transpose(1, 2)).transpose(1, 2) + if self.use_input_conv + else mel + ) + x = self.decoder(x) + x = self.norm(x) + x = self.dense_out(x) + x = torch.sigmoid(x) + + if not infer: + gt_cent_f0 = self.f0_to_cent(gt_f0) + gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0) + loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, gt_cent_f0) + if self.loss_l2_regularization: + loss_all = loss_all + l2_regularization( + model=self, + l2_alpha=self.loss_l2_regularization_scale, + 
class FCPEInfer:
    """Load an FCPE checkpoint and expose it as a callable F0 estimator.

    The checkpoint is expected to hold a ``"config"`` mapping (with ``model``
    and ``loss`` sections) and a ``"model"`` state dict.
    """

    def __init__(self, model_path, device=None, dtype=torch.float32):
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.dtype = dtype

        # NOTE(review): weights_only=False makes torch.load unpickle arbitrary
        # objects; only point this at checkpoints from a trusted source.
        checkpoint = torch.load(
            model_path,
            map_location=torch.device(self.device),
            weights_only=False,
        )
        self.args = DotDict(checkpoint["config"])
        model_cfg = self.args.model
        loss_cfg = self.args.loss

        network = FCPE(
            input_channel=model_cfg.input_channel,
            out_dims=model_cfg.out_dims,
            n_layers=model_cfg.n_layers,
            n_chans=model_cfg.n_chans,
            use_siren=model_cfg.use_siren,
            use_full=model_cfg.use_full,
            loss_mse_scale=loss_cfg.loss_mse_scale,
            loss_l2_regularization=loss_cfg.loss_l2_regularization,
            loss_l2_regularization_scale=loss_cfg.loss_l2_regularization_scale,
            loss_grad1_mse=loss_cfg.loss_grad1_mse,
            loss_grad1_mse_scale=loss_cfg.loss_grad1_mse_scale,
            f0_max=model_cfg.f0_max,
            f0_min=model_cfg.f0_min,
            confidence=model_cfg.confidence,
        )
        network.to(self.device)
        network.to(self.dtype)
        network.load_state_dict(checkpoint["model"])
        network.eval()

        self.model = network
        self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device)

    @torch.no_grad()
    def __call__(self, audio, sr, threshold=0.05):
        """Estimate F0 in Hz for ``audio`` sampled at ``sr``.

        ``threshold`` is written onto the model before inference; a leading
        batch axis is added to ``audio`` before the mel transform.
        """
        self.model.threshold = threshold
        batched = audio[None, :]
        mel = self.wav2mel(audio=batched, sample_rate=sr).to(self.dtype)
        return self.model(mel=mel, infer=True, return_hz_f0=True)
class DotDict(dict):
    """Dictionary whose items can also be read, set and deleted as attributes.

    Reading a missing attribute yields ``None`` (it goes through
    ``dict.get``). A value that is exactly a plain ``dict`` is wrapped in a
    fresh ``DotDict`` on every read, so nested sections support dotted access.
    """

    def __getattr__(*args):
        # Invoked as (self, name); forwarded untouched to dict.get.
        looked_up = dict.get(*args)
        if type(looked_up) is not dict:
            return looked_up
        return DotDict(looked_up)

    # Attribute assignment / deletion map straight onto item operations.
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__
isinstance(content, np.ndarray) + content = torch.from_numpy(content) if is_np else content + results = torch.nn.functional.interpolate(content, size=target_len, mode=mode) + results = results.numpy() if is_np else results + return results[0, 0] if ndim == 1 else results[0] if ndim == 2 else results + + def post_process(self, x, sample_rate, f0, pad_to): + f0 = ( + torch.from_numpy(f0).float().to(x.device) + if isinstance(f0, np.ndarray) + else f0 + ) + f0 = self.repeat_expand(f0, pad_to) if pad_to is not None else f0 + + vuv_vector = torch.zeros_like(f0) + vuv_vector[f0 > 0.0] = 1.0 + vuv_vector[f0 <= 0.0] = 0.0 + + nzindex = torch.nonzero(f0).squeeze() + f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() + time_org = self.hop_length / sample_rate * nzindex.cpu().numpy() + time_frame = np.arange(pad_to) * self.hop_length / sample_rate + + vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0] + + if f0.shape[0] <= 0: + return np.zeros(pad_to), vuv_vector.cpu().numpy() + if f0.shape[0] == 1: + return np.ones(pad_to) * f0[0], vuv_vector.cpu().numpy() + + f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1]) + return f0, vuv_vector.cpu().numpy() + + def compute_f0(self, wav, p_len=None): + x = torch.FloatTensor(wav).to(self.dtype).to(self.device) + p_len = x.shape[0] // self.hop_length if p_len is None else p_len + f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] + if torch.all(f0 == 0): + return f0.cpu().numpy() if p_len is None else np.zeros(p_len) + return self.post_process(x, self.sample_rate, f0, p_len)[0] + + def compute_f0_uv(self, wav, p_len=None): + x = torch.FloatTensor(wav).to(self.dtype).to(self.device) + p_len = x.shape[0] // self.hop_length if p_len is None else p_len + f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] + if torch.all(f0 == 0): + return f0.cpu().numpy() if p_len is None else np.zeros(p_len), ( + f0.cpu().numpy() if p_len is None else 
class ConvBlockRes(nn.Module):
    """
    Two 3x3 conv / batch-norm / ReLU stages wrapped in a residual connection.

    When the channel counts differ, the skip path goes through a 1x1
    convolution so it can be added to the main path; otherwise the input is
    added unchanged.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        momentum (float): Momentum for batch normalization.

    """

    def __init__(self, in_channels, out_channels, momentum=0.01):
        super().__init__()
        stages = []
        # First stage maps in -> out channels, second stage keeps out -> out.
        for stage_in in (in_channels, out_channels):
            stages.extend(
                [
                    nn.Conv2d(
                        in_channels=stage_in,
                        out_channels=out_channels,
                        kernel_size=(3, 3),
                        stride=(1, 1),
                        padding=(1, 1),
                        bias=False,
                    ),
                    nn.BatchNorm2d(out_channels, momentum=momentum),
                    nn.ReLU(),
                ],
            )
        self.conv = nn.Sequential(*stages)

        self.is_shortcut = in_channels != out_channels
        if self.is_shortcut:
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))

    def forward(self, x):
        main_path = self.conv(x)
        skip_path = self.shortcut(x) if self.is_shortcut else x
        return main_path + skip_path
+ + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + n_blocks=1, + momentum=0.01, + ): + super().__init__() + self.n_blocks = n_blocks + self.conv = nn.ModuleList() + self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) + for _ in range(n_blocks - 1): + self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) + self.kernel_size = kernel_size + if self.kernel_size is not None: + self.pool = nn.AvgPool2d(kernel_size=kernel_size) + + def forward(self, x): + for i in range(self.n_blocks): + x = self.conv[i](x) + if self.kernel_size is not None: + return x, self.pool(x) + return x + + +class Encoder(nn.Module): + """ + The encoder part of the DeepUnet. + + Args: + in_channels (int): Number of input channels. + in_size (int): Size of the input tensor. + n_encoders (int): Number of encoder blocks. + kernel_size (tuple): Size of the average pooling kernel. + n_blocks (int): Number of convolutional blocks in each encoder block. + out_channels (int): Number of output channels for the first encoder block. + momentum (float): Momentum for batch normalization. 
class Intermediate(nn.Module):
    """
    Bottleneck stack of pool-free residual encoder blocks.

    The first block maps ``in_channels`` to ``out_channels``; every further
    block keeps ``out_channels``. Blocks are built without a pooling kernel,
    so each one returns a single tensor.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        n_inters (int): Number of blocks applied in the forward pass.
        n_blocks (int): Number of convolutional blocks in each intermediate block.
        momentum (float): Momentum for batch normalization.

    """

    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
        super().__init__()
        self.n_inters = n_inters
        # One entry per block: the first consumes in_channels, the rest out_channels.
        block_inputs = [in_channels] + [out_channels] * (n_inters - 1)
        self.layers = nn.ModuleList(
            [
                ResEncoderBlock(width, out_channels, None, n_blocks, momentum)
                for width in block_inputs
            ],
        )

    def forward(self, x):
        for _, block in zip(range(self.n_inters), self.layers):
            x = block(x)
        return x
class Decoder(nn.Module):
    """
    Upsampling half of the DeepUnet.

    Each decoder block halves the channel count and is fed the matching
    encoder skip tensor, taken from the end of ``concat_tensors`` backwards.

    Args:
        in_channels (int): Number of input channels.
        n_decoders (int): Number of decoder blocks.
        stride (tuple): Stride for transposed convolution.
        n_blocks (int): Number of convolutional blocks in each decoder block.
        momentum (float): Momentum for batch normalization.

    """

    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
        super().__init__()
        self.n_decoders = n_decoders
        width = in_channels
        blocks = []
        for _ in range(n_decoders):
            halved = width // 2
            blocks.append(ResDecoderBlock(width, halved, stride, n_blocks, momentum))
            width = halved
        self.layers = nn.ModuleList(blocks)

    def forward(self, x, concat_tensors):
        for depth, block in zip(range(self.n_decoders), self.layers):
            # Deepest skip connection first: index -1, then -2, ...
            x = block(x, concat_tensors[-1 - depth])
        return x
class E2E(nn.Module):
    """
    The end-to-end model: a DeepUnet, a 3-channel convolution and a
    classification head that ends in a sigmoid.

    Args:
        n_blocks (int): Number of convolutional blocks in each encoder/decoder block.
        n_gru (int): Number of GRU layers; a falsy value selects the
            linear-only head.
        kernel_size (tuple): Size of the average pooling kernel.
        en_de_layers (int): Number of encoder/decoder layers.
        inter_layers (int): Number of convolutional blocks in the intermediate layer.
        in_channels (int): Number of input channels.
        en_out_channels (int): Number of output channels for the first encoder block.

    """

    def __init__(
        self,
        n_blocks,
        n_gru,
        kernel_size,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super().__init__()
        self.unet = DeepUnet(
            kernel_size,
            n_blocks,
            en_de_layers,
            inter_layers,
            in_channels,
            en_out_channels,
        )
        self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))

        # Only the front of the head depends on n_gru; the Dropout/Sigmoid
        # tail is shared. Module order (and so the Sequential indices used
        # as state-dict keys) is the same as building each branch in full.
        if n_gru:
            head = [BiGRU(3 * 128, 256, n_gru), nn.Linear(512, N_CLASS)]
        else:
            head = [nn.Linear(3 * N_MELS, N_CLASS)]
        self.fc = nn.Sequential(*head, nn.Dropout(0.25), nn.Sigmoid())

    def forward(self, mel):
        # Swap the last two axes and insert a channel axis at position 1.
        mel = mel.transpose(-1, -2).unsqueeze(1)
        features = self.cnn(self.unet(mel))
        # Move the channel axis next to the last axis, then merge the two.
        flat = features.transpose(1, 2).flatten(-2)
        return self.fc(flat)
class RMVPE0Predictor:
    """
    A predictor for fundamental frequency (F0) based on the RMVPE0 model.

    Args:
        model_path (str): Path to the RMVPE0 model file.
        device (str, optional): Device to use for computation. Defaults to
            None, which uses CUDA if available and the CPU otherwise.

    """

    def __init__(self, model_path, device=None):
        # Implement the documented default instead of passing None through.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.resample_kernel = {}

        model = E2E(4, 1, (2, 2))
        # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
        # objects; only load checkpoints that come from a trusted source.
        ckpt = torch.load(model_path, map_location="cpu", weights_only=False)
        model.load_state_dict(ckpt)
        model.eval()
        self.model = model.to(device)

        # Positional arguments: n_mel_channels, sample_rate, win_length,
        # hop_length, n_fft, mel_fmin, mel_fmax.
        self.mel_extractor = MelSpectrogram(
            N_MELS,
            16000,
            1024,
            160,
            None,
            30,
            8000,
        ).to(device)

        # Cent value of every class bin, zero-padded by 4 on each side so the
        # 9-bin window used in to_local_average_cents never leaves the array.
        cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191
        self.cents_mapping = np.pad(cents_mapping, (4, 4))

    def mel2hidden(self, mel, chunk_size=32000):
        """
        Converts Mel-spectrogram features to hidden representation.

        The frame axis (last axis) is padded up to a multiple of 32, the
        padded input is fed to the model in chunks of ``chunk_size`` frames,
        and the concatenated output is trimmed back to the original number
        of frames.

        Args:
            mel (torch.Tensor): Mel-spectrogram features, frames on the last
                axis.
            chunk_size (int, optional): Frames per model call. Defaults to
                32000.

        Returns:
            torch.Tensor: Model output, concatenated along dimension 1 and
            limited to the original frame count.

        Raises:
            ValueError: If a chunk's frame count is not a multiple of 32,
                which happens when ``chunk_size`` is not divisible by 32 and
                the padded input spans more than one chunk.

        """
        with torch.no_grad():
            n_frames = mel.shape[-1]
            pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
            # Reflection padding requires pad < n_frames; for very short
            # inputs repeat the last frame instead of failing. The padded
            # frames are trimmed from the result either way.
            mode = "replicate" if pad >= n_frames > 0 else "reflect"
            mel = F.pad(mel, (0, pad), mode=mode)

            padded_frames = mel.shape[-1]
            output_chunks = []
            for start in range(0, padded_frames, chunk_size):
                mel_chunk = mel[..., start : start + chunk_size]
                if mel_chunk.shape[-1] % 32 != 0:
                    raise ValueError("chunk_size must be divisible by 32")
                output_chunks.append(self.model(mel_chunk))

            hidden = torch.cat(output_chunks, dim=1)
            return hidden[:, :n_frames]

    def decode(self, hidden, thred=0.03):
        """
        Decodes hidden representation to F0.

        Args:
            hidden (np.ndarray): Hidden representation, one row per frame.
            thred (float, optional): Threshold for salience. Defaults to 0.03.

        Returns:
            np.ndarray: One F0 value per frame; frames whose cents value is 0
            (at or below the salience threshold) are reported as 0.

        """
        cents_pred = self.to_local_average_cents(hidden, thred=thred)
        f0 = 10 * (2 ** (cents_pred / 1200))
        # 0 cents maps to exactly 10; that marks a rejected frame.
        f0[f0 == 10] = 0
        return f0

    def infer_from_audio(self, audio, thred=0.03):
        """
        Infers F0 from audio.

        Args:
            audio (np.ndarray): Audio signal.
            thred (float, optional): Threshold for salience. Defaults to 0.03.

        Returns:
            np.ndarray: One F0 value per frame (0 for rejected frames).

        """
        audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
        mel = self.mel_extractor(audio, center=True)
        del audio
        with torch.no_grad():
            torch.cuda.empty_cache()
            hidden = self.mel2hidden(mel)
        hidden = hidden.squeeze(0).cpu().numpy()
        return self.decode(hidden, thred=thred)

    def to_local_average_cents(self, salience, thred=0.05):
        """
        Converts salience to local average cents.

        For every frame the 9 bins centred on the most salient bin are
        averaged, weighted by salience. Frames whose peak salience is at or
        below ``thred`` yield 0.

        Args:
            salience (np.ndarray): Salience values, shape (frames, bins).
            thred (float, optional): Threshold for salience. Defaults to 0.05.

        Returns:
            np.ndarray: One cents value per frame.

        """
        center = np.argmax(salience, axis=1)
        padded = np.pad(salience, ((0, 0), (4, 4)))
        # In padded coordinates the window center-4 .. center+4 becomes
        # center .. center+8.
        window = center[:, None] + np.arange(9)
        rows = np.arange(padded.shape[0])[:, None]
        local_salience = padded[rows, window]
        local_cents = self.cents_mapping[window]

        product_sum = np.sum(local_salience * local_cents, axis=1)
        weight_sum = np.sum(local_salience, axis=1)
        # Avoid 0/0 on frames with no salience at all; they give 0 cents.
        safe_weight = np.where(weight_sum == 0, 1, weight_sum)
        cents = product_sum / safe_weight

        peak = np.max(padded, axis=1)
        cents[peak <= thred] = 0
        return cents
class CREPE:
    """
    F0 estimator that delegates to ``torchcrepe.predict``.

    Args:
        device: Device the input is moved to and inference runs on.
        sample_rate (int): Sample rate passed to torchcrepe. Defaults to 16000.
        hop_size (int): Hop length passed to torchcrepe. Defaults to 160.

    """

    def __init__(self, device, sample_rate=16000, hop_size=160):
        self.device = device
        self.sample_rate = sample_rate
        self.hop_size = hop_size

    def get_f0(self, x, f0_min=50, f0_max=1100, p_len=None, model="full"):
        """
        Estimate F0 for a mono signal.

        Args:
            x (np.ndarray | torch.Tensor): Audio samples; arrays are
                converted with ``torch.from_numpy``.
            f0_min (int): Lower pitch bound passed to torchcrepe.
            f0_max (int): Upper pitch bound passed to torchcrepe.
            p_len: Accepted for signature compatibility; not used.
            model (str): torchcrepe model name. Defaults to "full".

        Returns:
            np.ndarray: F0 values, with 0 wherever the median-filtered
            periodicity is below 0.1.

        """
        # The previous body derived a default p_len that was never read
        # (marked TODO); the dead computation is dropped, the parameter stays.
        if not torch.is_tensor(x):
            x = torch.from_numpy(x)

        batch_size = 512

        f0, pd = torchcrepe.predict(
            x.float().to(self.device).unsqueeze(dim=0),
            self.sample_rate,
            self.hop_size,
            f0_min,
            f0_max,
            model=model,
            batch_size=batch_size,
            device=self.device,
            return_periodicity=True,
        )
        # Smooth both tracks over 3 frames, then zero low-periodicity frames.
        pd = torchcrepe.filter.median(pd, 3)
        f0 = torchcrepe.filter.mean(f0, 3)
        f0[pd < 0.1] = 0

        return f0[0].cpu().numpy()
def plot_spectrogram(y, sr, stft, duration, cmap="inferno"):
    """
    Draw the dB-scaled spectrogram in the first row of a 3-row figure.

    Args:
        y: Audio samples (not used here; kept for a uniform plot_* signature).
        sr: Sample rate in Hz.
        stft: Magnitude STFT, frequency bins on the first axis.
        duration: Signal duration in seconds, used as the time-axis extent.
        cmap: Matplotlib colormap name.

    """
    plt.subplot(3, 1, 1)
    plt.imshow(
        librosa.amplitude_to_db(stft, ref=np.max),
        origin="lower",
        # STFT bins run from 0 Hz to the Nyquist frequency (sr / 2), so the
        # axis, labelled in kHz, must end at sr / 2 / 1000 rather than
        # sr / 1000.
        extent=[0, duration, 0, sr / 2 / 1000],
        aspect="auto",
        cmap=cmap,
    )
    plt.colorbar(format="%+2.0f dB")
    plt.xlabel("Time (s)")
    plt.ylabel("Frequency (kHz)")
    plt.title("Spectrogram")
def analyze_audio(audio_file, save_plot_path="logs/audio_analysis.png"):
    """
    Plot a spectrogram, waveform and spectral features for an audio file.

    Args:
        audio_file (str): Path of the audio file to analyse.
        save_plot_path (str): Where to save the figure. A falsy value skips
            saving.

    Returns:
        tuple: ``(audio_info, save_plot_path)`` where ``audio_info`` is a
        multi-line text summary.

    """
    import os

    y, sr = librosa.load(audio_file)
    stft, duration, cent, bw, rolloff = calculate_features(y, sr)

    plt.figure(figsize=(12, 10))
    try:
        # os.path.basename handles the platform's separators, unlike
        # splitting on "/".
        plot_title("Audio Analysis" + " - " + os.path.basename(audio_file))
        plot_spectrogram(y, sr, stft, duration)
        plot_waveform(y, sr, duration)
        # Pass the real sample rate instead of relying on times_like's default.
        plot_features(librosa.times_like(cent, sr=sr), cent, bw, rolloff, duration)

        plt.tight_layout()

        if save_plot_path:
            target_dir = os.path.dirname(save_plot_path)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            plt.savefig(save_plot_path, bbox_inches="tight", dpi=300)
    finally:
        # Always release the figure, including when nothing is saved or a
        # plotting step fails.
        plt.close()

    if duration < 60:
        duration_text = str(round(duration, 2)) + " seconds"
    else:
        duration_text = str(round(duration / 60, 2)) + " minutes"
    channels_text = "Mono (1)" if y.ndim == 1 else "Stereo (2)"

    # NOTE(review): the "Bits per Sample" field reports
    # librosa.get_samplerate(), i.e. the file's sample rate, not a bit depth.
    # The text is kept unchanged because callers may display it as-is.
    audio_info = (
        f"Sample Rate: {sr}\n"
        f"Duration: {duration_text}\n"
        f"Number of Samples: {len(y)}\n"
        f"Bits per Sample: {librosa.get_samplerate(audio_file)}\n"
        f"Channels: {channels_text}"
    )

    return audio_info, save_plot_path
text.splitlines(True) + ) + + +class FileURLRetrievalError(Exception): + """Custom exception for issues retrieving file URLs.""" + + +def _extract_download_url_from_confirmation(contents: str, url_origin: str): + """Extract the download URL from a Google Drive confirmation page.""" + patterns = [ + r'href="(\/uc\?export=download[^"]+)', + r'href="/open\?id=([^"]+)"', + r'"downloadUrl":"([^"]+)', + ] + for pattern in patterns: + match = re.search(pattern, contents) + if match: + url = match.group(1) + if pattern == r'href="/open\?id=([^"]+)"': + uuid_match = re.search( + r'(.*)

def _create_session(
    proxy: str | None = None,
    use_cookies: bool = True,
    return_cookies_file: bool = False,
):
    """
    Create a requests session with optional proxy and cookie handling.

    Args:
        proxy: Proxy URL applied to both HTTP and HTTPS traffic, if given.
        use_cookies: Load cookies from ``~/.cache/gdown/cookies.json`` when
            that file exists.
        return_cookies_file: Also return the cookie file path.

    Returns:
        The session, or ``(session, cookies_file)`` when
        ``return_cookies_file`` is true.

    """
    sess = requests.session()
    sess.headers.update(
        {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"},
    )

    if proxy:
        sess.proxies = {"http": proxy, "https": proxy}

    cookies_file = os.path.join(HOME, ".cache/gdown/cookies.json")
    if use_cookies and pathlib.Path(cookies_file).exists():
        try:
            with pathlib.Path(cookies_file).open() as f:
                cookies = json.load(f)
            # The file holds a list of (name, value) pairs.
            for k, v in cookies:
                sess.cookies[k] = v
        except (json.JSONDecodeError, TypeError, ValueError):
            # A file that is valid JSON but not a list of pairs is just as
            # unusable as unparsable JSON; warn and continue without cookies.
            warnings.warn("Corrupted Cookies file", stacklevel=2)

    return (sess, cookies_file) if return_cookies_file else sess
+ + """ + if not (id is None) ^ (url is None): + raise ValueError("Either url or id has to be specified") + + if id is not None: + url = f"https://drive.google.com/uc?id={id}" + + url_origin = url + sess, cookies_file = _create_session( + proxy=proxy, + use_cookies=use_cookies, + return_cookies_file=True, + ) + + while True: + res = sess.get(url, stream=True, verify=verify) + res.raise_for_status() + + if url == url_origin and res.status_code == 500: + url = f"https://drive.google.com/open?id={id}" + continue + + if res.headers.get("Content-Type", "").startswith("text/html"): + title_match = re.search(r"(.+)", res.text) + if title_match: + title = title_match.group(1) + if title.endswith(" - Google Docs"): + url = ( + f"https://docs.google.com/document/d/{id}/export?format={'docx' if format is None else format}" + ) + continue + if title.endswith(" - Google Sheets"): + url = ( + f"https://docs.google.com/spreadsheets/d/{id}/export?format={'xlsx' if format is None else format}" + ) + continue + if title.endswith(" - Google Slides"): + url = ( + f"https://docs.google.com/presentation/d/{id}/export?format={'pptx' if format is None else format}" + ) + continue + if ( + "Content-Disposition" in res.headers + and res.headers["Content-Disposition"].endswith("pptx") + and format not in (None, "pptx") + ): + url = ( + f"https://docs.google.com/presentation/d/{id}/export?format={'pptx' if format is None else format}" + ) + continue + + if use_cookies: + pathlib.Path(os.path.dirname(cookies_file)).mkdir( + exist_ok=True, parents=True + ) + cookies = [ + (k, v) + for k, v in sess.cookies.items() + if not k.startswith("download_warning_") + ] + with pathlib.Path(cookies_file).open("w") as f: + json.dump(cookies, f, indent=2) + + if "Content-Disposition" in res.headers: + break + + parsed_url = urlparse(url) + is_gdrive = parsed_url.hostname in ("drive.google.com", "docs.google.com") + is_download_link = parsed_url.path.endswith("/uc") + + if not (is_gdrive and is_download_link 
and fuzzy): + break + + try: + url = _extract_download_url_from_confirmation(res.text, url_origin) + except FileURLRetrievalError as e: + raise FileURLRetrievalError(e) + + content_disposition = res.headers.get("Content-Disposition", "") + filename_match = re.search( + r"filename\*=UTF-8''(.*)", + content_disposition, + ) or re.search(r'filename=["\']?(.*?)["\']?$', content_disposition) + filename_from_url = ( + unquote(filename_match.group(1)) if filename_match else os.path.basename(url) + ) + download_path = output or filename_from_url + + if isinstance(download_path, str) and download_path.endswith(os.path.sep): + pathlib.Path(download_path).mkdir(exist_ok=True, parents=True) + download_path = os.path.join(download_path, filename_from_url) + + temp_dir = os.path.dirname(download_path) or "." + prefix = os.path.basename(download_path) + + if isinstance(download_path, str): + existing_tmp_files = [ + os.path.join(temp_dir, file) + for file in os.listdir(temp_dir) + if file.startswith(prefix) + ] + if resume and existing_tmp_files: + if len(existing_tmp_files) > 1: + print( + "There are multiple temporary files to resume:", + file=sys.stderr, + ) + for file in existing_tmp_files: + print(f"\t{file}", file=sys.stderr) + print( + "Please remove them except one to resume downloading.", + file=sys.stderr, + ) + return None + temp_file_path = existing_tmp_files[0] + else: + resume = False + temp_file_path = tempfile.mktemp( + suffix=tempfile.template, + prefix=prefix, + dir=temp_dir, + ) + + try: + file_obj: IO = pathlib.Path(temp_file_path).open("ab") + except Exception as e: + print( + f"Could not open the temporary file {temp_file_path}: {e}", + file=sys.stderr, + ) + return None + else: + temp_file_path = None + file_obj = download_path + + if temp_file_path is not None and file_obj.tell() != 0: + headers = {"Range": f"bytes={file_obj.tell()}-"} + res = sess.get(url, headers=headers, stream=True, verify=verify) + res.raise_for_status() + + try: + total = 
def launch_tensorboard_pipeline():
    """Start TensorBoard on ``log_path``, print its URL, then block forever."""
    for noisy_logger in ("root", "tensorboard"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)

    board = program.TensorBoard()
    board.configure(argv=[None, "--logdir", log_path])
    url = board.launch()

    # URL-encoded list of scalar cards to pin: loss/g/total, loss/d/total,
    # loss/g/kl and loss/g/mel.
    pinned_cards = (
        "?pinnedCards=%5B%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Ftotal%22%7D"
        "%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fd%2Ftotal%22%7D"
        "%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fkl%22%7D"
        "%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fmel%22%7D%5D"
    )
    print(f"Access the tensorboard using the following link:\n{url}{pinned_cards}")

    # Keep the calling thread alive so the server keeps running.
    while True:
        time.sleep(600)
def extract_google_drive_id(url):
    """
    Extract the file ID from a Google Drive URL.

    Two URL shapes are recognised, checked in this order:

    * ``.../file/d/<id>`` - the ID ends at the next ``/``, ``?`` or ``#``.
    * ``...?id=<id>`` - the ID is the value of the ``id`` parameter and ends
      at the next ``&`` or ``#``.

    Args:
        url (str): The URL to inspect.

    Returns:
        str | None: The file ID, or None when neither shape matches.

    """
    path_match = re.search(r"file/d/([^/?#]+)", url)
    if path_match:
        return path_match.group(1)

    # Require a parameter boundary before "id=" so that names merely ending
    # in "id" (uuid=, uid=, ...) are not mistaken for the file ID.
    query_match = re.search(r"(?:^|[?&#])id=([^&#]+)", url)
    if query_match:
        return query_match.group(1)

    return None
def _filename_from_response(response):
    """Return a separator-free file name from the Content-Disposition header."""
    content_disposition = unquote(response.headers.get("Content-Disposition", ""))
    # Prefer the double-quoted form; fall back to the unquoted and
    # ``filename*=UTF-8''...`` forms instead of failing when it is absent.
    match = re.search(r'filename="([^"]+)"', content_disposition) or re.search(
        r"filename\*?=\s*(?:UTF-8'')?([^;\"]+)",
        content_disposition,
        flags=re.IGNORECASE,
    )
    file_name = match.group(1).strip() if match else ""
    # The name comes from a remote server: never let it address another
    # directory.
    for separator in {os.path.sep, "/"}:
        file_name = file_name.replace(separator, "_")
    return file_name or "downloaded_file"


def save_response_content(response):
    """
    Stream a response body into ``zips_path`` with a progress bar.

    The file name is taken from the Content-Disposition header; when the
    header is missing or carries no usable file name, ``downloaded_file`` is
    used.

    Args:
        response: A streaming HTTP response exposing ``headers`` and
            ``iter_content``.

    """
    file_name = _filename_from_response(response)

    total_size = int(response.headers.get("Content-Length", 0))
    chunk_size = 1024

    with (
        pathlib.Path(os.path.join(zips_path, file_name)).open("wb") as file,
        tqdm(
            total=total_size,
            unit="B",
            unit_scale=True,
            desc=file_name,
        ) as progress_bar,
    ):
        for data in response.iter_content(chunk_size):
            file.write(data)
            progress_bar.update(len(data))
def handle_extraction_process():
    """
    Extract every ``.zip`` in ``zips_path`` into ``logs/<model name>``.

    Returns:
        The ``(pth_paths, index_paths)`` tuple from ``search_pth_index`` for
        the last archive processed, or the string ``"Error"`` when an
        extraction fails or no zip file is present.

    """
    extract_folder_path = ""
    for filename in os.listdir(zips_path):
        if not filename.endswith(".zip"):
            continue

        zipfile_path = os.path.join(zips_path, filename)
        model_name = format_title(os.path.basename(zipfile_path).split(".zip")[0])
        extract_folder_path = os.path.join("logs", os.path.normpath(model_name))

        # Check the result before tidying up: after a failed extraction the
        # target folder may not exist, and cleaning it would raise instead of
        # reporting the error.
        if not extract(zipfile_path, extract_folder_path):
            print(f"Error downloading {model_name}")
            return "Error"

        clean_extracted_files(extract_folder_path, model_name)
        print(f"Model {model_name} downloaded!")

    if not extract_folder_path:
        print("Zip file was not found.")
        return "Error"
    return search_pth_index(extract_folder_path)
os.path.join(extract_folder_path, item), + ) + pathlib.Path(subfolder_path).rmdir() + + for item in os.listdir(extract_folder_path): + source_path = os.path.join(extract_folder_path, item) + if ".pth" in item: + new_file_name = model_name + ".pth" + elif ".index" in item: + new_file_name = model_name + ".index" + else: + continue + + destination_path = os.path.join(extract_folder_path, new_file_name) + if not pathlib.Path(destination_path).exists(): + pathlib.Path(source_path).rename(destination_path) +ath) diff --git a/rvc_logic/rvc/lib/tools/prerequisites_download.py b/rvc_logic/rvc/lib/tools/prerequisites_download.py new file mode 100644 index 0000000000000000000000000000000000000000..6f14ab10a266ad952ef48104a4df73cc5a7b2c21 --- /dev/null +++ b/rvc_logic/rvc/lib/tools/prerequisites_download.py @@ -0,0 +1,198 @@ +from typing import TYPE_CHECKING + +import lazy_loader as lazy + +import os +import pathlib +from concurrent.futures import ThreadPoolExecutor + +from rvc_logic.common import ( + EMBEDDER_MODELS_DIR, + PRETRAINED_MODELS_DIR, + RVC_MODELS_DIR, +) + +if TYPE_CHECKING: + import requests + + import tqdm + +else: + tqdm = lazy.load("tqdm") + requests = lazy.load("requests") + + +url_base = "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources" + +pretraineds_hifigan_list = [ + ( + "pretrained_v2/", + [ + "f0D32k.pth", + "f0D40k.pth", + "f0D48k.pth", + "f0G32k.pth", + "f0G40k.pth", + "f0G48k.pth", + ], + ), +] +pretraineds_refinegan_list = [("refinegan/", ["f0D32k.pth", "f0G32k.pth"])] +models_list = [("predictors/", ["rmvpe.pt", "fcpe.pt"])] +embedders_list = [ + ("embedders/contentvec/", ["pytorch_model.bin", "config.json"]), + ("embedders/chinese_hubert_base/", ["pytorch_model.bin", "config.json"]), + ("embedders/japanese_hubert_base/", ["pytorch_model.bin", "config.json"]), + ("embedders/korean_hubert_base/", ["pytorch_model.bin", "config.json"]), + ("embedders/spin/", ["pytorch_model.bin", "config.json"]), + ("embedders/spin-v2/", 
def get_file_size_if_missing(file_list):
    """
    Calculate the total size of files to be downloaded only if they do not exist locally.

    Args:
        file_list: Iterable of ``(remote_folder, [file names])`` pairs. Each
            remote folder is mapped to a local folder through
            ``folder_mapping_list``; unmapped folders resolve to the current
            directory.

    Returns:
        int: Sum of the ``content-length`` values reported for every file
        that is missing locally.

    """
    total_size = 0
    for remote_folder, files in file_list:
        local_folder = folder_mapping_list.get(remote_folder, "")
        for file in files:
            destination_path = os.path.join(local_folder, file)
            if pathlib.Path(destination_path).exists():
                continue
            url = f"{url_base}/{remote_folder}{file}"
            # requests.head() does not follow redirects unless asked to. If
            # the server redirects, the length of the redirect response would
            # be counted instead of the file's.
            response = requests.head(url, allow_redirects=True)
            total_size += int(response.headers.get("content-length", 0))
    return total_size
def split_pretraineds(pretrained_list):
    """
    Split pretrained file lists by whether the file name starts with "f0".

    Args:
        pretrained_list: Iterable of ``(folder, [file names])`` pairs.

    Returns:
        tuple: ``(f0_list, non_f0_list)``. Each is a list of
        ``(folder, files)`` pairs in input order; a folder is left out of a
        list when it has no files of that kind.

    """
    with_f0 = []
    without_f0 = []
    for folder, files in pretrained_list:
        # One pass per folder: bucket every name by its prefix.
        buckets = {True: [], False: []}
        for name in files:
            buckets[name.startswith("f0")].append(name)
        if buckets[True]:
            with_f0.append((folder, buckets[True]))
        if buckets[False]:
            without_f0.append((folder, buckets[False]))
    return with_f0, without_f0
+ """ + total_size = 0 + if models: + total_size += get_file_size_if_missing(models_list) + total_size += get_file_size_if_missing(embedders_list) + if exe and os.name == "nt": + total_size += get_file_size_if_missing(executables_list) + total_size += get_file_size_if_missing(pretraineds_hifigan) + total_size += get_file_size_if_missing(pretraineds_refinegan_list) + return total_size + + +def prequisites_download_pipeline( + pretraineds_hifigan: bool = True, + models: bool = True, + exe: bool = True, +) -> None: + """ + Manage the download pipeline for different categories of files. + """ + total_size = calculate_total_size( + pretraineds_hifigan_list if pretraineds_hifigan else [], + models, + exe, + ) + + if total_size > 0: + with tqdm.tqdm( + total=total_size, + unit="iB", + unit_scale=True, + desc="Downloading all files", + ) as global_bar: + if models: + download_mapping_files(models_list, global_bar) + download_mapping_files(embedders_list, global_bar) + if exe: + if os.name == "nt": + download_mapping_files(executables_list, global_bar) + else: + print("No executables needed") + if pretraineds_hifigan: + download_mapping_files(pretraineds_hifigan_list, global_bar) + download_mapping_files(pretraineds_refinegan_list, global_bar) +) diff --git a/rvc_logic/rvc/lib/tools/pretrained_selector.py b/rvc_logic/rvc/lib/tools/pretrained_selector.py new file mode 100644 index 0000000000000000000000000000000000000000..85c8177b050fb10de320dd8e4fd7048027ea6e7c --- /dev/null +++ b/rvc_logic/rvc/lib/tools/pretrained_selector.py @@ -0,0 +1,16 @@ +import os +import pathlib + +from rvc_logic.common import PRETRAINED_MODELS_DIR + + +def pretrained_selector(vocoder: str, sample_rate: int) -> tuple[str, str]: + base_path = os.path.join(PRETRAINED_MODELS_DIR, f"{vocoder.lower()}") + + path_g = os.path.join(base_path, f"f0G{str(sample_rate)[:2]}k.pth") + path_d = os.path.join(base_path, f"f0D{str(sample_rate)[:2]}k.pth") + + if pathlib.Path(path_g).exists() and 
pathlib.Path(path_d).exists(): + return path_g, path_d + return "", "" +" diff --git a/rvc_logic/rvc/lib/tools/split_audio.py b/rvc_logic/rvc/lib/tools/split_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..acbe27c70e9c11e16a088acb3ff688e944f94d05 --- /dev/null +++ b/rvc_logic/rvc/lib/tools/split_audio.py @@ -0,0 +1,89 @@ +import numpy as np + +import librosa + + +def process_audio(audio, sr=16000, silence_thresh=-60, min_silence_len=250): + """ + Splits an audio signal into segments using a fixed frame size and hop size. + + Parameters + ---------- + - audio (np.ndarray): The audio signal to split. + - sr (int): The sample rate of the input audio (default is 16000). + - silence_thresh (int): Silence threshold (default =-60dB) + - min_silence_len (int): Minimum silence duration (default 250ms). + + Returns + ------- + - list of np.ndarray: A list of audio segments. + - np.ndarray: The intervals where the audio was split. + + """ + frame_length = int(min_silence_len / 1000 * sr) + hop_length = frame_length // 2 + intervals = librosa.effects.split( + audio, + top_db=-silence_thresh, + frame_length=frame_length, + hop_length=hop_length, + ) + audio_segments = [audio[start:end] for start, end in intervals] + + return audio_segments, intervals + + +def merge_audio(audio_segments_org, audio_segments_new, intervals, sr_orig, sr_new): + """ + Merges audio segments back into a single audio signal, filling gaps with silence. + Assumes audio segments are already at sr_new. + + Parameters + ---------- + - audio_segments_org (list of np.ndarray): The non-silent audio segments (at sr_orig). + - audio_segments_new (list of np.ndarray): The non-silent audio segments (at sr_new). + - intervals (np.ndarray): The intervals used for splitting the original audio. + - sr_orig (int): The sample rate of the original audio + - sr_new (int): The sample rate of the model + Returns: + - np.ndarray: The merged audio signal with silent gaps restored. 
+ + """ + merged_audio = np.array([], dtype=audio_segments_new[0].dtype) + sr_ratio = sr_new / sr_orig + + for i, (start, end) in enumerate(intervals): + + start_new = int(start * sr_ratio) + end_new = int(end * sr_ratio) + + original_duration = len(audio_segments_org[i]) / sr_orig + new_duration = len(audio_segments_new[i]) / sr_new + duration_diff = new_duration - original_duration + + silence_samples = int(abs(duration_diff) * sr_new) + silence_compensation = np.zeros( + silence_samples, + dtype=audio_segments_new[0].dtype, + ) + + if i == 0 and start_new > 0: + initial_silence = np.zeros(start_new, dtype=audio_segments_new[0].dtype) + merged_audio = np.concatenate((merged_audio, initial_silence)) + + if duration_diff > 0: + merged_audio = np.concatenate((merged_audio, silence_compensation)) + + merged_audio = np.concatenate((merged_audio, audio_segments_new[i])) + + if duration_diff < 0: + merged_audio = np.concatenate((merged_audio, silence_compensation)) + + if i < len(intervals) - 1: + next_start_new = int(intervals[i + 1][0] * sr_ratio) + silence_duration = next_start_new - end_new + if silence_duration > 0: + silence = np.zeros(silence_duration, dtype=audio_segments_new[0].dtype) + merged_audio = np.concatenate((merged_audio, silence)) + + return merged_audio diff --git a/rvc_logic/rvc/lib/tools/tts.py b/rvc_logic/rvc/lib/tools/tts.py new file mode 100644 index 0000000000000000000000000000000000000000..827c179c873d13316523b8ec887e2e2f4329c623 --- /dev/null +++ b/rvc_logic/rvc/lib/tools/tts.py @@ -0,0 +1,30 @@ +import asyncio +import pathlib +import sys + +import edge_tts + + +async def main(): + # Parse command line arguments + tts_file = str(sys.argv[1]) + text = str(sys.argv[2]) + voice = str(sys.argv[3]) + rate = int(sys.argv[4]) + output_file = str(sys.argv[5]) + + rates = f"+{rate}%" if rate >= 0 else f"{rate}%" + if tts_file and pathlib.Path(tts_file).exists(): + text = "" + try: + with pathlib.Path(tts_file).open(encoding="utf-8") as file: + text = 
file.read() + except UnicodeDecodeError: + with pathlib.Path(tts_file).open() as file: + text = file.read() + await edge_tts.Communicate(text, voice, rate=rates).save(output_file) + # print(f"TTS with {voice} completed. Output TTS file: '{output_file}'") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/rvc_logic/rvc/lib/tools/tts_voices.json b/rvc_logic/rvc/lib/tools/tts_voices.json new file mode 100644 index 0000000000000000000000000000000000000000..5f5fea686dbcbcf850e37f3e010f92696a7a6258 --- /dev/null +++ b/rvc_logic/rvc/lib/tools/tts_voices.json @@ -0,0 +1,5820 @@ +[ + { + "Name": "Microsoft Server Speech Text to Speech Voice (af-ZA, AdriNeural)", + "ShortName": "af-ZA-AdriNeural", + "Gender": "Female", + "Locale": "af-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Adri Online (Natural) - Afrikaans (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (af-ZA, WillemNeural)", + "ShortName": "af-ZA-WillemNeural", + "Gender": "Male", + "Locale": "af-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Willem Online (Natural) - Afrikaans (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sq-AL, AnilaNeural)", + "ShortName": "sq-AL-AnilaNeural", + "Gender": "Female", + "Locale": "sq-AL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Anila Online (Natural) - Albanian (Albania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sq-AL, IlirNeural)", + 
"ShortName": "sq-AL-IlirNeural", + "Gender": "Male", + "Locale": "sq-AL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ilir Online (Natural) - Albanian (Albania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (am-ET, AmehaNeural)", + "ShortName": "am-ET-AmehaNeural", + "Gender": "Male", + "Locale": "am-ET", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ameha Online (Natural) - Amharic (Ethiopia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (am-ET, MekdesNeural)", + "ShortName": "am-ET-MekdesNeural", + "Gender": "Female", + "Locale": "am-ET", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mekdes Online (Natural) - Amharic (Ethiopia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-DZ, AminaNeural)", + "ShortName": "ar-DZ-AminaNeural", + "Gender": "Female", + "Locale": "ar-DZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Amina Online (Natural) - Arabic (Algeria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-DZ, IsmaelNeural)", + "ShortName": "ar-DZ-IsmaelNeural", + "Gender": "Male", + "Locale": "ar-DZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ismael Online (Natural) - Arabic (Algeria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + 
"General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-BH, AliNeural)", + "ShortName": "ar-BH-AliNeural", + "Gender": "Male", + "Locale": "ar-BH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ali Online (Natural) - Arabic (Bahrain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-BH, LailaNeural)", + "ShortName": "ar-BH-LailaNeural", + "Gender": "Female", + "Locale": "ar-BH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Laila Online (Natural) - Arabic (Bahrain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-EG, SalmaNeural)", + "ShortName": "ar-EG-SalmaNeural", + "Gender": "Female", + "Locale": "ar-EG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Salma Online (Natural) - Arabic (Egypt)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-EG, ShakirNeural)", + "ShortName": "ar-EG-ShakirNeural", + "Gender": "Male", + "Locale": "ar-EG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Shakir Online (Natural) - Arabic (Egypt)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-IQ, BasselNeural)", + "ShortName": "ar-IQ-BasselNeural", + "Gender": "Male", + "Locale": "ar-IQ", + "SuggestedCodec": 
"audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Bassel Online (Natural) - Arabic (Iraq)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-IQ, RanaNeural)", + "ShortName": "ar-IQ-RanaNeural", + "Gender": "Female", + "Locale": "ar-IQ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rana Online (Natural) - Arabic (Iraq)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-JO, SanaNeural)", + "ShortName": "ar-JO-SanaNeural", + "Gender": "Female", + "Locale": "ar-JO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sana Online (Natural) - Arabic (Jordan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-JO, TaimNeural)", + "ShortName": "ar-JO-TaimNeural", + "Gender": "Male", + "Locale": "ar-JO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Taim Online (Natural) - Arabic (Jordan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-KW, FahedNeural)", + "ShortName": "ar-KW-FahedNeural", + "Gender": "Male", + "Locale": "ar-KW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Fahed Online (Natural) - Arabic (Kuwait)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech 
Text to Speech Voice (ar-KW, NouraNeural)", + "ShortName": "ar-KW-NouraNeural", + "Gender": "Female", + "Locale": "ar-KW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Noura Online (Natural) - Arabic (Kuwait)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-LB, LaylaNeural)", + "ShortName": "ar-LB-LaylaNeural", + "Gender": "Female", + "Locale": "ar-LB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Layla Online (Natural) - Arabic (Lebanon)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-LB, RamiNeural)", + "ShortName": "ar-LB-RamiNeural", + "Gender": "Male", + "Locale": "ar-LB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rami Online (Natural) - Arabic (Lebanon)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-LY, ImanNeural)", + "ShortName": "ar-LY-ImanNeural", + "Gender": "Female", + "Locale": "ar-LY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Iman Online (Natural) - Arabic (Libya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-LY, OmarNeural)", + "ShortName": "ar-LY-OmarNeural", + "Gender": "Male", + "Locale": "ar-LY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Omar Online (Natural) - Arabic (Libya)", + "Status": "GA", + "VoiceTag": { + 
"ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-MA, JamalNeural)", + "ShortName": "ar-MA-JamalNeural", + "Gender": "Male", + "Locale": "ar-MA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jamal Online (Natural) - Arabic (Morocco)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-MA, MounaNeural)", + "ShortName": "ar-MA-MounaNeural", + "Gender": "Female", + "Locale": "ar-MA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mouna Online (Natural) - Arabic (Morocco)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-OM, AbdullahNeural)", + "ShortName": "ar-OM-AbdullahNeural", + "Gender": "Male", + "Locale": "ar-OM", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Abdullah Online (Natural) - Arabic (Oman)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-OM, AyshaNeural)", + "ShortName": "ar-OM-AyshaNeural", + "Gender": "Female", + "Locale": "ar-OM", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aysha Online (Natural) - Arabic (Oman)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-QA, AmalNeural)", + "ShortName": "ar-QA-AmalNeural", + "Gender": "Female", + "Locale": "ar-QA", + 
"SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Amal Online (Natural) - Arabic (Qatar)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-QA, MoazNeural)", + "ShortName": "ar-QA-MoazNeural", + "Gender": "Male", + "Locale": "ar-QA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Moaz Online (Natural) - Arabic (Qatar)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-SA, HamedNeural)", + "ShortName": "ar-SA-HamedNeural", + "Gender": "Male", + "Locale": "ar-SA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hamed Online (Natural) - Arabic (Saudi Arabia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-SA, ZariyahNeural)", + "ShortName": "ar-SA-ZariyahNeural", + "Gender": "Female", + "Locale": "ar-SA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Zariyah Online (Natural) - Arabic (Saudi Arabia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-SY, AmanyNeural)", + "ShortName": "ar-SY-AmanyNeural", + "Gender": "Female", + "Locale": "ar-SY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Amany Online (Natural) - Arabic (Syria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + 
{ + "Name": "Microsoft Server Speech Text to Speech Voice (ar-SY, LaithNeural)", + "ShortName": "ar-SY-LaithNeural", + "Gender": "Male", + "Locale": "ar-SY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Laith Online (Natural) - Arabic (Syria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-TN, HediNeural)", + "ShortName": "ar-TN-HediNeural", + "Gender": "Male", + "Locale": "ar-TN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hedi Online (Natural) - Arabic (Tunisia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-TN, ReemNeural)", + "ShortName": "ar-TN-ReemNeural", + "Gender": "Female", + "Locale": "ar-TN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Reem Online (Natural) - Arabic (Tunisia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-AE, FatimaNeural)", + "ShortName": "ar-AE-FatimaNeural", + "Gender": "Female", + "Locale": "ar-AE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Fatima Online (Natural) - Arabic (United Arab Emirates)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-AE, HamdanNeural)", + "ShortName": "ar-AE-HamdanNeural", + "Gender": "Male", + "Locale": "ar-AE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hamdan Online (Natural) 
- Arabic (United Arab Emirates)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-YE, MaryamNeural)", + "ShortName": "ar-YE-MaryamNeural", + "Gender": "Female", + "Locale": "ar-YE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Maryam Online (Natural) - Arabic (Yemen)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-YE, SalehNeural)", + "ShortName": "ar-YE-SalehNeural", + "Gender": "Male", + "Locale": "ar-YE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Saleh Online (Natural) - Arabic (Yemen)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (az-AZ, BabekNeural)", + "ShortName": "az-AZ-BabekNeural", + "Gender": "Male", + "Locale": "az-AZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Babek Online (Natural) - Azerbaijani (Azerbaijan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (az-AZ, BanuNeural)", + "ShortName": "az-AZ-BanuNeural", + "Gender": "Female", + "Locale": "az-AZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Banu Online (Natural) - Azerbaijani (Azerbaijan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bn-BD, NabanitaNeural)", + 
"ShortName": "bn-BD-NabanitaNeural", + "Gender": "Female", + "Locale": "bn-BD", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nabanita Online (Natural) - Bangla (Bangladesh)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bn-BD, PradeepNeural)", + "ShortName": "bn-BD-PradeepNeural", + "Gender": "Male", + "Locale": "bn-BD", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Pradeep Online (Natural) - Bangla (Bangladesh)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bn-IN, BashkarNeural)", + "ShortName": "bn-IN-BashkarNeural", + "Gender": "Male", + "Locale": "bn-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Bashkar Online (Natural) - Bangla (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bn-IN, TanishaaNeural)", + "ShortName": "bn-IN-TanishaaNeural", + "Gender": "Female", + "Locale": "bn-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tanishaa Online (Natural) - Bengali (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bs-BA, VesnaNeural)", + "ShortName": "bs-BA-VesnaNeural", + "Gender": "Female", + "Locale": "bs-BA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Vesna Online (Natural) - Bosnian (Bosnia and Herzegovina)", + "Status": "GA", + 
"VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bs-BA, GoranNeural)", + "ShortName": "bs-BA-GoranNeural", + "Gender": "Male", + "Locale": "bs-BA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Goran Online (Natural) - Bosnian (Bosnia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bg-BG, BorislavNeural)", + "ShortName": "bg-BG-BorislavNeural", + "Gender": "Male", + "Locale": "bg-BG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Borislav Online (Natural) - Bulgarian (Bulgaria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bg-BG, KalinaNeural)", + "ShortName": "bg-BG-KalinaNeural", + "Gender": "Female", + "Locale": "bg-BG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Kalina Online (Natural) - Bulgarian (Bulgaria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (my-MM, NilarNeural)", + "ShortName": "my-MM-NilarNeural", + "Gender": "Female", + "Locale": "my-MM", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nilar Online (Natural) - Burmese (Myanmar)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (my-MM, ThihaNeural)", + "ShortName": "my-MM-ThihaNeural", + "Gender": 
"Male", + "Locale": "my-MM", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thiha Online (Natural) - Burmese (Myanmar)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ca-ES, EnricNeural)", + "ShortName": "ca-ES-EnricNeural", + "Gender": "Male", + "Locale": "ca-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Enric Online (Natural) - Catalan", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ca-ES, JoanaNeural)", + "ShortName": "ca-ES-JoanaNeural", + "Gender": "Female", + "Locale": "ca-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Joana Online (Natural) - Catalan", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-HK, HiuGaaiNeural)", + "ShortName": "zh-HK-HiuGaaiNeural", + "Gender": "Female", + "Locale": "zh-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HiuGaai Online (Natural) - Chinese (Cantonese Traditional)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-HK, HiuMaanNeural)", + "ShortName": "zh-HK-HiuMaanNeural", + "Gender": "Female", + "Locale": "zh-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HiuMaan Online (Natural) - Chinese (Hong Kong SAR)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": 
[ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-HK, WanLungNeural)", + "ShortName": "zh-HK-WanLungNeural", + "Gender": "Male", + "Locale": "zh-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft WanLung Online (Natural) - Chinese (Hong Kong SAR)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoxiaoNeural)", + "ShortName": "zh-CN-XiaoxiaoNeural", + "Gender": "Female", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Xiaoxiao Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Warm" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoyiNeural)", + "ShortName": "zh-CN-XiaoyiNeural", + "Gender": "Female", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Xiaoyi Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Cartoon", + "Novel" + ], + "VoicePersonalities": [ + "Lively" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunjianNeural)", + "ShortName": "zh-CN-YunjianNeural", + "Gender": "Male", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yunjian Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Sports", + " Novel" + ], + "VoicePersonalities": [ + "Passion" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunxiNeural)", + "ShortName": "zh-CN-YunxiNeural", + "Gender": "Male", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", 
+ "FriendlyName": "Microsoft Yunxi Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Novel" + ], + "VoicePersonalities": [ + "Lively", + "Sunshine" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunxiaNeural)", + "ShortName": "zh-CN-YunxiaNeural", + "Gender": "Male", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yunxia Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Cartoon", + "Novel" + ], + "VoicePersonalities": [ + "Cute" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunyangNeural)", + "ShortName": "zh-CN-YunyangNeural", + "Gender": "Male", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yunyang Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News" + ], + "VoicePersonalities": [ + "Professional", + "Reliable" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN-liaoning, XiaobeiNeural)", + "ShortName": "zh-CN-liaoning-XiaobeiNeural", + "Gender": "Female", + "Locale": "zh-CN-liaoning", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Xiaobei Online (Natural) - Chinese (Northeastern Mandarin)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Dialect" + ], + "VoicePersonalities": [ + "Humorous" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-TW, HsiaoChenNeural)", + "ShortName": "zh-TW-HsiaoChenNeural", + "Gender": "Female", + "Locale": "zh-TW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HsiaoChen Online (Natural) - Chinese (Taiwan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + 
"Name": "Microsoft Server Speech Text to Speech Voice (zh-TW, YunJheNeural)", + "ShortName": "zh-TW-YunJheNeural", + "Gender": "Male", + "Locale": "zh-TW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft YunJhe Online (Natural) - Chinese (Taiwan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-TW, HsiaoYuNeural)", + "ShortName": "zh-TW-HsiaoYuNeural", + "Gender": "Female", + "Locale": "zh-TW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HsiaoYu Online (Natural) - Chinese (Taiwanese Mandarin)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN-shaanxi, XiaoniNeural)", + "ShortName": "zh-CN-shaanxi-XiaoniNeural", + "Gender": "Female", + "Locale": "zh-CN-shaanxi", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Xiaoni Online (Natural) - Chinese (Zhongyuan Mandarin Shaanxi)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Dialect" + ], + "VoicePersonalities": [ + "Bright" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hr-HR, GabrijelaNeural)", + "ShortName": "hr-HR-GabrijelaNeural", + "Gender": "Female", + "Locale": "hr-HR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gabrijela Online (Natural) - Croatian (Croatia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hr-HR, SreckoNeural)", + "ShortName": "hr-HR-SreckoNeural", + "Gender": "Male", + "Locale": "hr-HR", + "SuggestedCodec": 
"audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Srecko Online (Natural) - Croatian (Croatia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (cs-CZ, AntoninNeural)", + "ShortName": "cs-CZ-AntoninNeural", + "Gender": "Male", + "Locale": "cs-CZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Antonin Online (Natural) - Czech (Czech)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (cs-CZ, VlastaNeural)", + "ShortName": "cs-CZ-VlastaNeural", + "Gender": "Female", + "Locale": "cs-CZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Vlasta Online (Natural) - Czech (Czech)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (da-DK, ChristelNeural)", + "ShortName": "da-DK-ChristelNeural", + "Gender": "Female", + "Locale": "da-DK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Christel Online (Natural) - Danish (Denmark)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (da-DK, JeppeNeural)", + "ShortName": "da-DK-JeppeNeural", + "Gender": "Male", + "Locale": "da-DK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jeppe Online (Natural) - Danish (Denmark)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + 
"Name": "Microsoft Server Speech Text to Speech Voice (nl-BE, ArnaudNeural)", + "ShortName": "nl-BE-ArnaudNeural", + "Gender": "Male", + "Locale": "nl-BE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Arnaud Online (Natural) - Dutch (Belgium)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-BE, DenaNeural)", + "ShortName": "nl-BE-DenaNeural", + "Gender": "Female", + "Locale": "nl-BE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dena Online (Natural) - Dutch (Belgium)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-NL, ColetteNeural)", + "ShortName": "nl-NL-ColetteNeural", + "Gender": "Female", + "Locale": "nl-NL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Colette Online (Natural) - Dutch (Netherlands)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-NL, FennaNeural)", + "ShortName": "nl-NL-FennaNeural", + "Gender": "Female", + "Locale": "nl-NL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Fenna Online (Natural) - Dutch (Netherlands)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-NL, MaartenNeural)", + "ShortName": "nl-NL-MaartenNeural", + "Gender": "Male", + "Locale": "nl-NL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Maarten Online 
(Natural) - Dutch (Netherlands)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-AU, NatashaNeural)", + "ShortName": "en-AU-NatashaNeural", + "Gender": "Female", + "Locale": "en-AU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Natasha Online (Natural) - English (Australia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-AU, WilliamNeural)", + "ShortName": "en-AU-WilliamNeural", + "Gender": "Male", + "Locale": "en-AU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft William Online (Natural) - English (Australia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-CA, ClaraNeural)", + "ShortName": "en-CA-ClaraNeural", + "Gender": "Female", + "Locale": "en-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Clara Online (Natural) - English (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-CA, LiamNeural)", + "ShortName": "en-CA-LiamNeural", + "Gender": "Male", + "Locale": "en-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Liam Online (Natural) - English (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-HK, YanNeural)", + 
"ShortName": "en-HK-YanNeural", + "Gender": "Female", + "Locale": "en-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yan Online (Natural) - English (Hong Kong SAR)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-HK, SamNeural)", + "ShortName": "en-HK-SamNeural", + "Gender": "Male", + "Locale": "en-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sam Online (Natural) - English (Hongkong)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IN, NeerjaExpressiveNeural)", + "ShortName": "en-IN-NeerjaExpressiveNeural", + "Gender": "Female", + "Locale": "en-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Neerja Online (Natural) - English (India) (Preview)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IN, NeerjaNeural)", + "ShortName": "en-IN-NeerjaNeural", + "Gender": "Female", + "Locale": "en-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Neerja Online (Natural) - English (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IN, PrabhatNeural)", + "ShortName": "en-IN-PrabhatNeural", + "Gender": "Male", + "Locale": "en-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Prabhat Online (Natural) - English (India)", + "Status": "GA", + "VoiceTag": 
{ + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IE, ConnorNeural)", + "ShortName": "en-IE-ConnorNeural", + "Gender": "Male", + "Locale": "en-IE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Connor Online (Natural) - English (Ireland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IE, EmilyNeural)", + "ShortName": "en-IE-EmilyNeural", + "Gender": "Female", + "Locale": "en-IE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emily Online (Natural) - English (Ireland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-KE, AsiliaNeural)", + "ShortName": "en-KE-AsiliaNeural", + "Gender": "Female", + "Locale": "en-KE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Asilia Online (Natural) - English (Kenya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-KE, ChilembaNeural)", + "ShortName": "en-KE-ChilembaNeural", + "Gender": "Male", + "Locale": "en-KE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Chilemba Online (Natural) - English (Kenya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-NZ, MitchellNeural)", + "ShortName": "en-NZ-MitchellNeural", + "Gender": "Male", + 
"Locale": "en-NZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mitchell Online (Natural) - English (New Zealand)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-NZ, MollyNeural)", + "ShortName": "en-NZ-MollyNeural", + "Gender": "Female", + "Locale": "en-NZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Molly Online (Natural) - English (New Zealand)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-NG, AbeoNeural)", + "ShortName": "en-NG-AbeoNeural", + "Gender": "Male", + "Locale": "en-NG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Abeo Online (Natural) - English (Nigeria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-NG, EzinneNeural)", + "ShortName": "en-NG-EzinneNeural", + "Gender": "Female", + "Locale": "en-NG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ezinne Online (Natural) - English (Nigeria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-PH, JamesNeural)", + "ShortName": "en-PH-JamesNeural", + "Gender": "Male", + "Locale": "en-PH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft James Online (Natural) - English (Philippines)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + 
"Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-PH, RosaNeural)", + "ShortName": "en-PH-RosaNeural", + "Gender": "Female", + "Locale": "en-PH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rosa Online (Natural) - English (Philippines)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-SG, LunaNeural)", + "ShortName": "en-SG-LunaNeural", + "Gender": "Female", + "Locale": "en-SG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Luna Online (Natural) - English (Singapore)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-SG, WayneNeural)", + "ShortName": "en-SG-WayneNeural", + "Gender": "Male", + "Locale": "en-SG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Wayne Online (Natural) - English (Singapore)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AvaMultilingualNeural)", + "ShortName": "en-US-AvaMultilingualNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft AvaMultilingual Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Expressive", + "Caring", + "Pleasant", + "Friendly" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AndrewMultilingualNeural)", + "ShortName": "en-US-AndrewMultilingualNeural", + 
"Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft AndrewMultilingual Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Warm", + "Confident", + "Authentic", + "Honest" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, EmmaMultilingualNeural)", + "ShortName": "en-US-EmmaMultilingualNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft EmmaMultilingual Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Cheerful", + "Clear", + "Conversational" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, BrianMultilingualNeural)", + "ShortName": "en-US-BrianMultilingualNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft BrianMultilingual Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Approachable", + "Casual", + "Sincere" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AvaNeural)", + "ShortName": "en-US-AvaNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ava Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Expressive", + "Caring", + "Pleasant", + "Friendly" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AndrewNeural)", + "ShortName": "en-US-AndrewNeural", + "Gender": 
"Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Andrew Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Warm", + "Confident", + "Authentic", + "Honest" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, EmmaNeural)", + "ShortName": "en-US-EmmaNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emma Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Cheerful", + "Clear", + "Conversational" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, BrianNeural)", + "ShortName": "en-US-BrianNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Brian Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Approachable", + "Casual", + "Sincere" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-ZA, LeahNeural)", + "ShortName": "en-ZA-LeahNeural", + "Gender": "Female", + "Locale": "en-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Leah Online (Natural) - English (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-ZA, LukeNeural)", + "ShortName": "en-ZA-LukeNeural", + "Gender": "Male", + "Locale": "en-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Luke Online (Natural) - English 
(South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-TZ, ElimuNeural)", + "ShortName": "en-TZ-ElimuNeural", + "Gender": "Male", + "Locale": "en-TZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Elimu Online (Natural) - English (Tanzania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-TZ, ImaniNeural)", + "ShortName": "en-TZ-ImaniNeural", + "Gender": "Female", + "Locale": "en-TZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Imani Online (Natural) - English (Tanzania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, LibbyNeural)", + "ShortName": "en-GB-LibbyNeural", + "Gender": "Female", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Libby Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, MaisieNeural)", + "ShortName": "en-GB-MaisieNeural", + "Gender": "Female", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Maisie Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, RyanNeural)", + 
"ShortName": "en-GB-RyanNeural", + "Gender": "Male", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ryan Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, SoniaNeural)", + "ShortName": "en-GB-SoniaNeural", + "Gender": "Female", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sonia Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, ThomasNeural)", + "ShortName": "en-GB-ThomasNeural", + "Gender": "Male", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thomas Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AnaNeural)", + "ShortName": "en-US-AnaNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ana Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Cartoon", + "Conversation" + ], + "VoicePersonalities": [ + "Cute" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)", + "ShortName": "en-US-AriaNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aria Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + 
"ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Positive", + "Confident" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, ChristopherNeural)", + "ShortName": "en-US-ChristopherNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Christopher Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Reliable", + "Authority" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, EricNeural)", + "ShortName": "en-US-EricNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Eric Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Rational" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, GuyNeural)", + "ShortName": "en-US-GuyNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Guy Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Passion" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)", + "ShortName": "en-US-JennyNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jenny Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Considerate", + "Comfort" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, MichelleNeural)", + "ShortName": 
"en-US-MichelleNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Michelle Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Friendly", + "Pleasant" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, RogerNeural)", + "ShortName": "en-US-RogerNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Roger Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Lively" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, SteffanNeural)", + "ShortName": "en-US-SteffanNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Steffan Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Rational" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (et-EE, AnuNeural)", + "ShortName": "et-EE-AnuNeural", + "Gender": "Female", + "Locale": "et-EE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Anu Online (Natural) - Estonian (Estonia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (et-EE, KertNeural)", + "ShortName": "et-EE-KertNeural", + "Gender": "Male", + "Locale": "et-EE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Kert Online (Natural) - Estonian (Estonia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + 
"General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fil-PH, AngeloNeural)", + "ShortName": "fil-PH-AngeloNeural", + "Gender": "Male", + "Locale": "fil-PH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Angelo Online (Natural) - Filipino (Philippines)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fil-PH, BlessicaNeural)", + "ShortName": "fil-PH-BlessicaNeural", + "Gender": "Female", + "Locale": "fil-PH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Blessica Online (Natural) - Filipino (Philippines)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fi-FI, HarriNeural)", + "ShortName": "fi-FI-HarriNeural", + "Gender": "Male", + "Locale": "fi-FI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Harri Online (Natural) - Finnish (Finland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fi-FI, NooraNeural)", + "ShortName": "fi-FI-NooraNeural", + "Gender": "Female", + "Locale": "fi-FI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Noora Online (Natural) - Finnish (Finland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-BE, CharlineNeural)", + "ShortName": "fr-BE-CharlineNeural", + "Gender": "Female", + "Locale": 
"fr-BE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Charline Online (Natural) - French (Belgium)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-BE, GerardNeural)", + "ShortName": "fr-BE-GerardNeural", + "Gender": "Male", + "Locale": "fr-BE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gerard Online (Natural) - French (Belgium)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, ThierryNeural)", + "ShortName": "fr-CA-ThierryNeural", + "Gender": "Male", + "Locale": "fr-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thierry Online (Natural) - French (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, AntoineNeural)", + "ShortName": "fr-CA-AntoineNeural", + "Gender": "Male", + "Locale": "fr-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Antoine Online (Natural) - French (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, JeanNeural)", + "ShortName": "fr-CA-JeanNeural", + "Gender": "Male", + "Locale": "fr-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jean Online (Natural) - French (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + 
] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, SylvieNeural)", + "ShortName": "fr-CA-SylvieNeural", + "Gender": "Female", + "Locale": "fr-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sylvie Online (Natural) - French (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, VivienneMultilingualNeural)", + "ShortName": "fr-FR-VivienneMultilingualNeural", + "Gender": "Female", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft VivienneMultilingual Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, RemyMultilingualNeural)", + "ShortName": "fr-FR-RemyMultilingualNeural", + "Gender": "Male", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft RemyMultilingual Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, DeniseNeural)", + "ShortName": "fr-FR-DeniseNeural", + "Gender": "Female", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Denise Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, EloiseNeural)", + "ShortName": "fr-FR-EloiseNeural", + "Gender": "Female", + "Locale": "fr-FR", + 
"SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Eloise Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, HenriNeural)", + "ShortName": "fr-FR-HenriNeural", + "Gender": "Male", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Henri Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CH, ArianeNeural)", + "ShortName": "fr-CH-ArianeNeural", + "Gender": "Female", + "Locale": "fr-CH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ariane Online (Natural) - French (Switzerland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CH, FabriceNeural)", + "ShortName": "fr-CH-FabriceNeural", + "Gender": "Male", + "Locale": "fr-CH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Fabrice Online (Natural) - French (Switzerland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (gl-ES, RoiNeural)", + "ShortName": "gl-ES-RoiNeural", + "Gender": "Male", + "Locale": "gl-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Roi Online (Natural) - Galician", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + 
"Name": "Microsoft Server Speech Text to Speech Voice (gl-ES, SabelaNeural)", + "ShortName": "gl-ES-SabelaNeural", + "Gender": "Female", + "Locale": "gl-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sabela Online (Natural) - Galician", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ka-GE, EkaNeural)", + "ShortName": "ka-GE-EkaNeural", + "Gender": "Female", + "Locale": "ka-GE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Eka Online (Natural) - Georgian (Georgia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ka-GE, GiorgiNeural)", + "ShortName": "ka-GE-GiorgiNeural", + "Gender": "Male", + "Locale": "ka-GE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Giorgi Online (Natural) - Georgian (Georgia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-AT, IngridNeural)", + "ShortName": "de-AT-IngridNeural", + "Gender": "Female", + "Locale": "de-AT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ingrid Online (Natural) - German (Austria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-AT, JonasNeural)", + "ShortName": "de-AT-JonasNeural", + "Gender": "Male", + "Locale": "de-AT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jonas Online (Natural) - German 
(Austria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, SeraphinaMultilingualNeural)", + "ShortName": "de-DE-SeraphinaMultilingualNeural", + "Gender": "Female", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft SeraphinaMultilingual Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, FlorianMultilingualNeural)", + "ShortName": "de-DE-FlorianMultilingualNeural", + "Gender": "Male", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft FlorianMultilingual Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, AmalaNeural)", + "ShortName": "de-DE-AmalaNeural", + "Gender": "Female", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Amala Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, ConradNeural)", + "ShortName": "de-DE-ConradNeural", + "Gender": "Male", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Conrad Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft 
Server Speech Text to Speech Voice (de-DE, KatjaNeural)", + "ShortName": "de-DE-KatjaNeural", + "Gender": "Female", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Katja Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, KillianNeural)", + "ShortName": "de-DE-KillianNeural", + "Gender": "Male", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Killian Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-CH, JanNeural)", + "ShortName": "de-CH-JanNeural", + "Gender": "Male", + "Locale": "de-CH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jan Online (Natural) - German (Switzerland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-CH, LeniNeural)", + "ShortName": "de-CH-LeniNeural", + "Gender": "Female", + "Locale": "de-CH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Leni Online (Natural) - German (Switzerland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (el-GR, AthinaNeural)", + "ShortName": "el-GR-AthinaNeural", + "Gender": "Female", + "Locale": "el-GR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Athina Online (Natural) - Greek (Greece)", + 
"Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (el-GR, NestorasNeural)", + "ShortName": "el-GR-NestorasNeural", + "Gender": "Male", + "Locale": "el-GR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nestoras Online (Natural) - Greek (Greece)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (gu-IN, DhwaniNeural)", + "ShortName": "gu-IN-DhwaniNeural", + "Gender": "Female", + "Locale": "gu-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dhwani Online (Natural) - Gujarati (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (gu-IN, NiranjanNeural)", + "ShortName": "gu-IN-NiranjanNeural", + "Gender": "Male", + "Locale": "gu-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Niranjan Online (Natural) - Gujarati (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (he-IL, AvriNeural)", + "ShortName": "he-IL-AvriNeural", + "Gender": "Male", + "Locale": "he-IL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Avri Online (Natural) - Hebrew (Israel)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (he-IL, HilaNeural)", + "ShortName": "he-IL-HilaNeural", + 
"Gender": "Female", + "Locale": "he-IL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hila Online (Natural) - Hebrew (Israel)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hi-IN, MadhurNeural)", + "ShortName": "hi-IN-MadhurNeural", + "Gender": "Male", + "Locale": "hi-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Madhur Online (Natural) - Hindi (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hi-IN, SwaraNeural)", + "ShortName": "hi-IN-SwaraNeural", + "Gender": "Female", + "Locale": "hi-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Swara Online (Natural) - Hindi (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hu-HU, NoemiNeural)", + "ShortName": "hu-HU-NoemiNeural", + "Gender": "Female", + "Locale": "hu-HU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Noemi Online (Natural) - Hungarian (Hungary)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hu-HU, TamasNeural)", + "ShortName": "hu-HU-TamasNeural", + "Gender": "Male", + "Locale": "hu-HU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tamas Online (Natural) - Hungarian (Hungary)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + 
"Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (is-IS, GudrunNeural)", + "ShortName": "is-IS-GudrunNeural", + "Gender": "Female", + "Locale": "is-IS", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gudrun Online (Natural) - Icelandic (Iceland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (is-IS, GunnarNeural)", + "ShortName": "is-IS-GunnarNeural", + "Gender": "Male", + "Locale": "is-IS", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gunnar Online (Natural) - Icelandic (Iceland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (id-ID, ArdiNeural)", + "ShortName": "id-ID-ArdiNeural", + "Gender": "Male", + "Locale": "id-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ardi Online (Natural) - Indonesian (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (id-ID, GadisNeural)", + "ShortName": "id-ID-GadisNeural", + "Gender": "Female", + "Locale": "id-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gadis Online (Natural) - Indonesian (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (iu-Latn-CA, SiqiniqNeural)", + "ShortName": "iu-Latn-CA-SiqiniqNeural", + "Gender": "Female", + "Locale": "iu-Latn-CA", + "SuggestedCodec": 
"audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Siqiniq Online (Natural) - Inuktitut (Latin, Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (iu-Latn-CA, TaqqiqNeural)", + "ShortName": "iu-Latn-CA-TaqqiqNeural", + "Gender": "Male", + "Locale": "iu-Latn-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Taqqiq Online (Natural) - Inuktitut (Latin, Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (iu-Cans-CA, SiqiniqNeural)", + "ShortName": "iu-Cans-CA-SiqiniqNeural", + "Gender": "Female", + "Locale": "iu-Cans-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Siqiniq Online (Natural) - Inuktitut (Syllabics, Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (iu-Cans-CA, TaqqiqNeural)", + "ShortName": "iu-Cans-CA-TaqqiqNeural", + "Gender": "Male", + "Locale": "iu-Cans-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Taqqiq Online (Natural) - Inuktitut (Syllabics, Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ga-IE, ColmNeural)", + "ShortName": "ga-IE-ColmNeural", + "Gender": "Male", + "Locale": "ga-IE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Colm Online (Natural) - Irish (Ireland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" 
+ ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ga-IE, OrlaNeural)", + "ShortName": "ga-IE-OrlaNeural", + "Gender": "Female", + "Locale": "ga-IE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Orla Online (Natural) - Irish (Ireland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, GiuseppeMultilingualNeural)", + "ShortName": "it-IT-GiuseppeMultilingualNeural", + "Gender": "Male", + "Locale": "it-IT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft GiuseppeMultilingual Online (Natural) - Italian (Italy)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, DiegoNeural)", + "ShortName": "it-IT-DiegoNeural", + "Gender": "Male", + "Locale": "it-IT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Diego Online (Natural) - Italian (Italy)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, ElsaNeural)", + "ShortName": "it-IT-ElsaNeural", + "Gender": "Female", + "Locale": "it-IT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Elsa Online (Natural) - Italian (Italy)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, IsabellaNeural)", + "ShortName": "it-IT-IsabellaNeural", + "Gender": "Female", + "Locale": "it-IT", + 
"SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Isabella Online (Natural) - Italian (Italy)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ja-JP, KeitaNeural)", + "ShortName": "ja-JP-KeitaNeural", + "Gender": "Male", + "Locale": "ja-JP", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Keita Online (Natural) - Japanese (Japan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ja-JP, NanamiNeural)", + "ShortName": "ja-JP-NanamiNeural", + "Gender": "Female", + "Locale": "ja-JP", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nanami Online (Natural) - Japanese (Japan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (jv-ID, DimasNeural)", + "ShortName": "jv-ID-DimasNeural", + "Gender": "Male", + "Locale": "jv-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dimas Online (Natural) - Javanese (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (jv-ID, SitiNeural)", + "ShortName": "jv-ID-SitiNeural", + "Gender": "Female", + "Locale": "jv-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Siti Online (Natural) - Javanese (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + 
}, + { + "Name": "Microsoft Server Speech Text to Speech Voice (kn-IN, GaganNeural)", + "ShortName": "kn-IN-GaganNeural", + "Gender": "Male", + "Locale": "kn-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gagan Online (Natural) - Kannada (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (kn-IN, SapnaNeural)", + "ShortName": "kn-IN-SapnaNeural", + "Gender": "Female", + "Locale": "kn-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sapna Online (Natural) - Kannada (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (kk-KZ, AigulNeural)", + "ShortName": "kk-KZ-AigulNeural", + "Gender": "Female", + "Locale": "kk-KZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aigul Online (Natural) - Kazakh (Kazakhstan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (kk-KZ, DauletNeural)", + "ShortName": "kk-KZ-DauletNeural", + "Gender": "Male", + "Locale": "kk-KZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Daulet Online (Natural) - Kazakh (Kazakhstan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (km-KH, PisethNeural)", + "ShortName": "km-KH-PisethNeural", + "Gender": "Male", + "Locale": "km-KH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Piseth Online 
(Natural) - Khmer (Cambodia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (km-KH, SreymomNeural)", + "ShortName": "km-KH-SreymomNeural", + "Gender": "Female", + "Locale": "km-KH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sreymom Online (Natural) - Khmer (Cambodia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ko-KR, HyunsuMultilingualNeural)", + "ShortName": "ko-KR-HyunsuMultilingualNeural", + "Gender": "Male", + "Locale": "ko-KR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HyunsuMultilingual Online (Natural) - Korean (Korea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ko-KR, InJoonNeural)", + "ShortName": "ko-KR-InJoonNeural", + "Gender": "Male", + "Locale": "ko-KR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft InJoon Online (Natural) - Korean (Korea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ko-KR, SunHiNeural)", + "ShortName": "ko-KR-SunHiNeural", + "Gender": "Female", + "Locale": "ko-KR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft SunHi Online (Natural) - Korean (Korea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice 
(lo-LA, ChanthavongNeural)", + "ShortName": "lo-LA-ChanthavongNeural", + "Gender": "Male", + "Locale": "lo-LA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Chanthavong Online (Natural) - Lao (Laos)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lo-LA, KeomanyNeural)", + "ShortName": "lo-LA-KeomanyNeural", + "Gender": "Female", + "Locale": "lo-LA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Keomany Online (Natural) - Lao (Laos)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lv-LV, EveritaNeural)", + "ShortName": "lv-LV-EveritaNeural", + "Gender": "Female", + "Locale": "lv-LV", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Everita Online (Natural) - Latvian (Latvia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lv-LV, NilsNeural)", + "ShortName": "lv-LV-NilsNeural", + "Gender": "Male", + "Locale": "lv-LV", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nils Online (Natural) - Latvian (Latvia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lt-LT, LeonasNeural)", + "ShortName": "lt-LT-LeonasNeural", + "Gender": "Male", + "Locale": "lt-LT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Leonas Online (Natural) - Lithuanian (Lithuania)", + "Status": "GA", + 
"VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lt-LT, OnaNeural)", + "ShortName": "lt-LT-OnaNeural", + "Gender": "Female", + "Locale": "lt-LT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ona Online (Natural) - Lithuanian (Lithuania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mk-MK, AleksandarNeural)", + "ShortName": "mk-MK-AleksandarNeural", + "Gender": "Male", + "Locale": "mk-MK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aleksandar Online (Natural) - Macedonian (North Macedonia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mk-MK, MarijaNeural)", + "ShortName": "mk-MK-MarijaNeural", + "Gender": "Female", + "Locale": "mk-MK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Marija Online (Natural) - Macedonian (North Macedonia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ms-MY, OsmanNeural)", + "ShortName": "ms-MY-OsmanNeural", + "Gender": "Male", + "Locale": "ms-MY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Osman Online (Natural) - Malay (Malaysia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ms-MY, YasminNeural)", + "ShortName": 
"ms-MY-YasminNeural", + "Gender": "Female", + "Locale": "ms-MY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yasmin Online (Natural) - Malay (Malaysia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ml-IN, MidhunNeural)", + "ShortName": "ml-IN-MidhunNeural", + "Gender": "Male", + "Locale": "ml-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Midhun Online (Natural) - Malayalam (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ml-IN, SobhanaNeural)", + "ShortName": "ml-IN-SobhanaNeural", + "Gender": "Female", + "Locale": "ml-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sobhana Online (Natural) - Malayalam (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mt-MT, GraceNeural)", + "ShortName": "mt-MT-GraceNeural", + "Gender": "Female", + "Locale": "mt-MT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Grace Online (Natural) - Maltese (Malta)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mt-MT, JosephNeural)", + "ShortName": "mt-MT-JosephNeural", + "Gender": "Male", + "Locale": "mt-MT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Joseph Online (Natural) - Maltese (Malta)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + 
"General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mr-IN, AarohiNeural)", + "ShortName": "mr-IN-AarohiNeural", + "Gender": "Female", + "Locale": "mr-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aarohi Online (Natural) - Marathi (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mr-IN, ManoharNeural)", + "ShortName": "mr-IN-ManoharNeural", + "Gender": "Male", + "Locale": "mr-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Manohar Online (Natural) - Marathi (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mn-MN, BataaNeural)", + "ShortName": "mn-MN-BataaNeural", + "Gender": "Male", + "Locale": "mn-MN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Bataa Online (Natural) - Mongolian (Mongolia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mn-MN, YesuiNeural)", + "ShortName": "mn-MN-YesuiNeural", + "Gender": "Female", + "Locale": "mn-MN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yesui Online (Natural) - Mongolian (Mongolia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ne-NP, HemkalaNeural)", + "ShortName": "ne-NP-HemkalaNeural", + "Gender": "Female", + "Locale": "ne-NP", + 
"SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hemkala Online (Natural) - Nepali (Nepal)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ne-NP, SagarNeural)", + "ShortName": "ne-NP-SagarNeural", + "Gender": "Male", + "Locale": "ne-NP", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sagar Online (Natural) - Nepali (Nepal)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nb-NO, FinnNeural)", + "ShortName": "nb-NO-FinnNeural", + "Gender": "Male", + "Locale": "nb-NO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Finn Online (Natural) - Norwegian (Bokmål, Norway)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nb-NO, PernilleNeural)", + "ShortName": "nb-NO-PernilleNeural", + "Gender": "Female", + "Locale": "nb-NO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Pernille Online (Natural) - Norwegian (Bokmål, Norway)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ps-AF, GulNawazNeural)", + "ShortName": "ps-AF-GulNawazNeural", + "Gender": "Male", + "Locale": "ps-AF", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft GulNawaz Online (Natural) - Pashto (Afghanistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", 
+ "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ps-AF, LatifaNeural)", + "ShortName": "ps-AF-LatifaNeural", + "Gender": "Female", + "Locale": "ps-AF", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Latifa Online (Natural) - Pashto (Afghanistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fa-IR, DilaraNeural)", + "ShortName": "fa-IR-DilaraNeural", + "Gender": "Female", + "Locale": "fa-IR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dilara Online (Natural) - Persian (Iran)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fa-IR, FaridNeural)", + "ShortName": "fa-IR-FaridNeural", + "Gender": "Male", + "Locale": "fa-IR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Farid Online (Natural) - Persian (Iran)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pl-PL, MarekNeural)", + "ShortName": "pl-PL-MarekNeural", + "Gender": "Male", + "Locale": "pl-PL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Marek Online (Natural) - Polish (Poland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pl-PL, ZofiaNeural)", + "ShortName": "pl-PL-ZofiaNeural", + "Gender": "Female", + "Locale": "pl-PL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": 
"Microsoft Zofia Online (Natural) - Polish (Poland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-BR, ThalitaMultilingualNeural)", + "ShortName": "pt-BR-ThalitaMultilingualNeural", + "Gender": "Female", + "Locale": "pt-BR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft ThalitaMultilingual Online (Natural) - Portuguese (Brazil)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-BR, AntonioNeural)", + "ShortName": "pt-BR-AntonioNeural", + "Gender": "Male", + "Locale": "pt-BR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Antonio Online (Natural) - Portuguese (Brazil)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-BR, FranciscaNeural)", + "ShortName": "pt-BR-FranciscaNeural", + "Gender": "Female", + "Locale": "pt-BR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Francisca Online (Natural) - Portuguese (Brazil)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-PT, DuarteNeural)", + "ShortName": "pt-PT-DuarteNeural", + "Gender": "Male", + "Locale": "pt-PT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Duarte Online (Natural) - Portuguese (Portugal)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + 
{ + "Name": "Microsoft Server Speech Text to Speech Voice (pt-PT, RaquelNeural)", + "ShortName": "pt-PT-RaquelNeural", + "Gender": "Female", + "Locale": "pt-PT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Raquel Online (Natural) - Portuguese (Portugal)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ro-RO, AlinaNeural)", + "ShortName": "ro-RO-AlinaNeural", + "Gender": "Female", + "Locale": "ro-RO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Alina Online (Natural) - Romanian (Romania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ro-RO, EmilNeural)", + "ShortName": "ro-RO-EmilNeural", + "Gender": "Male", + "Locale": "ro-RO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emil Online (Natural) - Romanian (Romania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ru-RU, DmitryNeural)", + "ShortName": "ru-RU-DmitryNeural", + "Gender": "Male", + "Locale": "ru-RU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dmitry Online (Natural) - Russian (Russia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ru-RU, SvetlanaNeural)", + "ShortName": "ru-RU-SvetlanaNeural", + "Gender": "Female", + "Locale": "ru-RU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Svetlana 
Online (Natural) - Russian (Russia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sr-RS, NicholasNeural)", + "ShortName": "sr-RS-NicholasNeural", + "Gender": "Male", + "Locale": "sr-RS", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nicholas Online (Natural) - Serbian (Serbia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sr-RS, SophieNeural)", + "ShortName": "sr-RS-SophieNeural", + "Gender": "Female", + "Locale": "sr-RS", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sophie Online (Natural) - Serbian (Serbia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (si-LK, SameeraNeural)", + "ShortName": "si-LK-SameeraNeural", + "Gender": "Male", + "Locale": "si-LK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sameera Online (Natural) - Sinhala (Sri Lanka)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (si-LK, ThiliniNeural)", + "ShortName": "si-LK-ThiliniNeural", + "Gender": "Female", + "Locale": "si-LK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thilini Online (Natural) - Sinhala (Sri Lanka)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice 
(sk-SK, LukasNeural)", + "ShortName": "sk-SK-LukasNeural", + "Gender": "Male", + "Locale": "sk-SK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Lukas Online (Natural) - Slovak (Slovakia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sk-SK, ViktoriaNeural)", + "ShortName": "sk-SK-ViktoriaNeural", + "Gender": "Female", + "Locale": "sk-SK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Viktoria Online (Natural) - Slovak (Slovakia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sl-SI, PetraNeural)", + "ShortName": "sl-SI-PetraNeural", + "Gender": "Female", + "Locale": "sl-SI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Petra Online (Natural) - Slovenian (Slovenia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sl-SI, RokNeural)", + "ShortName": "sl-SI-RokNeural", + "Gender": "Male", + "Locale": "sl-SI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rok Online (Natural) - Slovenian (Slovenia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (so-SO, MuuseNeural)", + "ShortName": "so-SO-MuuseNeural", + "Gender": "Male", + "Locale": "so-SO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Muuse Online (Natural) - Somali (Somalia)", + "Status": "GA", + "VoiceTag": { 
+ "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (so-SO, UbaxNeural)", + "ShortName": "so-SO-UbaxNeural", + "Gender": "Female", + "Locale": "so-SO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ubax Online (Natural) - Somali (Somalia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-AR, ElenaNeural)", + "ShortName": "es-AR-ElenaNeural", + "Gender": "Female", + "Locale": "es-AR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Elena Online (Natural) - Spanish (Argentina)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-AR, TomasNeural)", + "ShortName": "es-AR-TomasNeural", + "Gender": "Male", + "Locale": "es-AR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tomas Online (Natural) - Spanish (Argentina)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-BO, MarceloNeural)", + "ShortName": "es-BO-MarceloNeural", + "Gender": "Male", + "Locale": "es-BO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Marcelo Online (Natural) - Spanish (Bolivia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-BO, SofiaNeural)", + "ShortName": "es-BO-SofiaNeural", + "Gender": "Female", + "Locale": 
"es-BO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sofia Online (Natural) - Spanish (Bolivia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CL, CatalinaNeural)", + "ShortName": "es-CL-CatalinaNeural", + "Gender": "Female", + "Locale": "es-CL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Catalina Online (Natural) - Spanish (Chile)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CL, LorenzoNeural)", + "ShortName": "es-CL-LorenzoNeural", + "Gender": "Male", + "Locale": "es-CL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Lorenzo Online (Natural) - Spanish (Chile)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CO, GonzaloNeural)", + "ShortName": "es-CO-GonzaloNeural", + "Gender": "Male", + "Locale": "es-CO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gonzalo Online (Natural) - Spanish (Colombia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CO, SalomeNeural)", + "ShortName": "es-CO-SalomeNeural", + "Gender": "Female", + "Locale": "es-CO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Salome Online (Natural) - Spanish (Colombia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + 
"Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-ES, XimenaNeural)", + "ShortName": "es-ES-XimenaNeural", + "Gender": "Female", + "Locale": "es-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ximena Online (Natural) - Spanish (Colombia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CR, JuanNeural)", + "ShortName": "es-CR-JuanNeural", + "Gender": "Male", + "Locale": "es-CR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Juan Online (Natural) - Spanish (Costa Rica)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CR, MariaNeural)", + "ShortName": "es-CR-MariaNeural", + "Gender": "Female", + "Locale": "es-CR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Maria Online (Natural) - Spanish (Costa Rica)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CU, BelkysNeural)", + "ShortName": "es-CU-BelkysNeural", + "Gender": "Female", + "Locale": "es-CU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Belkys Online (Natural) - Spanish (Cuba)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CU, ManuelNeural)", + "ShortName": "es-CU-ManuelNeural", + "Gender": "Male", + "Locale": "es-CU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + 
"FriendlyName": "Microsoft Manuel Online (Natural) - Spanish (Cuba)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-DO, EmilioNeural)", + "ShortName": "es-DO-EmilioNeural", + "Gender": "Male", + "Locale": "es-DO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emilio Online (Natural) - Spanish (Dominican Republic)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-DO, RamonaNeural)", + "ShortName": "es-DO-RamonaNeural", + "Gender": "Female", + "Locale": "es-DO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ramona Online (Natural) - Spanish (Dominican Republic)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-EC, AndreaNeural)", + "ShortName": "es-EC-AndreaNeural", + "Gender": "Female", + "Locale": "es-EC", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Andrea Online (Natural) - Spanish (Ecuador)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-EC, LuisNeural)", + "ShortName": "es-EC-LuisNeural", + "Gender": "Male", + "Locale": "es-EC", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Luis Online (Natural) - Spanish (Ecuador)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server 
Speech Text to Speech Voice (es-SV, LorenaNeural)", + "ShortName": "es-SV-LorenaNeural", + "Gender": "Female", + "Locale": "es-SV", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Lorena Online (Natural) - Spanish (El Salvador)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-SV, RodrigoNeural)", + "ShortName": "es-SV-RodrigoNeural", + "Gender": "Male", + "Locale": "es-SV", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rodrigo Online (Natural) - Spanish (El Salvador)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-GQ, JavierNeural)", + "ShortName": "es-GQ-JavierNeural", + "Gender": "Male", + "Locale": "es-GQ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Javier Online (Natural) - Spanish (Equatorial Guinea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-GQ, TeresaNeural)", + "ShortName": "es-GQ-TeresaNeural", + "Gender": "Female", + "Locale": "es-GQ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Teresa Online (Natural) - Spanish (Equatorial Guinea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-GT, AndresNeural)", + "ShortName": "es-GT-AndresNeural", + "Gender": "Male", + "Locale": "es-GT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Andres Online 
(Natural) - Spanish (Guatemala)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-GT, MartaNeural)", + "ShortName": "es-GT-MartaNeural", + "Gender": "Female", + "Locale": "es-GT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Marta Online (Natural) - Spanish (Guatemala)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-HN, CarlosNeural)", + "ShortName": "es-HN-CarlosNeural", + "Gender": "Male", + "Locale": "es-HN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Carlos Online (Natural) - Spanish (Honduras)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-HN, KarlaNeural)", + "ShortName": "es-HN-KarlaNeural", + "Gender": "Female", + "Locale": "es-HN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Karla Online (Natural) - Spanish (Honduras)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-MX, DaliaNeural)", + "ShortName": "es-MX-DaliaNeural", + "Gender": "Female", + "Locale": "es-MX", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dalia Online (Natural) - Spanish (Mexico)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-MX, JorgeNeural)", + 
"ShortName": "es-MX-JorgeNeural", + "Gender": "Male", + "Locale": "es-MX", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jorge Online (Natural) - Spanish (Mexico)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-NI, FedericoNeural)", + "ShortName": "es-NI-FedericoNeural", + "Gender": "Male", + "Locale": "es-NI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Federico Online (Natural) - Spanish (Nicaragua)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-NI, YolandaNeural)", + "ShortName": "es-NI-YolandaNeural", + "Gender": "Female", + "Locale": "es-NI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yolanda Online (Natural) - Spanish (Nicaragua)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PA, MargaritaNeural)", + "ShortName": "es-PA-MargaritaNeural", + "Gender": "Female", + "Locale": "es-PA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Margarita Online (Natural) - Spanish (Panama)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PA, RobertoNeural)", + "ShortName": "es-PA-RobertoNeural", + "Gender": "Male", + "Locale": "es-PA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Roberto Online (Natural) - Spanish (Panama)", + "Status": "GA", + "VoiceTag": { 
+ "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PY, MarioNeural)", + "ShortName": "es-PY-MarioNeural", + "Gender": "Male", + "Locale": "es-PY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mario Online (Natural) - Spanish (Paraguay)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PY, TaniaNeural)", + "ShortName": "es-PY-TaniaNeural", + "Gender": "Female", + "Locale": "es-PY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tania Online (Natural) - Spanish (Paraguay)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PE, AlexNeural)", + "ShortName": "es-PE-AlexNeural", + "Gender": "Male", + "Locale": "es-PE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Alex Online (Natural) - Spanish (Peru)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PE, CamilaNeural)", + "ShortName": "es-PE-CamilaNeural", + "Gender": "Female", + "Locale": "es-PE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Camila Online (Natural) - Spanish (Peru)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PR, KarinaNeural)", + "ShortName": "es-PR-KarinaNeural", + "Gender": "Female", + "Locale": "es-PR", + 
"SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Karina Online (Natural) - Spanish (Puerto Rico)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PR, VictorNeural)", + "ShortName": "es-PR-VictorNeural", + "Gender": "Male", + "Locale": "es-PR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Victor Online (Natural) - Spanish (Puerto Rico)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-ES, AlvaroNeural)", + "ShortName": "es-ES-AlvaroNeural", + "Gender": "Male", + "Locale": "es-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Alvaro Online (Natural) - Spanish (Spain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-ES, ElviraNeural)", + "ShortName": "es-ES-ElviraNeural", + "Gender": "Female", + "Locale": "es-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Elvira Online (Natural) - Spanish (Spain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-US, AlonsoNeural)", + "ShortName": "es-US-AlonsoNeural", + "Gender": "Male", + "Locale": "es-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Alonso Online (Natural) - Spanish (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + 
"Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-US, PalomaNeural)", + "ShortName": "es-US-PalomaNeural", + "Gender": "Female", + "Locale": "es-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Paloma Online (Natural) - Spanish (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-UY, MateoNeural)", + "ShortName": "es-UY-MateoNeural", + "Gender": "Male", + "Locale": "es-UY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mateo Online (Natural) - Spanish (Uruguay)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-UY, ValentinaNeural)", + "ShortName": "es-UY-ValentinaNeural", + "Gender": "Female", + "Locale": "es-UY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Valentina Online (Natural) - Spanish (Uruguay)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-VE, PaolaNeural)", + "ShortName": "es-VE-PaolaNeural", + "Gender": "Female", + "Locale": "es-VE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Paola Online (Natural) - Spanish (Venezuela)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-VE, SebastianNeural)", + "ShortName": "es-VE-SebastianNeural", + "Gender": "Male", + "Locale": "es-VE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", 
+ "FriendlyName": "Microsoft Sebastian Online (Natural) - Spanish (Venezuela)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (su-ID, JajangNeural)", + "ShortName": "su-ID-JajangNeural", + "Gender": "Male", + "Locale": "su-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jajang Online (Natural) - Sundanese (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (su-ID, TutiNeural)", + "ShortName": "su-ID-TutiNeural", + "Gender": "Female", + "Locale": "su-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tuti Online (Natural) - Sundanese (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sw-KE, RafikiNeural)", + "ShortName": "sw-KE-RafikiNeural", + "Gender": "Male", + "Locale": "sw-KE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rafiki Online (Natural) - Swahili (Kenya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sw-KE, ZuriNeural)", + "ShortName": "sw-KE-ZuriNeural", + "Gender": "Female", + "Locale": "sw-KE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Zuri Online (Natural) - Swahili (Kenya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text 
to Speech Voice (sw-TZ, DaudiNeural)", + "ShortName": "sw-TZ-DaudiNeural", + "Gender": "Male", + "Locale": "sw-TZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Daudi Online (Natural) - Swahili (Tanzania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sw-TZ, RehemaNeural)", + "ShortName": "sw-TZ-RehemaNeural", + "Gender": "Female", + "Locale": "sw-TZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rehema Online (Natural) - Swahili (Tanzania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sv-SE, MattiasNeural)", + "ShortName": "sv-SE-MattiasNeural", + "Gender": "Male", + "Locale": "sv-SE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mattias Online (Natural) - Swedish (Sweden)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sv-SE, SofieNeural)", + "ShortName": "sv-SE-SofieNeural", + "Gender": "Female", + "Locale": "sv-SE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sofie Online (Natural) - Swedish (Sweden)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-IN, PallaviNeural)", + "ShortName": "ta-IN-PallaviNeural", + "Gender": "Female", + "Locale": "ta-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Pallavi Online (Natural) - Tamil (India)", + "Status": 
"GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-IN, ValluvarNeural)", + "ShortName": "ta-IN-ValluvarNeural", + "Gender": "Male", + "Locale": "ta-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Valluvar Online (Natural) - Tamil (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-MY, KaniNeural)", + "ShortName": "ta-MY-KaniNeural", + "Gender": "Female", + "Locale": "ta-MY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Kani Online (Natural) - Tamil (Malaysia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-MY, SuryaNeural)", + "ShortName": "ta-MY-SuryaNeural", + "Gender": "Male", + "Locale": "ta-MY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Surya Online (Natural) - Tamil (Malaysia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-SG, AnbuNeural)", + "ShortName": "ta-SG-AnbuNeural", + "Gender": "Male", + "Locale": "ta-SG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Anbu Online (Natural) - Tamil (Singapore)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-SG, VenbaNeural)", + "ShortName": "ta-SG-VenbaNeural", + "Gender": "Female", + 
"Locale": "ta-SG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Venba Online (Natural) - Tamil (Singapore)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-LK, KumarNeural)", + "ShortName": "ta-LK-KumarNeural", + "Gender": "Male", + "Locale": "ta-LK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Kumar Online (Natural) - Tamil (Sri Lanka)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-LK, SaranyaNeural)", + "ShortName": "ta-LK-SaranyaNeural", + "Gender": "Female", + "Locale": "ta-LK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Saranya Online (Natural) - Tamil (Sri Lanka)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (te-IN, MohanNeural)", + "ShortName": "te-IN-MohanNeural", + "Gender": "Male", + "Locale": "te-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mohan Online (Natural) - Telugu (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (te-IN, ShrutiNeural)", + "ShortName": "te-IN-ShrutiNeural", + "Gender": "Female", + "Locale": "te-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Shruti Online (Natural) - Telugu (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + 
"Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (th-TH, NiwatNeural)", + "ShortName": "th-TH-NiwatNeural", + "Gender": "Male", + "Locale": "th-TH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Niwat Online (Natural) - Thai (Thailand)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (th-TH, PremwadeeNeural)", + "ShortName": "th-TH-PremwadeeNeural", + "Gender": "Female", + "Locale": "th-TH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Premwadee Online (Natural) - Thai (Thailand)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (tr-TR, EmelNeural)", + "ShortName": "tr-TR-EmelNeural", + "Gender": "Female", + "Locale": "tr-TR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emel Online (Natural) - Turkish (Turkey)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (tr-TR, AhmetNeural)", + "ShortName": "tr-TR-AhmetNeural", + "Gender": "Male", + "Locale": "tr-TR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ahmet Online (Natural) - Turkish (Türkiye)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (uk-UA, OstapNeural)", + "ShortName": "uk-UA-OstapNeural", + "Gender": "Male", + "Locale": "uk-UA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft 
Ostap Online (Natural) - Ukrainian (Ukraine)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (uk-UA, PolinaNeural)", + "ShortName": "uk-UA-PolinaNeural", + "Gender": "Female", + "Locale": "uk-UA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Polina Online (Natural) - Ukrainian (Ukraine)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ur-IN, GulNeural)", + "ShortName": "ur-IN-GulNeural", + "Gender": "Female", + "Locale": "ur-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gul Online (Natural) - Urdu (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ur-IN, SalmanNeural)", + "ShortName": "ur-IN-SalmanNeural", + "Gender": "Male", + "Locale": "ur-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Salman Online (Natural) - Urdu (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ur-PK, AsadNeural)", + "ShortName": "ur-PK-AsadNeural", + "Gender": "Male", + "Locale": "ur-PK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Asad Online (Natural) - Urdu (Pakistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ur-PK, UzmaNeural)", + 
"ShortName": "ur-PK-UzmaNeural", + "Gender": "Female", + "Locale": "ur-PK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Uzma Online (Natural) - Urdu (Pakistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (uz-UZ, MadinaNeural)", + "ShortName": "uz-UZ-MadinaNeural", + "Gender": "Female", + "Locale": "uz-UZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Madina Online (Natural) - Uzbek (Uzbekistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (uz-UZ, SardorNeural)", + "ShortName": "uz-UZ-SardorNeural", + "Gender": "Male", + "Locale": "uz-UZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sardor Online (Natural) - Uzbek (Uzbekistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (vi-VN, HoaiMyNeural)", + "ShortName": "vi-VN-HoaiMyNeural", + "Gender": "Female", + "Locale": "vi-VN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HoaiMy Online (Natural) - Vietnamese (Vietnam)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (vi-VN, NamMinhNeural)", + "ShortName": "vi-VN-NamMinhNeural", + "Gender": "Male", + "Locale": "vi-VN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft NamMinh Online (Natural) - Vietnamese (Vietnam)", + "Status": "GA", + "VoiceTag": { + 
"ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (cy-GB, AledNeural)", + "ShortName": "cy-GB-AledNeural", + "Gender": "Male", + "Locale": "cy-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aled Online (Natural) - Welsh (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (cy-GB, NiaNeural)", + "ShortName": "cy-GB-NiaNeural", + "Gender": "Female", + "Locale": "cy-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nia Online (Natural) - Welsh (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zu-ZA, ThandoNeural)", + "ShortName": "zu-ZA-ThandoNeural", + "Gender": "Female", + "Locale": "zu-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thando Online (Natural) - Zulu (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zu-ZA, ThembaNeural)", + "ShortName": "zu-ZA-ThembaNeural", + "Gender": "Male", + "Locale": "zu-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Themba Online (Natural) - Zulu (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + } +] \ No newline at end of file diff --git a/rvc_logic/rvc/lib/utils.py b/rvc_logic/rvc/lib/utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..ec46e85f2b9574b4beff3d4d4fe22d6b9d3bbdee --- /dev/null +++ b/rvc_logic/rvc/lib/utils.py @@ -0,0 +1,194 @@ +import logging +import os +import pathlib +import re +import sys +import unicodedata +import warnings + +import soxr + +import wget + +import numpy as np + +from torch import nn +from transformers import HubertModel + +import librosa +import soundfile as sf + +from rvc_logic.common import RVC_MODELS_DIR + +# Remove this to see warnings about transformers models +warnings.filterwarnings("ignore") + +logging.getLogger("fairseq").setLevel(logging.ERROR) +logging.getLogger("faiss.loader").setLevel(logging.ERROR) +logging.getLogger("transformers").setLevel(logging.ERROR) +logging.getLogger("torch").setLevel(logging.ERROR) + +now_dir = pathlib.Path.cwd() +sys.path.append(str(now_dir)) + +base_path = os.path.join(str(RVC_MODELS_DIR), "formant", "stftpitchshift") +stft = base_path + ".exe" if sys.platform == "win32" else base_path + + +class HubertModelWithFinalProj(HubertModel): + def __init__(self, config): + super().__init__(config) + self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size) + + +def load_audio_16k(file): + # this is used by f0 and feature extractions that load preprocessed 16k files, so there's no need to resample + try: + audio, sr = librosa.load(file, sr=16000) + except Exception as error: + raise RuntimeError(f"An error occurred loading the audio: {error}") + + return audio.flatten() + + +def load_audio(file, sample_rate): + try: + file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + audio, sr = sf.read(file) + if len(audio.shape) > 1: + audio = librosa.to_mono(audio.T) + if sr != sample_rate: + audio = librosa.resample( + audio, + orig_sr=sr, + target_sr=sample_rate, + res_type="soxr_vhq", + ) + except Exception as error: + raise RuntimeError(f"An error occurred loading the audio: {error}") + + return audio.flatten() + + +def load_audio_infer( + file, + sample_rate, 
+ **kwargs, +): + formant_shifting = kwargs.get("formant_shifting", False) + try: + file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + if not pathlib.Path(file).is_file(): + raise FileNotFoundError(f"File not found: {file}") + audio, sr = sf.read(file) + if len(audio.shape) > 1: + audio = librosa.to_mono(audio.T) + if sr != sample_rate: + audio = librosa.resample( + audio, + orig_sr=sr, + target_sr=sample_rate, + res_type="soxr_vhq", + ) + if formant_shifting: + formant_qfrency = kwargs.get("formant_qfrency", 0.8) + formant_timbre = kwargs.get("formant_timbre", 0.8) + + from stftpitchshift import StftPitchShift + + pitchshifter = StftPitchShift(1024, 32, sample_rate) + audio = pitchshifter.shiftpitch( + audio, + factors=1, + quefrency=formant_qfrency * 1e-3, + distortion=formant_timbre, + ) + except Exception as error: + raise RuntimeError(f"An error occurred loading the audio: {error}") + return np.array(audio).flatten() + + +def format_title(title): + formatted_title = unicodedata.normalize("NFC", title) + formatted_title = re.sub(r"[\u2500-\u257F]+", "", formatted_title) + formatted_title = re.sub(r"[^\w\s.-]", "", formatted_title, flags=re.UNICODE) + formatted_title = re.sub(r"\s+", "_", formatted_title) + return formatted_title + + +def load_embedding(embedder_model, custom_embedder=None): + embedder_root = os.path.join(str(RVC_MODELS_DIR), "embedders") + embedding_list = { + "contentvec": os.path.join(embedder_root, "contentvec"), + "spin": os.path.join(embedder_root, "spin"), + "spin-v2": os.path.join(embedder_root, "spin-v2"), + "chinese-hubert-base": os.path.join(embedder_root, "chinese_hubert_base"), + "japanese-hubert-base": os.path.join(embedder_root, "japanese_hubert_base"), + "korean-hubert-base": os.path.join(embedder_root, "korean_hubert_base"), + } + + online_embedders = { + "contentvec": ( + "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/contentvec/pytorch_model.bin" + ), + "spin": ( + 
"https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/spin/pytorch_model.bin" + ), + "spin-v2": ( + "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/spin-v2/pytorch_model.bin" + ), + "chinese-hubert-base": ( + "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/chinese_hubert_base/pytorch_model.bin" + ), + "japanese-hubert-base": ( + "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/japanese_hubert_base/pytorch_model.bin" + ), + "korean-hubert-base": ( + "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/korean_hubert_base/pytorch_model.bin" + ), + } + + config_files = { + "contentvec": ( + "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/contentvec/config.json" + ), + "spin": ( + "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/spin/config.json" + ), + "spin-v2": ( + "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/spin-v2/config.json" + ), + "chinese-hubert-base": ( + "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/chinese_hubert_base/config.json" + ), + "japanese-hubert-base": ( + "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/japanese_hubert_base/config.json" + ), + "korean-hubert-base": ( + "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/korean_hubert_base/config.json" + ), + } + + if embedder_model == "custom": + if pathlib.Path(custom_embedder).exists(): + model_path = custom_embedder + else: + print(f"Custom embedder not found: {custom_embedder}, using contentvec") + model_path = embedding_list["contentvec"] + else: + model_path = embedding_list[embedder_model] + bin_file = os.path.join(model_path, "pytorch_model.bin") + json_file = 
os.path.join(model_path, "config.json") + pathlib.Path(model_path).mkdir(exist_ok=True, parents=True) + if not pathlib.Path(bin_file).exists(): + url = online_embedders[embedder_model] + print(f"Downloading {url} to {model_path}...") + wget.download(url, out=bin_file) + if not pathlib.Path(json_file).exists(): + url = config_files[embedder_model] + print(f"Downloading {url} to {model_path}...") + wget.download(url, out=json_file) + + models = HubertModelWithFinalProj.from_pretrained(model_path) + return models +s diff --git a/rvc_logic/rvc/lib/zluda.py b/rvc_logic/rvc/lib/zluda.py new file mode 100644 index 0000000000000000000000000000000000000000..8ad8e9e9a9ae3589ba22fecc834b367902078313 --- /dev/null +++ b/rvc_logic/rvc/lib/zluda.py @@ -0,0 +1,85 @@ +import torch + +if torch.cuda.is_available() and torch.cuda.get_device_name().endswith("[ZLUDA]"): + + class STFT: + def __init__(self): + self.device = "cuda" + self.fourier_bases = {} # Cache for Fourier bases + + def _get_fourier_basis(self, n_fft): + # Check if the basis for this n_fft is already cached + if n_fft in self.fourier_bases: + return self.fourier_bases[n_fft] + fourier_basis = torch.fft.fft(torch.eye(n_fft, device="cpu")).to( + self.device, + ) + # stack separated real and imaginary components and convert to torch tensor + cutoff = n_fft // 2 + 1 + fourier_basis = torch.cat( + [fourier_basis.real[:cutoff], fourier_basis.imag[:cutoff]], + dim=0, + ) + # cache the tensor and return + self.fourier_bases[n_fft] = fourier_basis + return fourier_basis + + def transform(self, input, n_fft, hop_length, window): + # fetch cached Fourier basis + fourier_basis = self._get_fourier_basis(n_fft) + # apply hann window to Fourier basis + fourier_basis = fourier_basis * window + # pad input to center with reflect + pad_amount = n_fft // 2 + input = torch.nn.functional.pad( + input, + (pad_amount, pad_amount), + mode="reflect", + ) + # separate input into n_fft-sized frames + input_frames = input.unfold(1, n_fft, 
hop_length).permute(0, 2, 1) + # apply fft to each frame + fourier_transform = torch.matmul(fourier_basis, input_frames) + cutoff = n_fft // 2 + 1 + return torch.complex( + fourier_transform[:, :cutoff, :], + fourier_transform[:, cutoff:, :], + ) + + stft = STFT() + _torch_stft = torch.stft + + def z_stft(input: torch.Tensor, window: torch.Tensor, *args, **kwargs): + # only optimizing a specific call from rvc.train.mel_processing.MultiScaleMelSpectrogramLoss + if ( + kwargs.get("win_length") == None + and kwargs.get("center") == None + and kwargs.get("return_complex") == True + ): + # use GPU accelerated calculation + return stft.transform( + input, + kwargs.get("n_fft"), + kwargs.get("hop_length"), + window, + ) + # simply do the operation on CPU + return _torch_stft( + input=input.cpu(), + window=window.cpu(), + *args, + **kwargs, + ).to(input.device) + + def z_jit(f, *_, **__): + f.graph = torch._C.Graph() + return f + + # hijacks + torch.stft = z_stft + torch.jit.script = z_jit + # disabling unsupported cudnn + torch.backends.cudnn.enabled = False + torch.backends.cuda.enable_flash_sdp(False) + torch.backends.cuda.enable_math_sdp(True) + torch.backends.cuda.enable_mem_efficient_sdp(False) diff --git a/rvc_logic/rvc/train/anyprecision_optimizer.py b/rvc_logic/rvc/train/anyprecision_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..e58a2a0ea7cb81899c24d3d68b55411466f947f7 --- /dev/null +++ b/rvc_logic/rvc/train/anyprecision_optimizer.py @@ -0,0 +1,185 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# AnyPrecisionAdamW: a flexible precision AdamW optimizer +# with optional Kahan summation for high precision weight updates. +# Allows direct control over momentum, variance and auxiliary compensation +# buffer dtypes. 
+# Optional Kahan summation is used to offset precision reduction for +# the weight updates. This allows full training in BFloat16 (equal or +# better than FP32 results in many cases) due to high precision weight updates. + +import torch +from torch.optim.optimizer import Optimizer + + +class AnyPrecisionAdamW(Optimizer): + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=0.0, + use_kahan_summation=True, # NOTE default upstream is True + momentum_dtype=torch.bfloat16, # NOTE default upstream is torch.float32, + variance_dtype=torch.bfloat16, + compensation_buffer_dtype=torch.bfloat16, + ): + """ + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay coefficient (default: 1e-2) + + # Any Precision specific + use_kahan_summation = creates auxiliary buffer to ensure high precision + model param updates (default: True) + momentum_dtype = dtype for momentum (default: torch.bfloat16) + variance_dtype = dtype for uncentered variance (default: torch.bfloat16) + compensation_buffer_dtype = dtype for Kahan summation + buffer (default: torch.bfloat16). Only used if + ``use_kahan_summation=True``. + + # Usage + This optimizer implements optimizer states, and Kahan summation + for high precision updates, all in user controlled dtypes. + Defaults are variance in BF16, Momentum in BF16. + This can be run in FSDP mixed precision, amp, or full precision, + depending on what training pipeline you wish to work with. + + Setting to use_kahan_summation = False, and changing momentum and + variance dtypes to FP32, reverts this to a standard AdamW optimizer. 
+ + """ + defaults = dict( + lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + use_kahan_summation=use_kahan_summation, + momentum_dtype=momentum_dtype, + variance_dtype=variance_dtype, + compensation_buffer_dtype=compensation_buffer_dtype, + ) + + super().__init__(params, defaults) + + @torch.no_grad() + def step(self, closure=None): + """ + Performs a single optimization step. + + Args: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + + """ + if closure is not None: + with torch.enable_grad(): + # to fix linter, we do not keep the returned loss for use atm. + closure() + + for group in self.param_groups: + + beta1, beta2 = group["betas"] + lr = group["lr"] + weight_decay = group["weight_decay"] + eps = group["eps"] + use_kahan_summation = group["use_kahan_summation"] + + momentum_dtype = group["momentum_dtype"] + variance_dtype = group["variance_dtype"] + compensation_buffer_dtype = group["compensation_buffer_dtype"] + + for p in group["params"]: + if p.grad is None: + continue + + if p.grad.is_sparse: + raise RuntimeError( + "AnyPrecisionAdamW does not support sparse gradients" + ) + + state = self.state[p] + + # State initialization + if len(state) == 0: + + state["step"] = torch.tensor(0.0) + + # momentum - EMA of gradient values + state["exp_avg"] = torch.zeros_like( + p, + dtype=momentum_dtype, + ) + + # variance uncentered - EMA of squared gradient values + state["exp_avg_sq"] = torch.zeros_like( + p, + dtype=variance_dtype, + ) + + # optional Kahan summation - accumulated error tracker + if use_kahan_summation: + state["compensation"] = torch.zeros_like( + p, + dtype=compensation_buffer_dtype, + ) + + # main processing ------------------------- + + # update the steps for each param group update + state["step"] += 1 + step = state["step"] + + exp_avg = state["exp_avg"] + exp_avg_sq = state["exp_avg_sq"] + + grad = p.grad + + # weight decay, AdamW style + if weight_decay: + p.data.mul_(1 - lr * 
weight_decay) + + # update momentum + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + + # update uncentered variance + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + + # adjust using bias1 + bias_correction1 = 1 - beta1**step + + step_size = lr / bias_correction1 + + # adjust using bias2 + denom_correction = (1 - beta2**step) ** 0.5 # avoids math import + + centered_variance = (exp_avg_sq.sqrt() / denom_correction).add_( + eps, alpha=1 + ) + + # lr update to compensation + if use_kahan_summation: + compensation = state["compensation"] + + compensation.addcdiv_(exp_avg, centered_variance, value=-step_size) + + # update weights with compensation (Kahan summation) + # save error back to compensation for next iteration + temp_buffer = p.detach().clone() + p.data.add_(compensation) + compensation.add_(temp_buffer.sub_(p.data)) + + else: + # usual AdamW updates + p.data.addcdiv_(exp_avg, centered_variance, value=-step_size) diff --git a/rvc_logic/rvc/train/data_utils.py b/rvc_logic/rvc/train/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ab1bc37787e55d125bb661d2f9449a4ec8be9e2c --- /dev/null +++ b/rvc_logic/rvc/train/data_utils.py @@ -0,0 +1,396 @@ +import pathlib + +import numpy as np + +import torch +import torch.utils.data + +from rvc_logic.rvc.train.mel_processing import spectrogram_torch +from rvc_logic.rvc.train.utils import load_filepaths_and_text, load_wav_to_torch + + +class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): + """ + Dataset that loads text and audio pairs. + + Args: + hparams: Hyperparameters. 
+ + """ + + def __init__(self, hparams): + self.audiopaths_and_text = load_filepaths_and_text(hparams.training_files) + self.max_wav_value = hparams.max_wav_value + self.sample_rate = hparams.sample_rate + self.filter_length = hparams.filter_length + self.hop_length = hparams.hop_length + self.win_length = hparams.win_length + self.sample_rate = hparams.sample_rate + self.min_text_len = getattr(hparams, "min_text_len", 1) + self.max_text_len = getattr(hparams, "max_text_len", 5000) + self._filter() + + def _filter(self): + """ + Filters audio paths and text pairs based on text length. + """ + audiopaths_and_text_new = [] + lengths = [] + for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text: + if self.min_text_len <= len(text) and len(text) <= self.max_text_len: + audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv]) + lengths.append( + pathlib.Path(audiopath).stat().st_size // (3 * self.hop_length) + ) + self.audiopaths_and_text = audiopaths_and_text_new + self.lengths = lengths + + def get_sid(self, sid): + """ + Converts speaker ID to a LongTensor. + + Args: + sid (str): Speaker ID. + + """ + try: + sid = torch.LongTensor([int(sid)]) + except ValueError as error: + print(f"Error converting speaker ID '{sid}' to integer. Exception: {error}") + sid = torch.LongTensor([0]) + return sid + + def get_audio_text_pair(self, audiopath_and_text): + """ + Loads and processes audio and text data for a single pair. + + Args: + audiopath_and_text (list): List containing audio path, text, pitch, pitchf, and speaker ID. 
+ + """ + file = audiopath_and_text[0] + phone = audiopath_and_text[1] + pitch = audiopath_and_text[2] + pitchf = audiopath_and_text[3] + dv = audiopath_and_text[4] + + phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf) + spec, wav = self.get_audio(file) + dv = self.get_sid(dv) + + len_phone = phone.size()[0] + len_spec = spec.size()[-1] + if len_phone != len_spec: + len_min = min(len_phone, len_spec) + len_wav = len_min * self.hop_length + + spec = spec[:, :len_min] + wav = wav[:, :len_wav] + + phone = phone[:len_min, :] + pitch = pitch[:len_min] + pitchf = pitchf[:len_min] + + return (spec, wav, phone, pitch, pitchf, dv) + + def get_labels(self, phone, pitch, pitchf): + """ + Loads and processes phoneme, pitch, and pitchf labels. + + Args: + phone (str): Path to phoneme label file. + pitch (str): Path to pitch label file. + pitchf (str): Path to pitchf label file. + + """ + phone = np.load(phone) + phone = np.repeat(phone, 2, axis=0) + pitch = np.load(pitch) + pitchf = np.load(pitchf) + n_num = min(phone.shape[0], 900) + phone = phone[:n_num, :] + pitch = pitch[:n_num] + pitchf = pitchf[:n_num] + phone = torch.FloatTensor(phone) + pitch = torch.LongTensor(pitch) + pitchf = torch.FloatTensor(pitchf) + return phone, pitch, pitchf + + def get_audio(self, filename): + """ + Loads and processes audio data. + + Args: + filename (str): Path to audio file. 
+ + """ + audio, sample_rate = load_wav_to_torch(filename) + if sample_rate != self.sample_rate: + raise ValueError( + f"{sample_rate} SR doesn't match target {self.sample_rate} SR", + ) + audio_norm = audio + audio_norm = audio_norm.unsqueeze(0) + spec_filename = filename.replace(".wav", ".spec.pt") + if pathlib.Path(spec_filename).exists(): + try: + spec = torch.load(spec_filename, weights_only=False) + except Exception as error: + print(f"An error occurred getting spec from {spec_filename}: {error}") + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + else: + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + return spec, audio_norm + + def __getitem__(self, index): + """ + Returns a single audio-text pair. + + Args: + index (int): Index of the data sample. + + """ + return self.get_audio_text_pair(self.audiopaths_and_text[index]) + + def __len__(self): + """ + Returns the length of the dataset. + """ + return len(self.audiopaths_and_text) + + +class TextAudioCollateMultiNSFsid: + """ + Collates text and audio data for training. + + Args: + return_ids (bool, optional): Whether to return sample IDs. Defaults to False. + + """ + + def __init__(self, return_ids=False): + self.return_ids = return_ids + + def __call__(self, batch): + """ + Collates a batch of data samples. + + Args: + batch (list): List of data samples. 
+ + """ + _, ids_sorted_decreasing = torch.sort( + torch.LongTensor([x[0].size(1) for x in batch]), + dim=0, + descending=True, + ) + + max_spec_len = max([x[0].size(1) for x in batch]) + max_wave_len = max([x[1].size(1) for x in batch]) + spec_lengths = torch.LongTensor(len(batch)) + wave_lengths = torch.LongTensor(len(batch)) + spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) + wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) + spec_padded.zero_() + wave_padded.zero_() + + max_phone_len = max([x[2].size(0) for x in batch]) + phone_lengths = torch.LongTensor(len(batch)) + phone_padded = torch.FloatTensor( + len(batch), + max_phone_len, + batch[0][2].shape[1], + ) + pitch_padded = torch.LongTensor(len(batch), max_phone_len) + pitchf_padded = torch.FloatTensor(len(batch), max_phone_len) + phone_padded.zero_() + pitch_padded.zero_() + pitchf_padded.zero_() + sid = torch.LongTensor(len(batch)) + + for i in range(len(ids_sorted_decreasing)): + row = batch[ids_sorted_decreasing[i]] + + spec = row[0] + spec_padded[i, :, : spec.size(1)] = spec + spec_lengths[i] = spec.size(1) + + wave = row[1] + wave_padded[i, :, : wave.size(1)] = wave + wave_lengths[i] = wave.size(1) + + phone = row[2] + phone_padded[i, : phone.size(0), :] = phone + phone_lengths[i] = phone.size(0) + + pitch = row[3] + pitch_padded[i, : pitch.size(0)] = pitch + pitchf = row[4] + pitchf_padded[i, : pitchf.size(0)] = pitchf + + sid[i] = row[5] + + return ( + phone_padded, + phone_lengths, + pitch_padded, + pitchf_padded, + spec_padded, + spec_lengths, + wave_padded, + wave_lengths, + sid, + ) + + +class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): + """ + Distributed sampler that groups data into buckets based on length. + + Args: + dataset (torch.utils.data.Dataset): Dataset to sample from. + batch_size (int): Batch size. + boundaries (list): List of length boundaries for buckets. 
+ num_replicas (int, optional): Number of processes participating in distributed training. Defaults to None. + rank (int, optional): Rank of the current process. Defaults to None. + shuffle (bool, optional): Whether to shuffle the data. Defaults to True. + + """ + + def __init__( + self, + dataset, + batch_size, + boundaries, + num_replicas=None, + rank=None, + shuffle=True, + ): + super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) + self.lengths = dataset.lengths + self.batch_size = batch_size + self.boundaries = boundaries + + self.buckets, self.num_samples_per_bucket = self._create_buckets() + self.total_size = sum(self.num_samples_per_bucket) + self.num_samples = self.total_size // self.num_replicas + + def _create_buckets(self): + """ + Creates buckets of data samples based on length. + """ + buckets = [[] for _ in range(len(self.boundaries) - 1)] + for i in range(len(self.lengths)): + length = self.lengths[i] + idx_bucket = self._bisect(length) + if idx_bucket != -1: + buckets[idx_bucket].append(i) + + for i in range(len(buckets) - 1, -1, -1): + if len(buckets[i]) == 0: + buckets.pop(i) + self.boundaries.pop(i + 1) + + num_samples_per_bucket = [] + for i in range(len(buckets)): + len_bucket = len(buckets[i]) + total_batch_size = self.num_replicas * self.batch_size + rem = ( + total_batch_size - (len_bucket % total_batch_size) + ) % total_batch_size + num_samples_per_bucket.append(len_bucket + rem) + return buckets, num_samples_per_bucket + + def __iter__(self): + """ + Iterates over batches of data samples. 
+ """ + g = torch.Generator() + g.manual_seed(self.epoch) + + indices = [] + if self.shuffle: + for bucket in self.buckets: + indices.append(torch.randperm(len(bucket), generator=g).tolist()) + else: + for bucket in self.buckets: + indices.append(list(range(len(bucket)))) + + batches = [] + for i in range(len(self.buckets)): + bucket = self.buckets[i] + len_bucket = len(bucket) + ids_bucket = indices[i] + num_samples_bucket = self.num_samples_per_bucket[i] + + rem = num_samples_bucket - len_bucket + ids_bucket = ( + ids_bucket + + ids_bucket * (rem // len_bucket) + + ids_bucket[: (rem % len_bucket)] + ) + + ids_bucket = ids_bucket[self.rank :: self.num_replicas] + + # batching + for j in range(len(ids_bucket) // self.batch_size): + batch = [ + bucket[idx] + for idx in ids_bucket[ + j * self.batch_size : (j + 1) * self.batch_size + ] + ] + batches.append(batch) + + if self.shuffle: + batch_ids = torch.randperm(len(batches), generator=g).tolist() + batches = [batches[i] for i in batch_ids] + self.batches = batches + + assert len(self.batches) * self.batch_size == self.num_samples + return iter(self.batches) + + def _bisect(self, x, lo=0, hi=None): + """ + Performs binary search to find the bucket index for a given length. + + Args: + x (int): Length to find the bucket for. + lo (int, optional): Lower bound of the search range. Defaults to 0. + hi (int, optional): Upper bound of the search range. Defaults to None. + + """ + if hi is None: + hi = len(self.boundaries) - 1 + + if hi > lo: + mid = (hi + lo) // 2 + if self.boundaries[mid] < x <= self.boundaries[mid + 1]: + return mid + if x <= self.boundaries[mid]: + return self._bisect(x, lo, mid) + return self._bisect(x, mid + 1, hi) + return -1 + + def __len__(self): + """ + Returns the length of the sampler. 
+ """ + return self.num_samples // self.batch_size +size diff --git a/rvc_logic/rvc/train/extract/extract.py b/rvc_logic/rvc/train/extract/extract.py new file mode 100644 index 0000000000000000000000000000000000000000..9b9d5c5bcb3ea2042c915d86f384fdb7ecf61c07 --- /dev/null +++ b/rvc_logic/rvc/train/extract/extract.py @@ -0,0 +1,261 @@ +import concurrent.futures +import glob +import json +import logging +import multiprocessing as mp +import os +import pathlib +import sys +import time + +import numpy as np +import tqdm + +import torch + +now_dir = pathlib.Path.cwd() +sys.path.append(os.path.join(now_dir)) + +# Zluda hijack +import rvc_logic.rvc.lib.zluda +from rvc_logic.common import RVC_MODELS_DIR +from rvc_logic.rvc.configs.config import Config +from rvc_logic.rvc.lib.predictors.f0 import CREPE, FCPE, RMVPE +from rvc_logic.rvc.lib.utils import load_audio_16k, load_embedding +from rvc_logic.rvc.train.utils import remove_sox_libmso6_from_ld_preload + +logger = logging.getLogger(__name__) + +# Load config +config = Config() +mp.set_start_method("spawn", force=True) + + +class FeatureInput: + def __init__(self, f0_method="rmvpe", device="cpu"): + self.hop_size = 160 # default + self.sample_rate = 16000 # default + self.f0_bin = 256 + self.f0_max = 1100.0 + self.f0_min = 50.0 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + self.device = device + if f0_method in {"crepe", "crepe-tiny"}: + self.model = CREPE( + device=self.device, sample_rate=self.sample_rate, hop_size=self.hop_size + ) + elif f0_method == "rmvpe": + self.model = RMVPE( + device=self.device, sample_rate=self.sample_rate, hop_size=self.hop_size + ) + elif f0_method == "fcpe": + self.model = FCPE( + device=self.device, sample_rate=self.sample_rate, hop_size=self.hop_size + ) + self.f0_method = f0_method + + def compute_f0(self, x, p_len=None): + if self.f0_method == "crepe": + f0 = self.model.get_f0(x, self.f0_min, self.f0_max, p_len, 
"full") + elif self.f0_method == "crepe-tiny": + f0 = self.model.get_f0(x, self.f0_min, self.f0_max, p_len, "tiny") + elif self.f0_method == "rmvpe": + f0 = self.model.get_f0(x, filter_radius=0.03) + elif self.f0_method == "fcpe": + f0 = self.model.get_f0(x, p_len, filter_radius=0.006) + return f0 + + def coarse_f0(self, f0): + f0_mel = 1127.0 * np.log(1.0 + f0 / 700.0) + f0_mel = np.clip( + (f0_mel - self.f0_mel_min) + * (self.f0_bin - 2) + / (self.f0_mel_max - self.f0_mel_min) + + 1, + 1, + self.f0_bin - 1, + ) + return np.rint(f0_mel).astype(int) + + def process_file(self, file_info): + inp_path, opt_path_coarse, opt_path_full, _ = file_info + if ( + pathlib.Path(opt_path_coarse).exists() + and pathlib.Path(opt_path_full).exists() + ): + return + + try: + np_arr = load_audio_16k(inp_path) + feature_pit = self.compute_f0(np_arr) + np.save(opt_path_full, feature_pit, allow_pickle=False) + coarse_pit = self.coarse_f0(feature_pit) + np.save(opt_path_coarse, coarse_pit, allow_pickle=False) + except Exception as error: + logger.error( # noqa: TRY400 + "An error occurred extracting file %s on %s: %s", + inp_path, + self.device, + error, + ) + + +def process_files(files, f0_method, device): + fe = FeatureInput(f0_method=f0_method, device=device) + with tqdm.tqdm(total=len(files), leave=True) as pbar: + for file_info in files: + fe.process_file(file_info) + pbar.update(1) + + +def run_pitch_extraction( + files: list[list[str]], + devices: list[str], + f0_method: str, + threads: int, +) -> None: + devices_str = ", ".join(devices) + logger.info( + "Starting pitch extraction with %d cores on %s using %s...", + threads, + devices_str, + f0_method, + ) + start_time = time.time() + remove_sox_libmso6_from_ld_preload() + + with concurrent.futures.ProcessPoolExecutor(max_workers=len(devices)) as executor: + tasks = [ + executor.submit( + process_files, + files[i :: len(devices)], + f0_method, + devices[i], + ) + for i in range(len(devices)) + ] + for future in 
concurrent.futures.as_completed(tasks): + future.result() # Properly waits and propagates exceptions + + logger.info("Pitch extraction completed in %.2f seconds.", time.time() - start_time) + + +def process_file_embedding( + files, + embedder_model, + embedder_model_custom, + device_num, + device, + n_threads, +): + model = load_embedding(embedder_model, embedder_model_custom).to(device).float() + model.eval() + n_threads = max(1, n_threads) + + def worker(file_info): + wav_file_path, _, _, out_file_path = file_info + if pathlib.Path(out_file_path).exists(): + return + feats = torch.from_numpy(load_audio_16k(wav_file_path)).to(device).float() + feats = feats.view(1, -1) + with torch.no_grad(): + result = model(feats)["last_hidden_state"] + feats_out = result.squeeze(0).float().cpu().numpy() + if not np.isnan(feats_out).any(): + np.save(out_file_path, feats_out, allow_pickle=False) + else: + logger.error("%s contains NaN values and will be skipped.", wav_file_path) + + with tqdm.tqdm(total=len(files), leave=True, position=device_num) as pbar: + with concurrent.futures.ThreadPoolExecutor(max_workers=n_threads) as executor: + futures = [executor.submit(worker, f) for f in files] + for _ in concurrent.futures.as_completed(futures): + pbar.update(1) + + +def run_embedding_extraction( + files: list[list[str]], + devices: list[str], + embedder_model: str, + embedder_model_custom: str | None, + threads: int, +) -> None: + devices_str = ", ".join(devices) + logger.info( + "Starting embedding extraction with %d cores on %s...", + threads, + devices_str, + ) + start_time = time.time() + with concurrent.futures.ProcessPoolExecutor(max_workers=len(devices)) as executor: + tasks = [ + executor.submit( + process_file_embedding, + files[i :: len(devices)], + embedder_model, + embedder_model_custom, + i, + devices[i], + threads // len(devices), + ) + for i in range(len(devices)) + ] + for future in concurrent.futures.as_completed(tasks): + future.result() # Properly waits and 
propagates exceptions + logger.info( + "Embedding extraction completed in %.2f seconds.", + time.time() - start_time, + ) + + +def initialize_extraction( + exp_dir: str, + f0_method: str, + embedder_model: str, +) -> list[list[str]]: + wav_path = os.path.join(exp_dir, "sliced_audios_16k") + pathlib.Path(os.path.join(exp_dir, f"f0_{f0_method}")).mkdir( + exist_ok=True, parents=True + ) + pathlib.Path(os.path.join(exp_dir, f"f0_{f0_method}_voiced")).mkdir( + exist_ok=True, parents=True + ) + pathlib.Path(os.path.join(exp_dir, f"{embedder_model}_extracted")).mkdir( + exist_ok=True, parents=True + ) + + files: list[list[str]] = [] + for file in glob.glob(os.path.join(wav_path, "*.wav")): + file_name = os.path.basename(file) + file_info = [ + file, + os.path.join(exp_dir, f"f0_{f0_method}", file_name + ".npy"), + os.path.join(exp_dir, f"f0_{f0_method}_voiced", file_name + ".npy"), + os.path.join( + exp_dir, + f"{embedder_model}_extracted", + file_name.replace("wav", "npy"), + ), + ] + files.append(file_info) + + return files + + +def update_model_info( + exp_dir: str, + embedder_model: str, + custom_embedder_model_hash: str | None, +) -> None: + file_path = os.path.join(exp_dir, "model_info.json") + if pathlib.Path(file_path).exists(): + with pathlib.Path(file_path).open() as f: + data = json.load(f) + else: + data = {} + data["embedder_model"] = embedder_model + data["custom_embedder_model_hash"] = custom_embedder_model_hash + with pathlib.Path(file_path).open("w") as f: + json.dump(data, f, indent=4) +ta, f, indent=4) diff --git a/rvc_logic/rvc/train/extract/preparing_files.py b/rvc_logic/rvc/train/extract/preparing_files.py new file mode 100644 index 0000000000000000000000000000000000000000..55015cd7d5c17ff00ee96d590cff0a4a1f41c8be --- /dev/null +++ b/rvc_logic/rvc/train/extract/preparing_files.py @@ -0,0 +1,115 @@ +import json +import os +import pathlib +import shutil +from random import shuffle + +from rvc_logic.rvc.common import RVC_CONFIGS_DIR, 
RVC_TRAINING_MODELS_DIR +from rvc_logic.rvc.configs.config import Config + +config = Config() +current_directory = pathlib.Path.cwd() + + +def generate_config(model_path: str, sample_rate: int | None = None): + file_path = os.path.join(model_path, "model_info.json") + if pathlib.Path(file_path).exists(): + with pathlib.Path(file_path).open() as f: + data = json.load(f) + else: + data = {} + sample_rate = data.get("sample_rate") if sample_rate is None else sample_rate + if sample_rate is None: + raise ValueError( + "Sample rate must be provided either as argument or in model_info.json" + ) + config_path = os.path.join(RVC_CONFIGS_DIR, f"{sample_rate}.json") + config_save_path = os.path.join(model_path, "config.json") + shutil.copyfile(config_path, config_save_path) + + +def generate_filelist( + model_path: str, + include_mutes: int, + f0_method_id: str, + embedder_model_id: str, + sample_rate: int | None = None, +): + file_path = os.path.join(model_path, "model_info.json") + + if pathlib.Path(file_path).exists(): + with pathlib.Path(file_path).open() as f: + data = json.load(f) + else: + data = {} + sample_rate = data.get("sample_rate") if sample_rate is None else sample_rate + if sample_rate is None: + raise ValueError( + "Sample rate must be provided either as argument or in model_info.json" + ) + + gt_wavs_dir = os.path.join(model_path, "sliced_audios") + feature_dir = os.path.join( + model_path, + f"{embedder_model_id}_extracted", + ) + + f0_dir, f0nsf_dir = None, None + f0_dir = os.path.join(model_path, f"f0_{f0_method_id}") + f0nsf_dir = os.path.join(model_path, f"f0_{f0_method_id}_voiced") + + gt_wavs_files = set(name.split(".")[0] for name in os.listdir(gt_wavs_dir)) + feature_files = set(name.split(".")[0] for name in os.listdir(feature_dir)) + + f0_files = set(name.split(".")[0] for name in os.listdir(f0_dir)) + f0nsf_files = set(name.split(".")[0] for name in os.listdir(f0nsf_dir)) + names = gt_wavs_files & feature_files & f0_files & f0nsf_files + + 
options = [] + if embedder_model_id == "spin": + mute_base_path = os.path.join(RVC_TRAINING_MODELS_DIR, "mute_spin") + elif embedder_model_id == "spin-v2": + mute_base_path = os.path.join(RVC_TRAINING_MODELS_DIR, "mute_spin-v2") + else: + mute_base_path = os.path.join(RVC_TRAINING_MODELS_DIR, "mute") + sids = [] + for name in names: + sid = name.split("_")[0] + if sid not in sids: + sids.append(sid) + options.append( + f"{os.path.join(gt_wavs_dir, name)}.wav|{os.path.join(feature_dir, name)}.npy|{os.path.join(f0_dir, name)}.wav.npy|{os.path.join(f0nsf_dir, name)}.wav.npy|{sid}", + ) + if include_mutes > 0: + mute_audio_path = os.path.join( + mute_base_path, + "sliced_audios", + f"mute{sample_rate}.wav", + ) + mute_feature_path = os.path.join( + mute_base_path, + "extracted", + "mute.npy", + ) + mute_f0_path = os.path.join(mute_base_path, "f0", "mute.wav.npy") + mute_f0nsf_path = os.path.join(mute_base_path, "f0_voiced", "mute.wav.npy") + + # adding x files per sid + for sid in sids * include_mutes: + options.append( + f"{mute_audio_path}|{mute_feature_path}|{mute_f0_path}|{mute_f0nsf_path}|{sid}", + ) + + data.update( + { + "speakers_id": len(sids), + }, + ) + with pathlib.Path(file_path).open("w") as f: + json.dump(data, f, indent=4) + + shuffle(options) + + with pathlib.Path(os.path.join(model_path, "filelist.txt")).open("w") as f: + f.write("\n".join(options)) +ns)) diff --git a/rvc_logic/rvc/train/losses.py b/rvc_logic/rvc/train/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..ef3bce2506b77bb5f064569a2788abd5507e74a1 --- /dev/null +++ b/rvc_logic/rvc/train/losses.py @@ -0,0 +1,138 @@ +import torch + + +def feature_loss(fmap_r, fmap_g): + """ + Compute the feature loss between reference and generated feature maps. + + Args: + fmap_r (list of torch.Tensor): List of reference feature maps. + fmap_g (list of torch.Tensor): List of generated feature maps. 
+ + """ + return 2 * sum( + torch.mean(torch.abs(rl - gl)) + for dr, dg in zip(fmap_r, fmap_g, strict=False) + for rl, gl in zip(dr, dg, strict=False) + ) + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + """ + Compute the discriminator loss for real and generated outputs. + + Args: + disc_real_outputs (list of torch.Tensor): List of discriminator outputs for real samples. + disc_generated_outputs (list of torch.Tensor): List of discriminator outputs for generated samples. + + """ + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs, strict=False): + r_loss = torch.mean((1 - dr.float()) ** 2) + g_loss = torch.mean(dg.float() ** 2) + + # r_losses.append(r_loss.item()) + # g_losses.append(g_loss.item()) + loss += r_loss + g_loss + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + """ + Compute the generator loss based on discriminator outputs. + + Args: + disc_outputs (list of torch.Tensor): List of discriminator outputs for generated samples. 
+ + """ + loss = 0 + gen_losses = [] + for dg in disc_outputs: + l = torch.mean((1 - dg.float()) ** 2) + # gen_losses.append(l.item()) + loss += l + + return loss, gen_losses + + +def discriminator_loss_scaled(disc_real, disc_fake, scale=1.0): + loss = 0 + for i, (d_real, d_fake) in enumerate(zip(disc_real, disc_fake, strict=False)): + real_loss = torch.mean((1 - d_real) ** 2) + fake_loss = torch.mean(d_fake**2) + _loss = real_loss + fake_loss + loss += _loss if i < len(disc_real) / 2 else scale * _loss + return loss, None, None + + +def generator_loss_scaled(disc_outputs, scale=1.0): + loss = 0 + for i, d_fake in enumerate(disc_outputs): + d_fake = d_fake.float() + _loss = torch.mean((1 - d_fake) ** 2) + loss += _loss if i < len(disc_outputs) / 2 else scale * _loss + return loss, None, None + + +def discriminator_loss_scaled(disc_real, disc_fake, scale=1.0): + """ + Compute the scaled discriminator loss for real and generated outputs. + + Args: + disc_real (list of torch.Tensor): List of discriminator outputs for real samples. + disc_fake (list of torch.Tensor): List of discriminator outputs for generated samples. + scale (float, optional): Scaling factor applied to losses beyond the midpoint. Default is 1.0. + + """ + midpoint = len(disc_real) // 2 + losses = [] + for i, (d_real, d_fake) in enumerate(zip(disc_real, disc_fake, strict=False)): + real_loss = (1 - d_real).pow(2).mean() + fake_loss = d_fake.pow(2).mean() + total_loss = real_loss + fake_loss + if i >= midpoint: + total_loss *= scale + losses.append(total_loss) + loss = sum(losses) + return loss, None, None + + +def generator_loss_scaled(disc_outputs, scale=1.0): + """ + Compute the scaled generator loss based on discriminator outputs. + + Args: + disc_outputs (list of torch.Tensor): List of discriminator outputs for generated samples. + scale (float, optional): Scaling factor applied to losses beyond the midpoint. Default is 1.0. 
+ + """ + midpoint = len(disc_outputs) // 2 + losses = [] + for i, d_fake in enumerate(disc_outputs): + loss_value = (1 - d_fake).pow(2).mean() + if i >= midpoint: + loss_value *= scale + losses.append(loss_value) + loss = sum(losses) + return loss, None, None + + +def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): + """ + Compute the Kullback-Leibler divergence loss. + + Args: + z_p (torch.Tensor): Latent variable z_p [b, h, t_t]. + logs_q (torch.Tensor): Log variance of q [b, h, t_t]. + m_p (torch.Tensor): Mean of p [b, h, t_t]. + logs_p (torch.Tensor): Log variance of p [b, h, t_t]. + z_mask (torch.Tensor): Mask for the latent variables [b, h, t_t]. + + """ + kl = logs_p - logs_q - 0.5 + 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2 * logs_p) + kl = (kl * z_mask).sum() + loss = kl / z_mask.sum() + return loss diff --git a/rvc_logic/rvc/train/mel_processing.py b/rvc_logic/rvc/train/mel_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..db1068f5b93f3a8c3c4f6bf9e298bf8627f771b7 --- /dev/null +++ b/rvc_logic/rvc/train/mel_processing.py @@ -0,0 +1,210 @@ +import torch +import torch.utils.data + +from librosa.filters import mel as librosa_mel_fn + +mel_basis = {} +hann_window = {} + + +def spectrogram_torch(y, n_fft, hop_size, win_size, center=False): + """ + Compute the spectrogram of a signal using STFT. + + Args: + y (torch.Tensor): Input signal. + n_fft (int): FFT window size. + hop_size (int): Hop size between frames. + win_size (int): Window size. + center (bool, optional): Whether to center the window. Defaults to False. 
+ + """ + global hann_window + dtype_device = str(y.dtype) + "_" + str(y.device) + wnsize_dtype_device = str(win_size) + "_" + dtype_device + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( + dtype=y.dtype, + device=y.device, + ) + + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + + spec = torch.stft( + y, + n_fft=n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + + spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) + + return spec + + +def spec_to_mel_torch(spec, n_fft, num_mels, sample_rate, fmin, fmax): + """ + Convert a spectrogram to a mel-spectrogram. + + Args: + spec (torch.Tensor): Magnitude spectrogram. + n_fft (int): FFT window size. + num_mels (int): Number of mel frequency bins. + sample_rate (int): Sampling rate of the audio signal. + fmin (float): Minimum frequency. + fmax (float): Maximum frequency. + + """ + global mel_basis + dtype_device = str(spec.dtype) + "_" + str(spec.device) + fmax_dtype_device = str(fmax) + "_" + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn( + sr=sample_rate, + n_fft=n_fft, + n_mels=num_mels, + fmin=fmin, + fmax=fmax, + ) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( + dtype=spec.dtype, + device=spec.device, + ) + + melspec = torch.matmul(mel_basis[fmax_dtype_device], spec) + melspec = torch.log(melspec.clamp(min=1e-5) * 1) + return melspec + + +def mel_spectrogram_torch( + y, + n_fft, + num_mels, + sample_rate, + hop_size, + win_size, + fmin, + fmax, + center=False, +): + """ + Compute the mel-spectrogram of a signal. + + Args: + y (torch.Tensor): Input signal. + n_fft (int): FFT window size. + num_mels (int): Number of mel frequency bins. 
+ sample_rate (int): Sampling rate of the audio signal. + hop_size (int): Hop size between frames. + win_size (int): Window size. + fmin (float): Minimum frequency. + fmax (float): Maximum frequency. + center (bool, optional): Whether to center the window. Defaults to False. + + """ + spec = spectrogram_torch(y, n_fft, hop_size, win_size, center) + + melspec = spec_to_mel_torch(spec, n_fft, num_mels, sample_rate, fmin, fmax) + + return melspec + + +def compute_window_length(n_mels: int, sample_rate: int): + f_min = 0 + f_max = sample_rate / 2 + window_length_seconds = 8 * n_mels / (f_max - f_min) + window_length = int(window_length_seconds * sample_rate) + return 2 ** (window_length.bit_length() - 1) + + +class MultiScaleMelSpectrogramLoss(torch.nn.Module): + + def __init__( + self, + sample_rate: int = 24000, + n_mels: list[int] = [5, 10, 20, 40, 80, 160, 320], # , 480], + window_lengths: list[int] = [32, 64, 128, 256, 512, 1024, 2048], # , 4096], + loss_fn=torch.nn.L1Loss(), + ): + super().__init__() + self.sample_rate = sample_rate + self.loss_fn = loss_fn + self.log_base = torch.log(torch.tensor(10.0)) + self.stft_params: list[tuple] = [] + self.hann_window: dict[int, torch.Tensor] = {} + self.mel_banks: dict[int, torch.Tensor] = {} + + self.stft_params = [(mel, win) for mel, win in zip(n_mels, window_lengths)] + + def mel_spectrogram( + self, + wav: torch.Tensor, + n_mels: int, + window_length: int, + ): + # IDs for caching + dtype_device = str(wav.dtype) + "_" + str(wav.device) + win_dtype_device = str(window_length) + "_" + dtype_device + mel_dtype_device = str(n_mels) + "_" + dtype_device + # caching hann window + if win_dtype_device not in self.hann_window: + self.hann_window[win_dtype_device] = torch.hann_window( + window_length, + device=wav.device, + dtype=torch.float32, + ) + + wav = wav.squeeze(1) # -> torch(B, T) + + stft = torch.stft( + wav.float(), + n_fft=window_length, + hop_length=window_length // 4, + window=self.hann_window[win_dtype_device], 
+ return_complex=True, + ) # -> torch (B, window_length // 2 + 1, (T - window_length)/hop_length + 1) + + magnitude = torch.sqrt(stft.real.pow(2) + stft.imag.pow(2) + 1e-6) + + # caching mel filter + if mel_dtype_device not in self.mel_banks: + self.mel_banks[mel_dtype_device] = torch.from_numpy( + librosa_mel_fn( + sr=self.sample_rate, + n_mels=n_mels, + n_fft=window_length, + fmin=0, + fmax=None, + ), + ).to(device=wav.device, dtype=torch.float32) + + mel_spectrogram = torch.matmul( + self.mel_banks[mel_dtype_device], + magnitude, + ) # torch(B, n_mels, stft.frames) + return mel_spectrogram + + def forward( + self, + real: torch.Tensor, + fake: torch.Tensor, + ): # real: torch(B, 1, T) , fake: torch(B, 1, T) + loss = 0.0 + for p in self.stft_params: + real_mels = self.mel_spectrogram(real, *p) + fake_mels = self.mel_spectrogram(fake, *p) + real_logmels = torch.log(real_mels.clamp(min=1e-5)) / self.log_base + fake_logmels = torch.log(fake_mels.clamp(min=1e-5)) / self.log_base + loss += self.loss_fn(real_logmels, fake_logmels) + return loss diff --git a/rvc_logic/rvc/train/preprocess/preprocess.py b/rvc_logic/rvc/train/preprocess/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..fbd08ab875b7399e493da944a6c56a3c93a00089 --- /dev/null +++ b/rvc_logic/rvc/train/preprocess/preprocess.py @@ -0,0 +1,436 @@ +from typing import TYPE_CHECKING + +import concurrent.futures +import hashlib +import json +import os +import pathlib +import shutil +import sys +import time + +import soxr + +import numpy as np +from scipy import signal +from scipy.io import wavfile +from tqdm import tqdm + +import librosa +import noisereduce as nr + +now_directory = pathlib.Path.cwd() +sys.path.append(str(now_directory)) + +import lazy_loader as lazy + +import logging + +from rvc_logic.rvc.lib.utils import load_audio +from rvc_logic.rvc.train.preprocess.slicer import Slicer +from rvc_logic.rvc.train.utils import remove_sox_libmso6_from_ld_preload +from 
rvc_logic.typing_extra import AudioExt + +if TYPE_CHECKING: + import ffmpeg + import static_ffmpeg +else: + static_ffmpeg = lazy.load("static_ffmpeg") + ffmpeg = lazy.load("ffmpeg") +logging.getLogger("numba.core.byteflow").setLevel(logging.WARNING) +logging.getLogger("numba.core.ssa").setLevel(logging.WARNING) +logging.getLogger("numba.core.interpreter").setLevel(logging.WARNING) + +logger = logging.getLogger(__name__) + +OVERLAP = 0.3 +PERCENTAGE = 3.0 +MAX_AMPLITUDE = 0.9 +ALPHA = 0.75 +HIGH_PASS_CUTOFF = 48 +SAMPLE_RATE_16K = 16000 +RES_TYPE = "soxr_vhq" + + +class PreProcess: + def __init__(self, sr: int, exp_dir: str): + self.slicer = Slicer( + sr=sr, + threshold=-42, + min_length=1500, + min_interval=400, + hop_size=15, + max_sil_kept=500, + ) + self.sr = sr + self.b_high, self.a_high = signal.butter( + N=5, + Wn=HIGH_PASS_CUTOFF, + btype="high", + fs=self.sr, + ) + self.exp_dir = exp_dir + self.device = "cpu" + self.gt_wavs_dir = os.path.join(exp_dir, "sliced_audios") + self.wavs16k_dir = os.path.join(exp_dir, "sliced_audios_16k") + if pathlib.Path(self.gt_wavs_dir).exists(): + shutil.rmtree(self.gt_wavs_dir) + if pathlib.Path(self.wavs16k_dir).exists(): + shutil.rmtree(self.wavs16k_dir) + pathlib.Path(self.gt_wavs_dir).mkdir(parents=True) + pathlib.Path(self.wavs16k_dir).mkdir(parents=True) + + def _normalize_audio(self, audio: np.ndarray): + tmp_max = np.abs(audio).max() + if tmp_max > 2.5: + return None + return (audio / tmp_max * (MAX_AMPLITUDE * ALPHA)) + (1 - ALPHA) * audio + + def process_audio_segment( + self, + normalized_audio: np.ndarray, + sid: int, + idx0: int, + idx1: int, + normalization_mode: str, + ): + if normalized_audio is None: + logger.info("%d-%d-%d-filtered", sid, idx0, idx1) + return + if normalization_mode == "post": + normalized_audio = self._normalize_audio(normalized_audio) + wavfile.write( + os.path.join(self.gt_wavs_dir, f"{sid}_{idx0}_{idx1}.wav"), + self.sr, + normalized_audio.astype(np.float32), + ) + audio_16k = 
librosa.resample( + normalized_audio, + orig_sr=self.sr, + target_sr=SAMPLE_RATE_16K, + res_type=RES_TYPE, + ) + wavfile.write( + os.path.join(self.wavs16k_dir, f"{sid}_{idx0}_{idx1}.wav"), + SAMPLE_RATE_16K, + audio_16k.astype(np.float32), + ) + + def simple_cut( + self, + audio: np.ndarray, + sid: int, + idx0: int, + chunk_len: float, + overlap_len: float, + normalization_mode: str, + ): + chunk_length = int(self.sr * chunk_len) + overlap_length = int(self.sr * overlap_len) + i = 0 + while i < len(audio): + chunk = audio[i : i + chunk_length] + if normalization_mode == "post": + chunk = self._normalize_audio(chunk) + if len(chunk) == chunk_length: + # full SR for training + wavfile.write( + os.path.join( + self.gt_wavs_dir, + f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav", + ), + self.sr, + chunk.astype(np.float32), + ) + # 16KHz for feature extraction + chunk_16k = librosa.resample( + chunk, + orig_sr=self.sr, + target_sr=SAMPLE_RATE_16K, + res_type=RES_TYPE, + ) + wavfile.write( + os.path.join( + self.wavs16k_dir, + f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav", + ), + SAMPLE_RATE_16K, + chunk_16k.astype(np.float32), + ) + i += chunk_length - overlap_length + + def process_audio( + self, + path: str, + idx0: int, + sid: int, + cut_preprocess: str, + process_effects: bool, + noise_reduction: bool, + reduction_strength: float, + chunk_len: float, + overlap_len: float, + normalization_mode: str, + ): + audio_length = 0 + try: + audio = load_audio(path, self.sr) + audio_length = librosa.get_duration(y=audio, sr=self.sr) + + if process_effects: + audio = signal.lfilter(self.b_high, self.a_high, audio) + if normalization_mode == "pre": + audio = self._normalize_audio(audio) + if noise_reduction: + audio = nr.reduce_noise( + y=audio, + sr=self.sr, + prop_decrease=reduction_strength, + ) + if cut_preprocess == "Skip": + # no cutting + self.process_audio_segment( + audio, + sid, + idx0, + 0, + normalization_mode, + ) + elif cut_preprocess == 
"Simple": + # simple + self.simple_cut( + audio, + sid, + idx0, + chunk_len, + overlap_len, + normalization_mode, + ) + elif cut_preprocess == "Automatic": + idx1 = 0 + # legacy + for audio_segment in self.slicer.slice(audio): + i = 0 + while True: + start = int(self.sr * (PERCENTAGE - OVERLAP) * i) + i += 1 + if ( + len(audio_segment[start:]) + > (PERCENTAGE + OVERLAP) * self.sr + ): + tmp_audio = audio_segment[ + start : start + int(PERCENTAGE * self.sr) + ] + self.process_audio_segment( + tmp_audio, + sid, + idx0, + idx1, + normalization_mode, + ) + idx1 += 1 + else: + tmp_audio = audio_segment[start:] + self.process_audio_segment( + tmp_audio, + sid, + idx0, + idx1, + normalization_mode, + ) + idx1 += 1 + break + except Exception as error: + logger.error( # noqa: TRY400 + "Error processing audio: %s. One or more audio files may not be" + " included as pre-processed data.", + error, + ) + return audio_length + + +def format_duration(seconds): + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + seconds = int(seconds % 60) + return f"{hours:02}:{minutes:02}:{seconds:02}" + + +def save_dataset_duration_and_sample_rate( + file_path, + dataset_duration, + sample_rate, +) -> None: + try: + with pathlib.Path(file_path).open() as f: + data = json.load(f) + except FileNotFoundError: + data = {} + + formatted_duration = format_duration(dataset_duration) + new_data = { + "total_dataset_duration": formatted_duration, + "total_seconds": dataset_duration, + "sample_rate": sample_rate, + } + data.update(new_data) + + with pathlib.Path(file_path).open("w") as f: + json.dump(data, f, indent=4) + + +def process_audio_wrapper(args): + ( + pp, + file, + cut_preprocess, + process_effects, + noise_reduction, + reduction_strength, + chunk_len, + overlap_len, + normalization_mode, + ) = args + file_path, idx0, sid = file + return pp.process_audio( + file_path, + idx0, + sid, + cut_preprocess, + process_effects, + noise_reduction, + reduction_strength, + chunk_len, 
def get_file_hash(file: str, size: int = 5) -> str:
    """
    Return the hexadecimal BLAKE2b digest of a file's contents.

    Args:
        file: Path of the file to hash.
        size: Digest size in bytes; the hex string has ``2 * size`` characters.

    Returns:
        The hex-encoded digest of the file contents.

    """
    hasher = hashlib.blake2b(digest_size=size)
    with pathlib.Path(file).open("rb") as fp:
        # hashlib.file_digest() was only added in Python 3.11, which is newer
        # than the Python 3.10 runtime this project pins, so the file is
        # streamed through the hasher in fixed-size chunks instead. The digest
        # is identical to what file_digest() would produce.
        while chunk := fp.read(1 << 20):
            hasher.update(chunk)
    return hasher.hexdigest()
class Slicer:
    """
    A class for slicing audio waveforms into segments based on silence detection.

    Attributes:
        threshold (float): Linear RMS amplitude below which a frame counts as silent.
        hop_size (int): Hop size for RMS calculation, in samples.
        win_size (int): RMS analysis window, in samples.
        min_length (int): Minimum length of a segment, in frames (hops).
        min_interval (int): Minimum silence length that may trigger a cut, in frames.
        max_sil_kept (int): Maximum silence kept around a segment, in frames.

    Methods:
        slice(waveform): Slices the given waveform into segments.

    """

    def __init__(
        self,
        sr: int,
        threshold: float = -40.0,
        min_length: int = 5000,
        min_interval: int = 300,
        hop_size: int = 20,
        max_sil_kept: int = 5000,
    ):
        """
        Initializes a Slicer object.

        Args:
            sr (int): Sampling rate of the audio waveform.
            threshold (float, optional): RMS threshold for silence detection, in dB. Defaults to -40.0.
            min_length (int, optional): Minimum length of a segment, in milliseconds. Defaults to 5000.
            min_interval (int, optional): Minimum interval between segments, in milliseconds. Defaults to 300.
            hop_size (int, optional): Hop size for RMS calculation, in milliseconds. Defaults to 20.
            max_sil_kept (int, optional): Maximum length of silence to keep at the beginning or end of a segment, in milliseconds. Defaults to 5000.

        Raises:
            ValueError: If the input parameters are not valid.

        """
        if not min_length >= min_interval >= hop_size:
            raise ValueError("min_length >= min_interval >= hop_size is required")
        if not max_sil_kept >= hop_size:
            raise ValueError("max_sil_kept >= hop_size is required")

        # Convert time-based parameters to sample-based parameters.
        # hop_size/win_size end up in samples; the remaining lengths are
        # expressed in frames (multiples of hop_size).
        min_interval = sr * min_interval / 1000
        self.threshold = 10 ** (threshold / 20.0)
        self.hop_size = round(sr * hop_size / 1000)
        self.win_size = min(round(min_interval), 4 * self.hop_size)
        self.min_length = round(sr * min_length / 1000 / self.hop_size)
        self.min_interval = round(min_interval / self.hop_size)
        self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)

    def _apply_slice(self, waveform, begin, end):
        """
        Applies a slice to the waveform.

        Args:
            waveform (numpy.ndarray): The waveform to slice.
            begin (int): Start frame index.
            end (int): End frame index.

        Returns:
            The samples between the two frame indices; for arrays with more
            than one dimension the slice is taken along axis 1.

        """
        start_idx = begin * self.hop_size
        if len(waveform.shape) > 1:
            end_idx = min(waveform.shape[1], end * self.hop_size)
            return waveform[:, start_idx:end_idx]
        end_idx = min(waveform.shape[0], end * self.hop_size)
        return waveform[start_idx:end_idx]

    def slice(self, waveform):
        """
        Slices the given waveform into segments.

        Args:
            waveform (numpy.ndarray): The waveform to slice.

        Returns:
            list: The segments in order. The input itself is returned as the
            only element when it is too short or no cut point is found.

        """
        # Arrays with more than one dimension are averaged over axis 0 for analysis.
        samples = waveform.mean(axis=0) if len(waveform.shape) > 1 else waveform

        # self.min_length is measured in frames, so the sample count has to be
        # converted to frames (rounding up) before the comparison. Comparing
        # raw samples against frames let clips shorter than the minimum
        # segment length fall through and be trimmed further.
        total_hops = (samples.shape[0] + self.hop_size - 1) // self.hop_size
        if total_hops <= self.min_length:
            return [waveform]

        # Calculate RMS for each frame
        rms_list = get_rms(
            y=samples,
            frame_length=self.win_size,
            hop_length=self.hop_size,
        ).squeeze(0)

        # Detect silence segments and mark them
        sil_tags = []
        silence_start, clip_start = None, 0
        for i, rms in enumerate(rms_list):
            # If current frame is silent
            if rms < self.threshold:
                if silence_start is None:
                    silence_start = i
                continue

            # If current frame is not silent
            if silence_start is None:
                continue

            # Check if current silence segment is leading silence or need to slice
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
            need_slice_middle = (
                i - silence_start >= self.min_interval
                and i - clip_start >= self.min_length
            )

            # If not leading silence and not need to slice middle
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue

            # Handle different cases of silence segments
            if i - silence_start <= self.max_sil_kept:
                # Short silence: cut once, at the quietest frame
                pos = rms_list[silence_start : i + 1].argmin() + silence_start
                if silence_start == 0:
                    sil_tags.append((0, pos))
                else:
                    sil_tags.append((pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
                # Medium silence
                pos = rms_list[
                    i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
                ].argmin()
                pos += i - self.max_sil_kept
                pos_l = (
                    rms_list[
                        silence_start : silence_start + self.max_sil_kept + 1
                    ].argmin()
                    + silence_start
                )
                pos_r = (
                    rms_list[i - self.max_sil_kept : i + 1].argmin()
                    + i
                    - self.max_sil_kept
                )
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r
                else:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
                # Long silence
                pos_l = (
                    rms_list[
                        silence_start : silence_start + self.max_sil_kept + 1
                    ].argmin()
                    + silence_start
                )
                pos_r = (
                    rms_list[i - self.max_sil_kept : i + 1].argmin()
                    + i
                    - self.max_sil_kept
                )
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                else:
                    sil_tags.append((pos_l, pos_r))
                clip_start = pos_r
            silence_start = None

        # Handle trailing silence
        total_frames = rms_list.shape[0]
        if (
            silence_start is not None
            and total_frames - silence_start >= self.min_interval
        ):
            silence_end = min(total_frames, silence_start + self.max_sil_kept)
            pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
            sil_tags.append((pos, total_frames + 1))

        # Extract segments based on silence tags
        if not sil_tags:
            return [waveform]
        chunks = []
        if sil_tags[0][0] > 0:
            chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))

        for i in range(len(sil_tags) - 1):
            chunks.append(
                self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]),
            )

        if sil_tags[-1][1] < total_frames:
            chunks.append(
                self._apply_slice(waveform, sil_tags[-1][1], total_frames),
            )

        return chunks
def change_info(path, info, name):
    """
    Store a new ``info`` entry in a checkpoint and save a copy under ``logs``.

    Args:
        path: Checkpoint file to load.
        info: Value written to the checkpoint's ``"info"`` key.
        name: Name of the copy; when empty, the stem of ``path`` is used.

    Returns:
        ``"Success."`` or an ``"Error: ..."`` message describing the failure.

    """
    try:
        checkpoint = torch.load(path, map_location="cpu", weights_only=False)
        checkpoint["info"] = info

        # Fall back to the source file's stem when no name was supplied.
        model_name = name if name else os.path.splitext(os.path.basename(path))[0]

        output_dir = os.path.join("logs", model_name)
        pathlib.Path(output_dir).mkdir(exist_ok=True, parents=True)
        output_file = os.path.join(output_dir, f"{model_name}.pth")
        torch.save(checkpoint, output_file)
    except Exception as error:
        print(f"An error occurred while changing the info: {error}")
        return f"Error: {error}"
    return "Success."
Did you run" + " preprocessing and feature extraction steps?", + feature_dir, + ) + sys.exit(1) + model_name = os.path.basename(exp_dir) + + index_filename_added = f"{model_name}.index" + index_filepath_added = os.path.join(exp_dir, index_filename_added) + + if pathlib.Path(index_filepath_added).exists(): + pass + else: + npys = [] + listdir_res = sorted(os.listdir(feature_dir)) + + for name in listdir_res: + file_path = os.path.join(feature_dir, name) + phone = np.load(file_path) + npys.append(phone) + + big_npy = np.concatenate(npys, axis=0) + + big_npy_idx = np.arange(big_npy.shape[0]) + np.random.shuffle(big_npy_idx) + big_npy = big_npy[big_npy_idx] + + if big_npy.shape[0] > 2e5 and ( + index_algorithm == "Auto" or index_algorithm == "KMeans" + ): + big_npy = ( + MiniBatchKMeans( + n_clusters=10000, + verbose=True, + batch_size=256 * cpu_count(), + compute_labels=False, + init="random", + ) + .fit(big_npy) + .cluster_centers_ + ) + + n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39) + + # index_added + index_added = faiss.index_factory(768, f"IVF{n_ivf},Flat") + index_ivf_added = faiss.extract_index_ivf(index_added) + index_ivf_added.nprobe = 1 + index_added.train(big_npy) + + batch_size_add = 8192 + for i in range(0, big_npy.shape[0], batch_size_add): + index_added.add(big_npy[i : i + batch_size_add]) + + faiss.write_index(index_added, index_filepath_added) + logger.info("Saved index file '%s'", index_filepath_added) + + except Exception as error: + logger.error( # noqa: TRY400 + "An error occurred extracting the index: %s. 
def replace_keys_in_dict(d, old_key_part, new_key_part):
    """
    Return a copy of ``d`` with ``old_key_part`` replaced in every key.

    Nested dictionaries are processed recursively, and an ``OrderedDict``
    input yields an ``OrderedDict`` result.
    """
    updated_dict = OrderedDict() if isinstance(d, OrderedDict) else {}
    for key, value in d.items():
        new_key = key.replace(old_key_part, new_key_part)
        if isinstance(value, dict):
            value = replace_keys_in_dict(value, old_key_part, new_key_part)
        updated_dict[new_key] = value
    return updated_dict


def extract_model(
    ckpt,
    sr,
    name,
    model_path,
    epoch,
    step,
    hps,
    overtrain_info,
    vocoder,
    pitch_guidance=True,
    version="v2",
):
    """
    Save an inference-ready copy of a generator state dict to ``model_path``.

    Weights are stored in half precision, entries whose key contains
    ``"enc_q"`` are dropped, and the ``.parametrizations.weight.original0/1``
    key parts are renamed to ``.weight_g`` / ``.weight_v``. Metadata found in
    a ``model_info.json`` next to ``model_path`` is copied into the file.

    Args:
        ckpt: Mapping of parameter names to tensors.
        sr: Sample rate stored under ``"sr"``.
        name: Model name stored under ``"model_name"``.
        model_path: Destination file; its directory is created if needed.
        epoch: Epoch number stored in the file.
        step: Step number stored in the file.
        hps: Hyper-parameters; ``hps.data`` and ``hps.model`` fill ``"config"``.
        overtrain_info: Stored under ``"overtrain_info"``.
        vocoder: Stored under ``"vocoder"``.
        pitch_guidance: Stored under ``"f0"``.
        version: Stored under ``"version"``.

    Any exception is logged rather than raised.
    """
    try:
        model_dir = os.path.dirname(model_path)
        pathlib.Path(model_dir).mkdir(exist_ok=True, parents=True)

        # Every metadata field needs a fallback: previously only
        # dataset_length had one, so a missing model_info.json caused a
        # NameError further down and the model was silently never written.
        # The fallbacks equal the defaults used when a key is absent.
        dataset_length = None
        embedder_model = None
        speakers_id = 1
        model_author = None

        info_path = pathlib.Path(os.path.join(model_dir, "model_info.json"))
        if info_path.exists():
            with info_path.open("r") as f:
                data = json.load(f)
            dataset_length = data.get("total_dataset_duration", None)
            embedder_model = data.get("embedder_model", None)
            speakers_id = data.get("speakers_id", 1)
            model_author = data.get("model_author", None)

        opt = OrderedDict(
            weight={
                key: value.half() for key, value in ckpt.items() if "enc_q" not in key
            },
        )
        opt["config"] = [
            hps.data.filter_length // 2 + 1,
            32,
            hps.model.inter_channels,
            hps.model.hidden_channels,
            hps.model.filter_channels,
            hps.model.n_heads,
            hps.model.n_layers,
            hps.model.kernel_size,
            hps.model.p_dropout,
            hps.model.resblock,
            hps.model.resblock_kernel_sizes,
            hps.model.resblock_dilation_sizes,
            hps.model.upsample_rates,
            hps.model.upsample_initial_channel,
            hps.model.upsample_kernel_sizes,
            hps.model.spk_embed_dim,
            hps.model.gin_channels,
            hps.data.sample_rate,
        ]

        opt["epoch"] = epoch
        opt["step"] = step
        opt["sr"] = sr
        opt["f0"] = pitch_guidance
        opt["version"] = version
        opt["creation_date"] = datetime.datetime.now().isoformat()

        hash_input = f"{name}-{epoch}-{step}-{sr}-{version}-{opt['config']}"
        opt["model_hash"] = hashlib.sha256(hash_input.encode()).hexdigest()
        opt["overtrain_info"] = overtrain_info
        opt["dataset_length"] = dataset_length
        opt["model_name"] = name
        opt["author"] = model_author
        opt["embedder_model"] = embedder_model
        opt["speakers_id"] = speakers_id
        opt["vocoder"] = vocoder
        torch.save(
            replace_keys_in_dict(
                replace_keys_in_dict(
                    opt,
                    ".parametrizations.weight.original1",
                    ".weight_v",
                ),
                ".parametrizations.weight.original0",
                ".weight_g",
            ),
            model_path,
        )
        logger.info("Saved model '%s' (epoch %s and step %s)", model_path, epoch, step)

    except Exception as error:
        logger.error("An error occurred extracting the model: %s", error)
f"Model {path1} and {path2} are merged with alpha {ratio}." + ckpt1 = torch.load(path1, map_location="cpu", weights_only=False) + ckpt2 = torch.load(path2, map_location="cpu", weights_only=False) + + sr1 = str(ckpt1["sr"]).lower().replace("k", "000") + sr2 = str(ckpt2["sr"]).lower().replace("k", "000") + + if sr1 != sr2: + print( + f"Sample rate of {path1} {sr1} does not match the sample rate of" + f" {path2} {sr2}." + ) + return "The sample rates of the two models are not the same." + + cfg = ckpt1["config"] + cfg_f0 = ckpt1["f0"] + cfg_version = ckpt1["version"] + cfg_sr = sr1 + vocoder = ckpt1.get("vocoder", "HiFi-GAN") + + if "model" in ckpt1: + ckpt1 = extract(ckpt1) + else: + ckpt1 = ckpt1["weight"] + if "model" in ckpt2: + ckpt2 = extract(ckpt2) + else: + ckpt2 = ckpt2["weight"] + + if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())): + return "Fail to merge the models. The model architectures are not the same." + + opt = OrderedDict() + opt["weight"] = {} + for key in ckpt1.keys(): + if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape: + min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0]) + opt["weight"][key] = ( + ratio * (ckpt1[key][:min_shape0].float()) + + (1 - ratio) * (ckpt2[key][:min_shape0].float()) + ).half() + else: + opt["weight"][key] = ( + ratio * (ckpt1[key].float()) + (1 - ratio) * (ckpt2[key].float()) + ).half() + + opt["config"] = cfg + opt["sr"] = cfg_sr + opt["f0"] = cfg_f0 + opt["version"] = cfg_version + opt["info"] = message + + opt["vocoder"] = vocoder + torch.save(opt, os.path.join("logs", f"{name}.pth")) + print(message) + return message, os.path.join("logs", f"{name}.pth") + except Exception as error: + print(f"An error occurred blending the models: {error}") + return error diff --git a/rvc_logic/rvc/train/process/model_information.py b/rvc_logic/rvc/train/process/model_information.py new file mode 100644 index 0000000000000000000000000000000000000000..74a5e7729f70ab7483e4ea4a9516a3f5e569e4d5 --- 
def prettify_date(date_str):
    """
    Convert an ISO-8601 timestamp into ``YYYY-MM-DD HH:MM:SS``.

    Args:
        date_str: Timestamp string, or ``None``.

    Returns:
        The reformatted timestamp, ``"None"`` for ``None`` input, or
        ``"Invalid date format"`` when the string cannot be parsed.

    """
    if date_str is None:
        return "None"
    # datetime.isoformat() leaves out the fractional part when the
    # microsecond field is zero, so both spellings have to be accepted.
    for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S"):
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue
    return "Invalid date format"


def model_information(path):
    """
    Load a model file and return a human-readable summary of its metadata.

    Args:
        path: Model file to load with ``torch.load``.

    Returns:
        A multi-line string with one ``Label: value`` entry per line.

    """
    model_data = torch.load(path, map_location="cpu", weights_only=False)

    print(f"Loaded model from {path}")

    model_name = model_data.get("model_name", "None")
    epochs = model_data.get("epoch", "None")
    steps = model_data.get("step", "None")
    sr = model_data.get("sr", "None")
    f0 = model_data.get("f0", "None")
    dataset_length = model_data.get("dataset_length", "None")
    vocoder = model_data.get("vocoder", "None")
    creation_date = model_data.get("creation_date", "None")
    model_hash = model_data.get("model_hash", None)
    overtrain_info = model_data.get("overtrain_info", "None")
    model_author = model_data.get("author", "None")
    embedder_model = model_data.get("embedder_model", "None")
    speakers_id = model_data.get("speakers_id", 0)

    creation_date_str = prettify_date(creation_date) if creation_date else "None"

    return (
        f"Model Name: {model_name}\n"
        f"Model Creator: {model_author}\n"
        f"Epochs: {epochs}\n"
        f"Steps: {steps}\n"
        f"Vocoder: {vocoder}\n"
        f"Sampling Rate: {sr}\n"
        f"Dataset Length: {dataset_length}\n"
        f"Creation Date: {creation_date_str}\n"
        f"Overtrain Info: {overtrain_info}\n"
        f"Embedder Model: {embedder_model}\n"
        # The newline was missing here, which fused "Hash:" onto this line.
        f"Max Speakers ID: {speakers_id}\n"
        f"Hash: {model_hash}\n"
    )
+import shutil +import signal +import sys +from collections import deque +from random import randint, shuffle +from time import time as ttime + +import numpy as np +from tqdm import tqdm + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +now_dir = pathlib.Path.cwd() +sys.path.append(os.path.join(str(now_dir))) + +# Zluda hijack +import rvc_logic.rvc.lib.zluda +from rvc_logic.common import TRAINING_MODELS_DIR +from rvc_logic.rvc.common import RVC_TRAINING_MODELS_DIR +from rvc_logic.rvc.lib.algorithm import commons +from rvc_logic.rvc.train.losses import ( + discriminator_loss, + feature_loss, + generator_loss, + kl_loss, +) +from rvc_logic.rvc.train.mel_processing import ( + MultiScaleMelSpectrogramLoss, + mel_spectrogram_torch, + spec_to_mel_torch, +) +from rvc_logic.rvc.train.process.extract_model import extract_model +from rvc_logic.rvc.train.utils import ( + HParams, + latest_checkpoint_path, + load_checkpoint, + load_wav_to_torch, + plot_spectrogram_to_numpy, + remove_sox_libmso6_from_ld_preload, + save_checkpoint, + summarize, +) + +logging.getLogger("torch").setLevel(logging.ERROR) +logger = logging.getLogger(__name__) + +torch.backends.cudnn.deterministic = False +torch.backends.cudnn.benchmark = True +torch.multiprocessing.set_start_method("spawn", force=True) +os.environ["USE_LIBUV"] = "0" if sys.platform == "win32" else "1" + +randomized = True +d_lr_coeff = 1.0 +g_lr_coeff = 1.0 +d_step_per_g_step = 1 +bf16_adamw = False +global_step = 0 +lowest_g_value = {"value": float("inf"), "epoch": 0} +lowest_d_value = {"value": float("inf"), "epoch": 0} +consecutive_increases_gen = 0 +consecutive_increases_disc = 0 + +avg_losses = { + "grad_d_50": deque(maxlen=50), + "grad_g_50": deque(maxlen=50), + "disc_loss_50": deque(maxlen=50), + "adv_loss_50": deque(maxlen=50), + 
def main(
    model_name: str,
    sample_rate: int | None,
    vocoder: str,
    total_epoch: int,
    batch_size: int,
    save_every_epoch: int,
    save_only_latest: bool,
    save_every_weights: bool,
    pretrain_g: str,
    pretrain_d: str,
    overtraining_detector: bool,
    overtraining_threshold: int,
    cleanup: bool,
    cache_data_in_gpu: bool,
    checkpointing: bool,
    device_type: str,
    gpus: set[int] | None,
    precision: str = "fp32",
) -> None:
    """
    Start the training process.

    Loads ``config.json`` from the model's experiment directory, checks the
    sample rate of the first sliced wav file, optionally removes files from a
    prior attempt, then spawns one ``run`` process per entry in ``gpus``
    (``{0}`` when none are given) and waits for all of them.

    Args:
        sample_rate: Expected sample rate; ``None`` uses the value in the config.
        cleanup: Remove ``.0``/``.pth``/``.index`` files and the ``eval``
            directory from the experiment directory before training.
        device_type: Passed to ``torch.device``.
        gpus: Device ids to train on.
        precision: ``"bf16"`` or ``"fp16"`` select reduced precision when CUDA
            supports it; anything else trains in float32.

    The remaining arguments are forwarded unchanged to ``run``.

    Raises:
        RuntimeError: If the sample rate of the pretrained model does not match
            the dataset audio sample rate, or if a training process exits with
            a code other than success or the cancel signal.
        SystemExit: If the config file does not exist.

    """
    remove_sox_libmso6_from_ld_preload()
    experiment_dir = os.path.join(TRAINING_MODELS_DIR, model_name)
    config_save_path = os.path.join(experiment_dir, "config.json")

    # Use a Manager to create a shared list
    manager = mp.Manager()
    global_gen_loss = manager.list([0] * total_epoch)
    global_disc_loss = manager.list([0] * total_epoch)

    try:
        with pathlib.Path(config_save_path).open() as f:
            config = json.load(f)
        config = HParams(**config)
    except FileNotFoundError:
        logger.error(
            "Config file not found at %s. Did you run preprocessing and feature"
            " extraction steps?",
            config_save_path,
        )
        sys.exit(1)
    sample_rate = config.data.sample_rate if sample_rate is None else sample_rate

    if (
        precision == "bf16"
        and torch.cuda.is_available()
        and torch.cuda.is_bf16_supported()
    ):
        train_dtype = torch.bfloat16
    elif precision == "fp16" and torch.cuda.is_available():
        train_dtype = torch.float16
    else:
        train_dtype = torch.float32

    config.data.training_files = os.path.join(experiment_dir, "filelist.txt")

    # Set up distributed training environment for master node.
    # master node is localhost because we are running on a single local
    # machine. master port is randomly selected
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(randint(20000, 55555))
    logger.info("MASTER_PORT: %s", os.environ["MASTER_PORT"])
    # Check sample rate
    wavs = glob.glob(
        os.path.join(experiment_dir, "sliced_audios", "*.wav"),
    )
    if wavs:
        _, sr = load_wav_to_torch(wavs[0])
        if sr != sample_rate:
            error_message = (
                f"Error: Pretrained model sample rate ({sample_rate} Hz) does not match"
                f" dataset audio sample rate ({sr} Hz)."
            )
            raise RuntimeError(error_message)
    else:
        logger.warning("No wav file found.")

    device = torch.device(device_type)
    gpus = gpus or {0}
    n_gpus = len(gpus)

    if device.type == "cpu":
        logger.warning("Training with CPU, this will take a long time.")

    def start() -> None:
        """Start the training process with multi-GPU support or CPU."""
        # Each process is stored together with the device it was started for,
        # so a failure can be attributed to the right device later on.
        children = []
        pid_data = {"process_pids": []}
        with pathlib.Path(config_save_path).open("r") as pid_file:
            try:
                existing_data = json.load(pid_file)
                pid_data.update(existing_data)
            except json.JSONDecodeError:
                pass
        with pathlib.Path(config_save_path).open("w") as pid_file:
            for rank, device_id in enumerate(gpus):
                subproc = mp.Process(
                    target=run,
                    args=(
                        rank,
                        n_gpus,
                        experiment_dir,
                        pretrain_g,
                        pretrain_d,
                        total_epoch,
                        save_every_weights,
                        config,
                        device,
                        device_id,
                        model_name,
                        sample_rate,
                        vocoder,
                        batch_size,
                        save_every_epoch,
                        save_only_latest,
                        overtraining_detector,
                        overtraining_threshold,
                        checkpointing,
                        cache_data_in_gpu,
                        global_gen_loss,
                        global_disc_loss,
                        train_dtype,
                    ),
                )
                children.append((device_id, subproc))
                subproc.start()
                pid_data["process_pids"].append(subproc.pid)
            json.dump(pid_data, pid_file, indent=4)
        cancel_signal = signal.SIGTERM if os.name == "nt" else -signal.SIGTERM
        error_codes = []
        for child_device_id, child in children:
            child.join()
            exit_code = child.exitcode
            if exit_code != 0:
                # Report the device of this process; the loop variable left
                # over from spawning always named the last device.
                logger.warning(
                    "Process running on device %s exited with code %s.",
                    child_device_id,
                    exit_code,
                )
                if exit_code != cancel_signal:
                    error_codes.append(exit_code)
        if error_codes:
            err_msg = (
                "One or more training processes failed. See the logs or console for"
                " details."
            )
            raise RuntimeError(err_msg)

    if cleanup:
        logger.info("Removing files from the prior training attempt...")

        # Clean up unnecessary files
        for entry in os.scandir(os.path.join(TRAINING_MODELS_DIR, model_name)):
            if entry.is_file():
                _, file_extension = os.path.splitext(entry.name)
                if file_extension in {".0", ".pth", ".index"}:
                    pathlib.Path(entry.path).unlink()
            elif entry.is_dir() and entry.name == "eval":
                shutil.rmtree(entry.path)

        logger.info("Cleanup done!")
    start()
+ dist.init_process_group( + backend="gloo" if sys.platform == "win32" or device.type != "cuda" else "nccl", + init_method="env://", + world_size=n_gpus if device.type == "cuda" else 1, + rank=rank if device.type == "cuda" else 0, + ) + + torch.manual_seed(config.train.seed) + + if device.type == "cuda": + torch.cuda.set_device(device_id) + + # Create datasets and dataloaders + from rvc_logic.rvc.train.data_utils import ( + DistributedBucketSampler, + TextAudioCollateMultiNSFsid, + TextAudioLoaderMultiNSFsid, + ) + + train_dataset = TextAudioLoaderMultiNSFsid(config.data) + collate_fn = TextAudioCollateMultiNSFsid() + train_sampler = DistributedBucketSampler( + train_dataset, + batch_size * n_gpus, + [50, 100, 200, 300, 400, 500, 600, 700, 800, 900], + num_replicas=n_gpus, + rank=rank, + shuffle=True, + ) + + train_loader = DataLoader( + train_dataset, + num_workers=4, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + batch_sampler=train_sampler, + persistent_workers=True, + prefetch_factor=8, + ) + if len(train_loader) < 3: + logger.error( + "Not enough data in the training set. Perhaps you forgot to slice the" + " audio files in preprocess?", + ) + sys.exit(1) + + # defaults + embedder_name = "contentvec" + spk_dim = config.model.spk_embed_dim # 109 default speakers + + model_info_path = os.path.join(experiment_dir, "model_info.json") + try: + with pathlib.Path(model_info_path).open("r") as f: + model_info = json.load(f) + embedder_name = model_info["embedder_model"] + spk_dim = model_info["speakers_id"] + except Exception as e: + logger.error("Could not load model info file: %s. 
Using defaults.", e) + + # Try to load speaker dim from latest checkpoint or pretrain_g + try: + last_g = latest_checkpoint_path(experiment_dir, "G_*.pth") + chk_path = last_g or (pretrain_g if pretrain_g not in {"", "None"} else None) + + if chk_path: + ckpt = torch.load(chk_path, map_location="cpu", weights_only=False) + spk_dim = ckpt["model"]["emb_g.weight"].shape[0] + del ckpt + except Exception as e: + logger.error( + "Failed to load checkpoint: %s. Using default number of speakers.", e + ) + + # update config before the model init + logger.info("Initializing the generator with %s speakers.", spk_dim) + config.model.spk_embed_dim = spk_dim + + # Initialize models and optimizers + from rvc_logic.rvc.lib.algorithm.discriminators import MultiPeriodDiscriminator + from rvc_logic.rvc.lib.algorithm.synthesizers import Synthesizer + + # NOTE checkingpointing here means whether or not activations are + # saved during forward pass for backpropagation during backward pass + + net_g = Synthesizer( + config.data.filter_length // 2 + 1, + config.train.segment_size // config.data.hop_length, + **config.model, + use_f0=True, + sr=sample_rate, + vocoder=vocoder, + checkpointing=checkpointing, + randomized=randomized, + ) + if vocoder == "RefineGAN": + disc_version = "v3" + fn_mel_loss = MultiScaleMelSpectrogramLoss(sample_rate=sample_rate) + logger.info("Using Multi-Scale Mel loss function") + else: + disc_version = "v2" + fn_mel_loss = torch.nn.L1Loss() + logger.info("Using Single-Scale Mel loss function") + net_d = MultiPeriodDiscriminator( + config.model.use_spectral_norm, + checkpointing=checkpointing, + version=disc_version, + ) + + if device.type == "cuda": + net_g = net_g.cuda(device_id) + net_d = net_d.cuda(device_id) + else: + net_g = net_g.to(device) + net_d = net_d.to(device) + + if bf16_adamw and train_dtype == torch.bfloat16: + logger.info("Using BFloat16 AdamW optimizer") + from rvc_logic.rvc.train.anyprecision_optimizer import AnyPrecisionAdamW + + optimizer = 
AnyPrecisionAdamW + else: + logger.info("Using AdamW optimizer") + optimizer = torch.optim.AdamW + + optim_g = optimizer( + net_g.parameters(), + config.train.learning_rate * g_lr_coeff, + betas=config.train.betas, + eps=config.train.eps, + ) + optim_d = optimizer( + net_d.parameters(), + config.train.learning_rate * d_lr_coeff, + betas=config.train.betas, + eps=config.train.eps, + ) + + # Wrap models with DDP for multi-gpu processing + if n_gpus > 1 and device.type == "cuda": + net_g = DDP(net_g, device_ids=[device_id]) + net_d = DDP(net_d, device_ids=[device_id]) + + if rank == 0 and device.type == "cuda" and train_dtype == torch.bfloat16: + logger.info("Using BFloat16 for training.") + elif rank == 0 and device.type == "cuda" and train_dtype == torch.float16: + logger.info("Using Float16 for training.") + + # Load checkpoint if available + scaler_dict = {} + try: + _, _, _, epoch_str, lowest_d_value, consecutive_increases_disc, scaler_dict = ( + load_checkpoint( + latest_checkpoint_path(experiment_dir, "D_*.pth"), + net_d, + optim_d, + ) + ) + _, _, _, epoch_str, lowest_g_value, consecutive_increases_gen, _ = ( + load_checkpoint( + latest_checkpoint_path(experiment_dir, "G_*.pth"), + net_g, + optim_g, + ) + ) + epoch_str += 1 + global_step = (epoch_str - 1) * len(train_loader) + logger.info("Resumed from epoch %s", epoch_str) + logger.info( + "Loaded lowest generator loss %.3f at epoch %s, lowest discriminator loss" + " %.3f at epoch %s", + lowest_g_value["value"], + lowest_g_value["epoch"], + lowest_d_value["value"], + lowest_d_value["epoch"], + ) + logger.info( + "Loaded consecutive increases gen %d, consecutive increases disc %d", + consecutive_increases_gen, + consecutive_increases_disc, + ) + + except Exception: + epoch_str = 1 + global_step = 0 + if pretrain_g not in {"", "None"}: + if rank == 0: + logger.info("Loaded pretrained (G) '%s'", pretrain_g) + try: + ckpt = torch.load(pretrain_g, map_location="cpu", weights_only=False)[ + "model" + ] + if 
hasattr(net_g, "module"): + net_g.module.load_state_dict(ckpt) + else: + net_g.load_state_dict(ckpt) + del ckpt + except RuntimeError: + logger.error( # noqa: TRY400 + "The parameters of the pretrain model such as the sample rate or" + " architecture do not match the selected model.", + ) + sys.exit(1) + + if pretrain_d not in {"", "None"}: + if rank == 0: + logger.info("Loaded pretrained (D) '%s'", pretrain_d) + try: + ckpt = torch.load(pretrain_d, map_location="cpu", weights_only=False)[ + "model" + ] + if hasattr(net_d, "module"): + net_d.module.load_state_dict(ckpt) + else: + net_d.load_state_dict(ckpt) + del ckpt + except RuntimeError: + logger.error( # noqa: TRY400 + "The parameters of the pretrain model such as the sample rate or" + " architecture do not match the selected model.", + ) + sys.exit(1) + + # Initialize schedulers + scheduler_g = torch.optim.lr_scheduler.ExponentialLR( + optim_g, + gamma=config.train.lr_decay, + last_epoch=epoch_str - 2, + ) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR( + optim_d, + gamma=config.train.lr_decay, + last_epoch=epoch_str - 2, + ) + + use_scaler = device.type == "cuda" and train_dtype == torch.float16 + scaler = torch.amp.GradScaler(enabled=use_scaler) + if len(scaler_dict) > 0: + scaler.load_state_dict(scaler_dict) + + cache = [] + # collect the reference audio for tensorboard evaluation + if pathlib.Path( + os.path.join(RVC_TRAINING_MODELS_DIR, "reference", embedder_name, "feats.npy") + ).is_file(): + logger.info("Using %s reference set for validation", embedder_name) + phone = np.load( + os.path.join( + RVC_TRAINING_MODELS_DIR, + "reference", + embedder_name, + "feats.npy", + ), + ) + # expanding x2 to match pitch size + phone = np.repeat(phone, 2, axis=0) + phone_lengths = torch.LongTensor([phone.shape[0]]).to(device) + phone = torch.FloatTensor(phone).unsqueeze(0).to(device) + pitch = np.load( + os.path.join( + RVC_TRAINING_MODELS_DIR, + "reference", + "pitch_coarse.npy", + ), + ) + # removed last 
frame to match features + pitch = torch.LongTensor(pitch[:-1]).unsqueeze(0).to(device) + pitchf = np.load( + os.path.join( + RVC_TRAINING_MODELS_DIR, + "reference", + "pitch_fine.npy", + ), + ) + # removed last frame to match features + pitchf = torch.FloatTensor(pitchf[:-1]).unsqueeze(0).to(device) + sid = torch.LongTensor([0]).to(device) + reference = ( + phone, + phone_lengths, + pitch, + pitchf, + sid, + ) + else: + logger.info( + "No custom reference found, using a default audio sample for validation" + ) + info = next(iter(train_loader)) + phone, phone_lengths, pitch, pitchf, _, _, _, _, sid = info + reference = ( + phone.to(device), + phone_lengths.to(device), + pitch.to(device), + pitchf.to(device), + sid.to(device), + ) + if epoch_str > custom_total_epoch: + cleanup_training_processes(experiment_dir) + return + logger.info("Starting training...") + for epoch in range(epoch_str, custom_total_epoch + 1): + train_and_evaluate( + rank, + epoch, + config, + [net_g, net_d], + [optim_g, optim_d], + [scheduler_g, scheduler_d], + [train_loader, None], + [writer_eval], + cache, + custom_save_every_weights, + custom_total_epoch, + device, + device_id, + reference, + fn_mel_loss, + model_name, + experiment_dir, + sample_rate, + vocoder, + save_every_epoch, + save_only_latest, + overtraining_detector, + overtraining_threshold, + cache_data_in_gpu, + global_gen_loss, + global_disc_loss, + scaler, + train_dtype, + ) + + +def train_and_evaluate( + rank, + epoch, + config, + nets, + optims, + schedulers, + loaders, + writers, + cache, + custom_save_every_weights, + custom_total_epoch, + device, + device_id, + reference, + fn_mel_loss, + model_name, + experiment_dir, + sample_rate, + vocoder, + save_every_epoch, + save_only_latest, + overtraining_detector, + overtraining_threshold, + cache_data_in_gpu, + global_gen_loss, + global_disc_loss, + scaler, + train_dtype, +) -> None: + """Train and evaluates the model for one epoch.""" + global global_step, lowest_g_value, 
lowest_d_value, consecutive_increases_gen, consecutive_increases_disc + + model_add = [] + checkpoint_idxs = [] + done = False + + net_g, net_d = nets + optim_g, optim_d = optims + scheduler_g, scheduler_d = schedulers + train_loader = loaders[0] if loaders is not None else None + if writers is not None: + writer = writers[0] + + train_loader.batch_sampler.set_epoch(epoch) + + net_g.train() + net_d.train() + + use_amp = device.type == "cuda" and (train_dtype in {torch.bfloat16, torch.float16}) + + # Data caching + if device.type == "cuda" and cache_data_in_gpu: + if cache == []: + for batch_idx, info in enumerate(train_loader): + # phone, phone_lengths, pitch, pitchf, spec, spec_lengths, wave, wave_lengths, sid + info = [tensor.cuda(device_id, non_blocking=True) for tensor in info] + cache.append((batch_idx, info)) + shuffle(cache) + data_iterator = cache + else: + data_iterator = enumerate(train_loader) + + epoch_recorder = EpochRecorder() + with tqdm(total=len(train_loader), leave=False) as pbar: + for batch_idx, info in data_iterator: + if device.type == "cuda" and not cache_data_in_gpu: + info = [tensor.cuda(device_id, non_blocking=True) for tensor in info] + elif device.type != "cuda": + info = [tensor.to(device) for tensor in info] + # else iterator is going thru a cached list with a device already assigned + + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = info + + with torch.amp.autocast( + device_type="cuda", enabled=use_amp, dtype=train_dtype + ): + # Forward pass + model_output = net_g( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + sid, + ) + y_hat, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q) = ( + model_output + ) + # slice of the original waveform to match a generate slice + if randomized: + wave = commons.slice_segments( + wave, + ids_slice * config.data.hop_length, + config.train.segment_size, + dim=3, + ) + for _ in range(d_step_per_g_step): # 
default x1 + with torch.amp.autocast( + device_type="cuda", enabled=use_amp, dtype=train_dtype + ): + y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach()) + loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g) + # Discriminator backward and update + global_disc_loss[epoch - 1] += loss_disc.item() + optim_d.zero_grad() + if device.type == "cuda" and train_dtype == torch.float16: + scaler.scale(loss_disc).backward() + scaler.unscale_(optim_d) + grad_norm_d = commons.grad_norm(net_d.parameters()) + scaler.step(optim_d) + else: + loss_disc.backward() + grad_norm_d = commons.grad_norm(net_d.parameters()) + optim_d.step() + + with torch.amp.autocast( + device_type="cuda", enabled=use_amp, dtype=train_dtype + ): + # Generator backward and update + _, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat) + + if vocoder == "RefineGAN": + loss_mel = fn_mel_loss(wave, y_hat) * config.train.c_mel / 3.0 + else: + wave_mel = mel_spectrogram_torch( + wave.float().squeeze(1), + config.data.filter_length, + config.data.n_mel_channels, + config.data.sample_rate, + config.data.hop_length, + config.data.win_length, + config.data.mel_fmin, + config.data.mel_fmax, + ) + y_hat_mel = mel_spectrogram_torch( + y_hat.float().squeeze(1), + config.data.filter_length, + config.data.n_mel_channels, + config.data.sample_rate, + config.data.hop_length, + config.data.win_length, + config.data.mel_fmin, + config.data.mel_fmax, + ) + loss_mel = fn_mel_loss(wave_mel, y_hat_mel) * config.train.c_mel + loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl + loss_fm = feature_loss(fmap_r, fmap_g) + loss_gen, _ = generator_loss(y_d_hat_g) + loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + global_gen_loss[epoch - 1] += loss_gen_all.item() + optim_g.zero_grad() + if device.type == "cuda" and train_dtype == torch.float16: + scaler.scale(loss_gen_all).backward() + scaler.unscale_(optim_g) + grad_norm_g = commons.grad_norm(net_g.parameters()) + scaler.step(optim_g) + 
scaler.update() + else: + loss_gen_all.backward() + grad_norm_g = commons.grad_norm(net_g.parameters()) + optim_g.step() + + global_step += 1 + + # queue for rolling losses over 50 steps + avg_losses["grad_d_50"].append(grad_norm_d) + avg_losses["grad_g_50"].append(grad_norm_g) + avg_losses["disc_loss_50"].append(loss_disc.detach()) + avg_losses["adv_loss_50"].append(loss_gen.detach()) + avg_losses["fm_loss_50"].append(loss_fm.detach()) + avg_losses["kl_loss_50"].append(loss_kl.detach()) + avg_losses["mel_loss_50"].append(loss_mel.detach()) + avg_losses["gen_loss_50"].append(loss_gen_all.detach()) + + if rank == 0 and global_step % 50 == 0: + # logging rolling averages + scalar_dict = { + "grad_avg_50/norm_d": ( + sum(avg_losses["grad_d_50"]) / len(avg_losses["grad_d_50"]) + ), + "grad_avg_50/norm_g": ( + sum(avg_losses["grad_g_50"]) / len(avg_losses["grad_g_50"]) + ), + "loss_avg_50/d/adv": torch.mean( + torch.stack(list(avg_losses["disc_loss_50"])), + ), + "loss_avg_50/g/adv": torch.mean( + torch.stack(list(avg_losses["adv_loss_50"])), + ), + "loss_avg_50/g/fm": torch.mean( + torch.stack(list(avg_losses["fm_loss_50"])), + ), + "loss_avg_50/g/kl": torch.mean( + torch.stack(list(avg_losses["kl_loss_50"])), + ), + "loss_avg_50/g/mel": torch.mean( + torch.stack(list(avg_losses["mel_loss_50"])), + ), + "loss_avg_50/g/total": torch.mean( + torch.stack(list(avg_losses["gen_loss_50"])), + ), + } + summarize( + writer=writer, + global_step=global_step, + scalars=scalar_dict, + ) + + pbar.update(1) + # end of batch train + # end of tqdm + scheduler_d.step() + scheduler_g.step() + + with torch.no_grad(): + torch.cuda.empty_cache() + # Logging and checkpointing + if rank == 0: + avg_global_disc_loss = global_disc_loss[epoch - 1] / len(train_loader.dataset) + avg_global_gen_loss = global_gen_loss[epoch - 1] / len(train_loader.dataset) + + min_delta = 0.004 + + if avg_global_disc_loss < lowest_d_value["value"] - min_delta: + lowest_d_value = {"value": avg_global_disc_loss, 
"epoch": epoch} + consecutive_increases_disc = 0 + else: + consecutive_increases_disc += 1 + + if avg_global_gen_loss < lowest_g_value["value"] - min_delta: + logger.info( + "New best epoch %d with average generator loss %.3f and discriminator" + " loss %.3f", + epoch, + avg_global_gen_loss, + avg_global_disc_loss, + ) + lowest_g_value = {"value": avg_global_gen_loss, "epoch": epoch} + consecutive_increases_gen = 0 + model_add.append( + os.path.join(experiment_dir, f"{model_name}_best.pth"), + ) + else: + consecutive_increases_gen += 1 + + # used for tensorboard chart - all/mel + mel = spec_to_mel_torch( + spec, + config.data.filter_length, + config.data.n_mel_channels, + config.data.sample_rate, + config.data.mel_fmin, + config.data.mel_fmax, + ) + # used for tensorboard chart - slice/mel_org + if randomized: + y_mel = commons.slice_segments( + mel, + ids_slice, + config.train.segment_size // config.data.hop_length, + dim=3, + ) + else: + y_mel = mel + # used for tensorboard chart - slice/mel_gen + y_hat_mel = mel_spectrogram_torch( + y_hat.float().squeeze(1), + config.data.filter_length, + config.data.n_mel_channels, + config.data.sample_rate, + config.data.hop_length, + config.data.win_length, + config.data.mel_fmin, + config.data.mel_fmax, + ) + + lr = optim_g.param_groups[0]["lr"] + + scalar_dict = { + "loss/g/total": loss_gen_all, + "loss/d/adv": loss_disc, + "learning_rate": lr, + "grad/norm_d": grad_norm_d, + "grad/norm_g": grad_norm_g, + "loss/g/adv": loss_gen, + "loss/g/fm": loss_fm, + "loss/g/mel": loss_mel, + "loss/g/kl": loss_kl, + } + + image_dict = { + "slice/mel_org": plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), + "slice/mel_gen": plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), + "all/mel": plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), + } + overtrain_info = "" + # Print training progress + lowest_g_value_rounded = float(lowest_g_value["value"]) + lowest_g_value_rounded = round(lowest_g_value_rounded, 3) + + record = 
f"{model_name} | epoch={epoch} | {epoch_recorder.record()}" + record += ( + f" | best avg-gen-loss={lowest_g_value_rounded:.3f} (epoch" + f" {lowest_g_value['epoch']})" + ) + # Check overtraining + if overtraining_detector: + overtrain_info = ( + f"Average epoch generator loss {avg_global_gen_loss:.3f} and" + f" discriminator loss {avg_global_disc_loss:.3f}" + ) + + remaining_epochs_gen = max( + overtraining_threshold - consecutive_increases_gen, + 0, + ) + remaining_epochs_disc = max( + overtraining_threshold * 2 - consecutive_increases_disc, + 0, + ) + record += ( + " | overtrain countdown: g=" + f"{remaining_epochs_gen},d={remaining_epochs_disc} |" + f" avg-gen-loss={avg_global_gen_loss:.3f} | avg-" + f"disc-loss={avg_global_disc_loss:.3f}" + ) + + if remaining_epochs_disc == 0 or remaining_epochs_gen == 0: + record += ( + f"\nOvertraining detected at epoch {epoch} with average" + f" generator loss {avg_global_gen_loss:.3f} and discriminator loss" + f" {avg_global_disc_loss:.3f}" + ) + done = True + print(record) + + # Save weights, checkpoints and reference inference results + # every N epochs + if epoch % save_every_epoch == 0: + with ( + torch.amp.autocast( + device_type="cuda", enabled=use_amp, dtype=train_dtype + ), + torch.no_grad(), + ): + if hasattr(net_g, "module"): + o, *_ = net_g.module.infer(*reference) + else: + o, *_ = net_g.infer(*reference) + audio_dict = {f"gen/audio_{global_step:07d}": o[0, :, :]} + summarize( + writer=writer, + global_step=global_step, + images=image_dict, + scalars=scalar_dict, + audios=audio_dict, + audio_sample_rate=config.data.sample_rate, + ) + checkpoint_idxs.append(2333333) + if not save_only_latest: + checkpoint_idxs.append(epoch) + + if custom_save_every_weights: + model_add.append( + os.path.join(experiment_dir, f"{model_name}_{epoch}.pth"), + ) + else: + summarize( + writer=writer, + global_step=global_step, + images=image_dict, + scalars=scalar_dict, + ) + for idx in checkpoint_idxs: + save_checkpoint( + net_g, + 
def cleanup_training_processes(experiment_dir) -> None:
    """
    Tear down distributed training and clear the recorded worker PIDs.

    Destroys the default ``torch.distributed`` process group when one is
    initialized, then rewrites ``config.json`` inside ``experiment_dir``
    without its ``"process_pids"`` entry. All other keys are preserved.

    Args:
        experiment_dir (str): Directory that contains the experiment's
            ``config.json`` file.

    Raises:
        OSError: If ``config.json`` cannot be read or written.
        json.JSONDecodeError: If ``config.json`` does not contain valid JSON.

    """
    # Guarded so that the cleanup can also run when no process group exists
    # (for example after it has already been destroyed).
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()

    config_path = pathlib.Path(os.path.join(experiment_dir, "config.json"))
    with config_path.open() as config_file:
        config_data = json.load(config_file)

    config_data.pop("process_pids", None)

    # Serialize before reopening the file for writing so that a failure while
    # encoding can never leave a truncated config.json behind.
    serialized = json.dumps(config_data, indent=4)
    with config_path.open("w") as config_file:
        config_file.write(serialized)
lazy + +import glob +import logging +import os +import pathlib +from collections import OrderedDict + +import matplotlib.pyplot as plt +import numpy as np + +import torch + +import soundfile as sf + +if TYPE_CHECKING: + import static_sox.run as static_sox_run +else: + static_sox_run = lazy.load("static_sox.run") + +logger = logging.getLogger(__name__) + +MATPLOTLIB_FLAG = False + + +def remove_from_ld_preload(prefix: str) -> None: + """ + Remove entries from the LD_PRELOAD environment variable that start + with the given prefix. + + Parameters + ---------- + prefix : str + The prefix to match entries in LD_PRELOAD. + + """ + # Get the current LD_PRELOAD value + ld_preload = os.environ.get("LD_PRELOAD", "") + + # Split the LD_PRELOAD into a list of entries + preload_entries = ld_preload.split(os.pathsep) + + # Remove the entries that start with the given prefix + preload_entries = [ + entry for entry in preload_entries if not entry.startswith(prefix) + ] + + # Join the list back into a string and update LD_PRELOAD + os.environ["LD_PRELOAD"] = os.pathsep.join(preload_entries) + + +def remove_sox_libmso6_from_ld_preload() -> None: + """ + Remove the sox `libm.so.6` library from the `LD_PRELOAD` environment + variable. + + On ubuntu 24.04 the static_sox module does not work with + multiprocessing using the spawn method due to a "version + GLIBC_2.38 not found" error. This function fixes that by + removing the path to the `libm.so.6` library from the `LD_PRELOAD` + environment variable. + + """ + sox_exe = static_sox_run.get_or_fetch_platform_executables_else_raise() + remove_from_ld_preload(os.path.join(os.path.dirname(sox_exe), "libm.so.6")) + + +def replace_keys_in_dict(d, old_key_part, new_key_part): + """ + Recursively replace parts of the keys in a dictionary. + + Args: + d (dict or OrderedDict): The dictionary to update. + old_key_part (str): The part of the key to replace. + new_key_part (str): The new part of the key. 
+ + """ + updated_dict = OrderedDict() if isinstance(d, OrderedDict) else {} + for key, value in d.items(): + new_key = ( + key.replace(old_key_part, new_key_part) if isinstance(key, str) else key + ) + updated_dict[new_key] = ( + replace_keys_in_dict(value, old_key_part, new_key_part) + if isinstance(value, dict) + else value + ) + return updated_dict + + +def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1): + """ + Load a checkpoint into a model and optionally the optimizer. + + Args: + checkpoint_path (str): Path to the checkpoint file. + model (torch.nn.Module): The model to load the checkpoint into. + optimizer (torch.optim.Optimizer, optional): The optimizer to load the state from. Defaults to None. + load_opt (int, optional): Whether to load the optimizer state. Defaults to 1. + + """ + assert pathlib.Path( + checkpoint_path + ).is_file(), f"Checkpoint file not found: {checkpoint_path}" + + checkpoint_dict = torch.load( + checkpoint_path, + map_location="cpu", + weights_only=False, + ) + checkpoint_dict = replace_keys_in_dict( + replace_keys_in_dict( + checkpoint_dict, + ".weight_v", + ".parametrizations.weight.original1", + ), + ".weight_g", + ".parametrizations.weight.original0", + ) + + # Update model state_dict + model_state_dict = ( + model.module.state_dict() if hasattr(model, "module") else model.state_dict() + ) + new_state_dict = { + k: checkpoint_dict["model"].get(k, v) for k, v in model_state_dict.items() + } + + # Load state_dict into model + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict, strict=False) + else: + model.load_state_dict(new_state_dict, strict=False) + + if optimizer and load_opt == 1: + optimizer.load_state_dict(checkpoint_dict.get("optimizer", {})) + + logger.info( + "Loaded checkpoint '%s' (epoch %d)", + checkpoint_path, + checkpoint_dict["iteration"], + ) + return ( + model, + optimizer, + checkpoint_dict.get("learning_rate", 0), + checkpoint_dict["iteration"], + 
def save_checkpoint(
    model,
    optimizer,
    learning_rate,
    iteration,
    lowest_value,
    consecutive_increases,
    checkpoint_path,
    scaler,
):
    """
    Save the model and optimizer state to a checkpoint file.

    Args:
        model (torch.nn.Module): The model to save. If it exposes a
            ``module`` attribute, the state dict of that inner module is saved.
        optimizer (torch.optim.Optimizer): The optimizer to save the state of.
        learning_rate (float): The current learning rate.
        iteration (int): The current iteration (logged as the epoch).
        lowest_value: Stored under the ``"lowest_value"`` key.
        consecutive_increases: Stored under the ``"consecutive_increases"`` key.
        checkpoint_path (str): The path to save the checkpoint to.
        scaler: Object whose ``state_dict()`` is stored under the ``"scaler"`` key.

    """
    state_dict = (
        model.module.state_dict() if hasattr(model, "module") else model.state_dict()
    )
    checkpoint_data = {
        "model": state_dict,
        "iteration": iteration,
        "lowest_value": lowest_value,
        "consecutive_increases": consecutive_increases,
        "optimizer": optimizer.state_dict(),
        "learning_rate": learning_rate,
        "scaler": scaler.state_dict(),
    }

    # Create a backwards-compatible checkpoint: parametrization-style key
    # parts are renamed back to the ".weight_v" / ".weight_g" spelling.
    checkpoint_data = replace_keys_in_dict(
        replace_keys_in_dict(
            checkpoint_data,
            ".parametrizations.weight.original1",
            ".weight_v",
        ),
        ".parametrizations.weight.original0",
        ".weight_g",
    )
    torch.save(checkpoint_data, checkpoint_path)
    logger.info("Saved model '%s' (epoch %d)", checkpoint_path, iteration)


def summarize(
    writer,
    global_step,
    scalars=None,
    histograms=None,
    images=None,
    audios=None,
    audio_sample_rate=22050,
):
    """
    Log various summaries to a TensorBoard writer.

    Args:
        writer (SummaryWriter): The TensorBoard writer.
        global_step (int): The current global step.
        scalars (dict, optional): Dictionary of scalar values to log.
        histograms (dict, optional): Dictionary of histogram values to log.
        images (dict, optional): Dictionary of image values to log. Images
            are written with ``dataformats="HWC"``.
        audios (dict, optional): Dictionary of audio values to log.
        audio_sample_rate (int, optional): Sampling rate of the audio data.

    """
    # ``None`` defaults replace the former ``{}`` defaults so that no mutable
    # object is shared between calls; a missing group simply logs nothing.
    for k, v in (scalars or {}).items():
        writer.add_scalar(k, v, global_step)
    for k, v in (histograms or {}).items():
        writer.add_histogram(k, v, global_step)
    for k, v in (images or {}).items():
        writer.add_image(k, v, global_step, dataformats="HWC")
    for k, v in (audios or {}).items():
        writer.add_audio(k, v, global_step, audio_sample_rate)


def latest_checkpoint_path(dir_path, regex="G_*.pth"):
    """
    Get the latest checkpoint file in a directory.

    "Latest" means the match whose file name contains the largest number,
    where the number is formed from all decimal digits of the file name.

    Args:
        dir_path (str): The directory to search for checkpoints.
        regex (str, optional): Despite its name, a ``glob`` pattern (not a
            regular expression) used to match checkpoint files.

    Returns:
        str or None: Path of the highest-numbered match, or ``None`` when
        nothing matches.

    """

    def _checkpoint_index(path):
        # Only the file name is inspected so digits in parent directories
        # cannot leak into the key; names without digits sort first
        # instead of raising ValueError.
        digits = "".join(filter(str.isdecimal, os.path.basename(path)))
        return int(digits) if digits else -1

    checkpoints = sorted(
        glob.glob(os.path.join(dir_path, regex)),
        key=_checkpoint_index,
    )
    return checkpoints[-1] if checkpoints else None


def plot_spectrogram_to_numpy(spectrogram):
    """
    Convert a spectrogram to a NumPy array for visualization.

    Args:
        spectrogram (numpy.ndarray): The spectrogram to plot.

    Returns:
        numpy.ndarray: The rendered figure as a (height, width, 3) RGB array.

    """
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        # Switch to the non-interactive Agg backend once per process.
        plt.switch_backend("Agg")
        MATPLOTLIB_FLAG = True

    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
    plt.colorbar(im, ax=ax)
    plt.xlabel("Frames")
    plt.ylabel("Channels")
    plt.tight_layout()

    fig.canvas.draw()
    # Drop the alpha channel of the rendered RGBA buffer.
    data = np.asarray(fig.canvas.renderer.buffer_rgba())[:, :, :3]
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close(fig)
    return data


def load_wav_to_torch(full_path):
    """
    Load a WAV file into a PyTorch tensor.

    Args:
        full_path (str): The path to the WAV file.

    Returns:
        tuple: ``(audio, sample_rate)`` where ``audio`` is a
        ``torch.FloatTensor`` built from the samples read as float32.

    """
    data, sample_rate = sf.read(full_path, dtype="float32")
    return torch.FloatTensor(data), sample_rate
+ split (str, optional): The delimiter used to split the lines. + + """ + with pathlib.Path(filename).open(encoding="utf-8") as f: + return [line.strip().split(split) for line in f] + + +class HParams: + """ + A class for storing and accessing hyperparameters. + """ + + def __init__(self, **kwargs): + for k, v in kwargs.items(): + self[k] = HParams(**v) if isinstance(v, dict) else v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return self.__dict__[key] + + def __setitem__(self, key, value): + self.__dict__[key] = value + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return repr(self.__dict__) diff --git a/rvc_logic/typing_extra.py b/rvc_logic/typing_extra.py new file mode 100644 index 0000000000000000000000000000000000000000..865d6907f52394a42a89d5d5a9165b6d6a016f5c --- /dev/null +++ b/rvc_logic/typing_extra.py @@ -0,0 +1,154 @@ +"""Extra typing for the Ultimate RVC project.""" + +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from enum import IntEnum, StrEnum +from os import PathLike + +type StrPath = str | PathLike[str] + +type Json = Mapping[str, Json] | Sequence[Json] | str | int | float | bool | None + + +class SeparationModel(StrEnum): + """Enumeration of audio separation models.""" + + UVR_MDX_NET_VOC_FT = "UVR-MDX-NET-Voc_FT.onnx" + UVR_MDX_NET_KARA_2 = "UVR_MDXNET_KARA_2.onnx" + REVERB_HQ_BY_FOXJOY = "Reverb_HQ_By_FoxJoy.onnx" + + +class SegmentSize(IntEnum): + """Enumeration of segment sizes for audio separation.""" + + SEG_64 = 64 + SEG_128 = 128 + SEG_256 = 256 + SEG_512 = 512 + SEG_1024 = 1024 + SEG_2048 = 2048 + + +class F0Method(StrEnum): + """Enumeration of pitch extraction methods.""" + + RMVPE = "rmvpe" + CREPE = "crepe" + CREPE_TINY = "crepe-tiny" + FCPE = "fcpe" + + +class 
EmbedderModel(StrEnum): + """Enumeration of audio embedding models.""" + + CONTENTVEC = "contentvec" + SPIN = "spin" + SPIN_V2 = "spin-v2" + CHINESE_HUBERT_BASE = "chinese-hubert-base" + JAPANESE_HUBERT_BASE = "japanese-hubert-base" + KOREAN_HUBERT_BASE = "korean-hubert-base" + CUSTOM = "custom" + + +class RVCContentType(StrEnum): + """Enumeration of valid content to convert with RVC.""" + + VOCALS = "vocals" + VOICE = "voice" + SPEECH = "speech" + AUDIO = "audio" + + +class SampleRate(IntEnum): + """Enumeration of supported audio sample rates.""" + + HZ_16K = 16000 + HZ_44K = 44100 + HZ_48K = 48000 + HZ_96K = 96000 + HZ_192K = 192000 + + +class AudioExt(StrEnum): + """Enumeration of supported audio file formats.""" + + MP3 = "mp3" + WAV = "wav" + FLAC = "flac" + OGG = "ogg" + M4A = "m4a" + AAC = "aac" + + +class DeviceType(StrEnum): + """Enumeration of device types for training voice models.""" + + AUTOMATIC = "Automatic" + CPU = "CPU" + GPU = "GPU" + + +class PrecisionType(StrEnum): + """Enumeration of precision types for training voice models.""" + + FP32 = "fp32" + FP16 = "fp16" + BF16 = "bf16" + + +class TrainingSampleRate(IntEnum): + """Enumeration of sample rates for training voice models.""" + + HZ_32K = 32000 + HZ_40K = 40000 + HZ_48K = 48000 + + +class AudioSplitMethod(StrEnum): + """ + Enumeration of methods to use for splitting audio files during + dataset preprocessing. + """ + + SKIP = "Skip" + SIMPLE = "Simple" + AUTOMATIC = "Automatic" + + +class AudioNormalizationMode(StrEnum): + """ + Enumeration of audio normalization methods during + dataset preprocessing. 
+ """ + + NONE = "none" + PRE = "pre" + POST = "post" + + +class Vocoder(StrEnum): + """Enumeration of vocoders for training voice models.""" + + HIFI_GAN = "HiFi-GAN" + MRF_HIFI_GAN = "MRF HiFi-GAN" + REFINE_GAN = "RefineGAN" + + +class IndexAlgorithm(StrEnum): + """Enumeration of indexing algorithms for training voice models.""" + + AUTO = "Auto" + FAISS = "Faiss" + KMEANS = "KMeans" + + +class PretrainedType(StrEnum): + """ + Enumeration of the possible types of pretrained models to finetune + voice models on. + """ + + NONE = "None" + DEFAULT = "Default" + CUSTOM = "Custom"