assistanttttttt commited on
Commit
d8651ae
·
1 Parent(s): cc77887

Fix build error: use local rvc_logic (cleaned)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +6 -6
  2. pipeline/rvc_training.py +14 -16
  3. requirements.txt +2 -2
  4. rvc_logic/common.py +37 -0
  5. rvc_logic/core_train/__init__.py +4 -0
  6. rvc_logic/core_train/common.py +105 -0
  7. rvc_logic/core_train/extract.py +146 -0
  8. rvc_logic/core_train/prepare.py +205 -0
  9. rvc_logic/core_train/train.py +369 -0
  10. rvc_logic/core_train/typing_extra.py +43 -0
  11. rvc_logic/rvc/__init__.py +4 -0
  12. rvc_logic/rvc/common.py +9 -0
  13. rvc_logic/rvc/configs/32000.json +75 -0
  14. rvc_logic/rvc/configs/40000.json +75 -0
  15. rvc_logic/rvc/configs/48000.json +75 -0
  16. rvc_logic/rvc/configs/config.py +105 -0
  17. rvc_logic/rvc/infer/infer.py +528 -0
  18. rvc_logic/rvc/infer/pipeline.py +581 -0
  19. rvc_logic/rvc/infer/typing_extra.py +58 -0
  20. rvc_logic/rvc/lib/algorithm/__init__.py +0 -0
  21. rvc_logic/rvc/lib/algorithm/attentions.py +258 -0
  22. rvc_logic/rvc/lib/algorithm/commons.py +151 -0
  23. rvc_logic/rvc/lib/algorithm/discriminators.py +267 -0
  24. rvc_logic/rvc/lib/algorithm/encoders.py +228 -0
  25. rvc_logic/rvc/lib/algorithm/generators/__init__.py +0 -0
  26. rvc_logic/rvc/lib/algorithm/generators/hifigan.py +249 -0
  27. rvc_logic/rvc/lib/algorithm/generators/hifigan_mrf.py +411 -0
  28. rvc_logic/rvc/lib/algorithm/generators/hifigan_nsf.py +258 -0
  29. rvc_logic/rvc/lib/algorithm/generators/refinegan.py +462 -0
  30. rvc_logic/rvc/lib/algorithm/modules.py +120 -0
  31. rvc_logic/rvc/lib/algorithm/normalization.py +31 -0
  32. rvc_logic/rvc/lib/algorithm/residuals.py +271 -0
  33. rvc_logic/rvc/lib/algorithm/synthesizers.py +251 -0
  34. rvc_logic/rvc/lib/predictors/F0Extractor.py +111 -0
  35. rvc_logic/rvc/lib/predictors/FCPE.py +965 -0
  36. rvc_logic/rvc/lib/predictors/RMVPE.py +604 -0
  37. rvc_logic/rvc/lib/predictors/f0.py +92 -0
  38. rvc_logic/rvc/lib/tools/analyzer.py +77 -0
  39. rvc_logic/rvc/lib/tools/gdown.py +307 -0
  40. rvc_logic/rvc/lib/tools/launch_tensorboard.py +23 -0
  41. rvc_logic/rvc/lib/tools/model_download.py +238 -0
  42. rvc_logic/rvc/lib/tools/prerequisites_download.py +198 -0
  43. rvc_logic/rvc/lib/tools/pretrained_selector.py +16 -0
  44. rvc_logic/rvc/lib/tools/split_audio.py +89 -0
  45. rvc_logic/rvc/lib/tools/tts.py +30 -0
  46. rvc_logic/rvc/lib/tools/tts_voices.json +0 -0
  47. rvc_logic/rvc/lib/utils.py +194 -0
  48. rvc_logic/rvc/lib/zluda.py +85 -0
  49. rvc_logic/rvc/train/anyprecision_optimizer.py +185 -0
  50. rvc_logic/rvc/train/data_utils.py +396 -0
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: purple
5
  colorTo: blue
6
  sdk: gradio
7
  sdk_version: 5.12.0
8
- python_version: "3.12"
9
  app_file: app.py
10
  pinned: false
11
  license: mit
@@ -25,17 +25,17 @@ Outil web de **clonage vocal zero-shot** basé sur **Seed-VC** (Diffusion Transf
25
 
26
  ## Fonctionnalités
27
 
28
- 1. **Référence vocale** : Uploadez un court extrait de votre voix (3-30 sec) — pas d'entraînement nécessaire
29
- 2. **Séparation audio** : Séparation automatique voix/instruments via Demucs (Meta AI)
30
  3. **Conversion vocale** : Remplacement de la voix originale par la vôtre (Seed-VC zero-shot)
31
- 4. **Mixage final** : Remixage automatique voix convertie + instruments originaux
32
  5. **Export** : Téléchargement du résultat en WAV 44.1kHz 16-bit
33
 
34
  ## Comment utiliser
35
 
36
- ### Étape 1 : Enregistrer votre référence vocale
37
  1. Onglet **"Ma voix"**
38
- 2. Uploadez un extrait de votre voix (WAV ou MP3, 3 à 30 secondes)
39
  3. Donnez un nom (ex: `ma_voix`)
40
  4. Cliquez **"Sauvegarder"**
41
 
 
5
  colorTo: blue
6
  sdk: gradio
7
  sdk_version: 5.12.0
8
+ python_version: "3.10"
9
  app_file: app.py
10
  pinned: false
11
  license: mit
 
25
 
26
  ## Fonctionnalités
27
 
28
+ 1. **Référence vocale** : Uploadez un court extrait de votre voix (3-30 sec) — pas d'entraînement nécessaire
29
+ 2. **Séparation audio** : Séparation automatique voix/instruments via Demucs (Meta AI)
30
  3. **Conversion vocale** : Remplacement de la voix originale par la vôtre (Seed-VC zero-shot)
31
+ 4. **Mixage final** : Remixage automatique voix convertie + instruments originaux
32
  5. **Export** : Téléchargement du résultat en WAV 44.1kHz 16-bit
33
 
34
  ## Comment utiliser
35
 
36
+ ### Étape 1 : Enregistrer votre référence vocale
37
  1. Onglet **"Ma voix"**
38
+ 2. Uploadez un extrait de votre voix (WAV ou MP3, 3 à 30 secondes)
39
  3. Donnez un nom (ex: `ma_voix`)
40
  4. Cliquez **"Sauvegarder"**
41
 
pipeline/rvc_training.py CHANGED
@@ -17,25 +17,24 @@ except ImportError:
17
  return fn
18
  return decorator
19
 
20
- # Configuration for Ultimate-RVC paths
21
- # We set these environment variables BEFORE importing ultimate_rvc to ensure it uses our paths
22
  os.environ["URVC_MODELS_DIR"] = os.path.abspath("rvc_models")
23
  os.environ["URVC_AUDIO_DIR"] = os.path.abspath("rvc_audio")
24
  os.environ["URVC_TEMP_DIR"] = os.path.abspath("rvc_temp")
25
 
26
- # Now we can import the core functions from ultimate_rvc
27
  try:
28
- from ultimate_rvc.core.train import prepare, extract, train
29
- from ultimate_rvc.typing_extra import TrainingSampleRate, F0Method, EmbedderModel
30
- ULTIMATE_RVC_AVAILABLE = True
31
  except ImportError as e:
32
- logger.error(f"Failed to import ultimate_rvc: {e}")
33
- ULTIMATE_RVC_AVAILABLE = False
34
 
35
- @spaces.GPU(duration=1000) # Training takes time, let's request more
36
  def train_rvc_model(audio_path, model_name, epochs=100, progress=None):
37
- if not ULTIMATE_RVC_AVAILABLE:
38
- return "Error: ultimate-rvc library not installed correctly.", None
39
 
40
  if not audio_path:
41
  return "Error: Please upload an audio file.", None
@@ -71,13 +70,12 @@ def train_rvc_model(audio_path, model_name, epochs=100, progress=None):
71
  )
72
 
73
  # 4. Train
74
- p(0.6, f"Step 4/4: Training for {epochs} epochs (this may take several minutes)...")
75
- # ultimate-rvc's run_training returns [pth_path, index_path]
76
  result_paths = train.run_training(
77
  model_name=model_name,
78
  num_epochs=epochs,
79
- batch_size=4, # Safe for ZeroGPU
80
- save_interval=epochs # Only save at the end
81
  )
82
 
83
  if not result_paths or len(result_paths) < 2:
@@ -107,7 +105,7 @@ def train_rvc_model(audio_path, model_name, epochs=100, progress=None):
107
  )
108
  except Exception as e:
109
  logger.error(f"Upload to dataset failed: {e}")
110
- return f"Model trained but failed to upload to HF: {e}. Files are at {pth_path}", pth_path
111
 
112
  return f"Successfully trained and uploaded to dataset {repo_id}!", pth_path
113
 
 
17
  return fn
18
  return decorator
19
 
20
+ # Configuration for paths
 
21
  os.environ["URVC_MODELS_DIR"] = os.path.abspath("rvc_models")
22
  os.environ["URVC_AUDIO_DIR"] = os.path.abspath("rvc_audio")
23
  os.environ["URVC_TEMP_DIR"] = os.path.abspath("rvc_temp")
24
 
25
+ # Import the core functions from our LOCAL rvc_logic
26
  try:
27
+ from rvc_logic.core_train import prepare, extract, train
28
+ from rvc_logic.typing_extra import TrainingSampleRate, F0Method, EmbedderModel
29
+ RVC_LOGIC_AVAILABLE = True
30
  except ImportError as e:
31
+ logger.error(f"Failed to import rvc_logic: {e}")
32
+ RVC_LOGIC_AVAILABLE = False
33
 
34
+ @spaces.GPU(duration=1000)
35
  def train_rvc_model(audio_path, model_name, epochs=100, progress=None):
36
+ if not RVC_LOGIC_AVAILABLE:
37
+ return "Error: rvc_logic module not found in the project.", None
38
 
39
  if not audio_path:
40
  return "Error: Please upload an audio file.", None
 
70
  )
71
 
72
  # 4. Train
73
+ p(0.6, f"Step 4/4: Training for {epochs} epochs...")
 
74
  result_paths = train.run_training(
75
  model_name=model_name,
76
  num_epochs=epochs,
77
+ batch_size=4,
78
+ save_interval=epochs
79
  )
80
 
81
  if not result_paths or len(result_paths) < 2:
 
105
  )
106
  except Exception as e:
107
  logger.error(f"Upload to dataset failed: {e}")
108
+ return f"Model trained but failed to upload to HF: {e}", pth_path
109
 
110
  return f"Successfully trained and uploaded to dataset {repo_id}!", pth_path
111
 
requirements.txt CHANGED
@@ -33,9 +33,9 @@ bigvgan
33
  descript-audio-codec
34
  vocos
35
 
36
- # RVC Training via Ultimate-RVC
37
- ultimate-rvc==0.6.0
38
  torchcrepe
39
  torchfcpe
 
40
  tensorboardX
41
  wget
 
33
  descript-audio-codec
34
  vocos
35
 
36
+ # RVC Training dependencies (No fairseq, using transformers/faiss/crepe)
 
37
  torchcrepe
38
  torchfcpe
39
+ faiss-cpu
40
  tensorboardX
41
  wget
rvc_logic/common.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Common variables used in the Ultimate RVC project."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+
9
# Root locations, resolved once at import time from the current working
# directory and the prefix of the running interpreter.
BASE_DIR = Path.cwd()
VENV_DIR = Path(sys.prefix)

# Model storage. Each ``URVC_*`` environment variable, when set to a
# non-empty value, overrides the corresponding default under BASE_DIR.
MODELS_DIR = Path(os.getenv("URVC_MODELS_DIR") or BASE_DIR / "models")
RVC_MODELS_DIR = MODELS_DIR / "rvc"
VOICE_MODELS_DIR = Path(
    os.getenv("URVC_VOICE_MODELS_DIR") or RVC_MODELS_DIR / "voice_models",
)
EMBEDDER_MODELS_DIR = RVC_MODELS_DIR / "embedders"
CUSTOM_EMBEDDER_MODELS_DIR = EMBEDDER_MODELS_DIR / "custom"
PRETRAINED_MODELS_DIR = RVC_MODELS_DIR / "pretraineds"
CUSTOM_PRETRAINED_MODELS_DIR = PRETRAINED_MODELS_DIR / "custom"

SEPARATOR_MODELS_DIR = MODELS_DIR / "audio_separator"
TRAINING_MODELS_DIR = RVC_MODELS_DIR / "training"

# Working directories for audio, temporary files and configuration.
AUDIO_DIR = Path(os.getenv("URVC_AUDIO_DIR") or BASE_DIR / "audio")
TEMP_DIR = Path(os.getenv("URVC_TEMP_DIR") or BASE_DIR / "temp")
CONFIG_DIR = Path(os.getenv("URVC_CONFIG_DIR") or BASE_DIR / "config")

# Platform-specific location of the node binary inside the environment.
_DEFAULT_NODE_PATH = (
    VENV_DIR
    / f"lib/python{sys.version_info.major}.{sys.version_info.minor}"
    / "site-packages/nodejs_wheel/bin/node"
    if sys.platform == "linux"
    else VENV_DIR / "Lib/site-packages/nodejs_wheel/node.exe"
)
# NOTE the override is applied outside the conditional expression on
# purpose: ``a or b if c else d`` parses as ``(a or b) if c else d``, which
# previously caused GRADIO_NODE_PATH to be ignored on non-Linux platforms.
NODE_PATH = Path(os.getenv("GRADIO_NODE_PATH") or _DEFAULT_NODE_PATH)
rvc_logic/core_train/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """
2
+ Package which exposes definitions facilitating the training of voice
3
+ conversion models.
4
+ """
rvc_logic/core_train/common.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Common definitions for modules in the Ultimate RVC project that
3
+ facilitate training voice models.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Literal
9
+
10
+ from rvc_logic.core.exceptions import (
11
+ Entity,
12
+ GPUNotFoundError,
13
+ NotProvidedError,
14
+ UIMessage,
15
+ )
16
+ from rvc_logic.typing_extra import DeviceType
17
+
18
+
19
def get_gpu_info() -> list[tuple[str, int]]:
    """
    List the CUDA devices that torch reports on this machine.

    Returns
    -------
    list[tuple[str, int]]
        One ``(label, index)`` tuple per reported GPU. The label is the
        device name followed by its total memory in GB; the index is
        the CUDA device index.

    """
    # NOTE torch is imported inside the function because lazy importing
    # does not work with it
    import torch  # noqa: PLC0415

    device_count = torch.cuda.device_count()
    if device_count == 0 and not torch.cuda.is_available():
        return []

    bytes_per_gb = 1024**3
    labelled_devices: list[tuple[str, int]] = []
    for index in range(device_count):
        device_name = torch.cuda.get_device_name(index)
        total_bytes = torch.cuda.get_device_properties(index).total_memory  # type: ignore[ReportUnknownMembershipType]
        # 0.4 is added before truncating so that sizes slightly below a
        # whole number of GB are still reported as that number.
        size_gb = int(total_bytes / bytes_per_gb + 0.4)
        labelled_devices.append((f"{device_name} ({size_gb} GB)", index))
    return labelled_devices
45
+
46
+
47
+ def validate_devices(
48
+ device_type: DeviceType,
49
+ device_ids: set[int] | None = None,
50
+ ) -> tuple[Literal["cuda", "cpu"], set[int] | None]:
51
+ """
52
+ Validate the devices identified by the provided device type and
53
+ device IDs.
54
+
55
+ If the provided device type is AUTOMATIC, the first available GPU
56
+ will be selected if available. Otherwise CPU will be selected.
57
+ If the device type is GPU, then validation will be performed to
58
+ ensure that at least one device ID is provided and that all device
59
+ IDs point to available GPUs. If the device type is CPU, then no
60
+ validation is performed.
61
+
62
+ Parameters
63
+ ----------
64
+ device_type : DeviceType
65
+ The type of devices to validate.
66
+ device_ids : set[int], optional
67
+ The IDs of the devices to validate when device type is GPU.
68
+
69
+ Returns
70
+ -------
71
+ device_type : str
72
+ The type of the selected devices.
73
+ device_ids : set[int], optional
74
+ The ids of the selected devices. Only returned when the
75
+ device type is GPU or AUTOMATIC.
76
+
77
+ Raises
78
+ ------
79
+ NotProvidedError
80
+ If device type is GPU and no device IDs are provided.
81
+ GPUNotFoundError
82
+ If device type is GPU and a provided device ID does not point
83
+ to an available GPU.
84
+
85
+
86
+ """
87
+ match device_type:
88
+ case DeviceType.AUTOMATIC:
89
+ gpu_info = get_gpu_info()
90
+ if gpu_info:
91
+ return "cuda", {gpu_info[0][1]}
92
+ return "cpu", None
93
+ case DeviceType.GPU:
94
+ if not device_ids:
95
+ raise NotProvidedError(Entity.GPU_IDS, UIMessage.NO_GPUS)
96
+ validated_devices: list[int] = []
97
+ available_ids = {i for _, i in get_gpu_info()}
98
+ for device_id in device_ids:
99
+ if device_id not in available_ids:
100
+ raise GPUNotFoundError(device_id)
101
+ validated_devices.append(device_id)
102
+ return "cuda", set(validated_devices)
103
+ case DeviceType.CPU:
104
+ return "cpu", None
105
+ None
rvc_logic/core_train/extract.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Module which exposes functionality for extracting training features from
3
+ audio datasets.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from multiprocessing import cpu_count
9
+
10
+ from rvc_logic.core.common import (
11
+ display_progress,
12
+ get_combined_file_hash,
13
+ validate_model,
14
+ )
15
+ from rvc_logic.core.exceptions import (
16
+ Entity,
17
+ ModelAsssociatedEntityNotFoundError,
18
+ Step,
19
+ )
20
+ from rvc_logic.core.train.common import validate_devices
21
+ from rvc_logic.typing_extra import (
22
+ DeviceType,
23
+ EmbedderModel,
24
+ F0Method,
25
+ )
26
+
27
+
28
def extract_features(
    model_name: str,
    f0_method: F0Method = F0Method.RMVPE,
    embedder_model: EmbedderModel = EmbedderModel.CONTENTVEC,
    custom_embedder_model: str | None = None,
    include_mutes: int = 2,
    cpu_cores: int = cpu_count(),
    hardware_acceleration: DeviceType = DeviceType.AUTOMATIC,
    gpu_ids: set[int] | None = None,
) -> None:
    """
    Extract features from the preprocessed dataset associated with a
    voice model to be trained.

    Parameters
    ----------
    model_name : str
        The name of the voice model to be trained.
    f0_method : F0Method, default=F0Method.RMVPE
        The method to use for extracting pitch features.
    embedder_model : EmbedderModel, default=EmbedderModel.CONTENTVEC
        The model to use for extracting audio embeddings.
    custom_embedder_model : str, optional
        The name of the custom embedder model to use for extracting
        audio embeddings.
    include_mutes : int, default=2
        The number of mute audio files to include in the generated
        training file list. Adding silent files enables the voice model
        to handle pure silence in inferred audio files. If the
        preprocessed audio dataset already contains segments of pure
        silence, set this to 0.
    cpu_cores : int, default=cpu_count()
        The number of CPU cores to use for feature extraction.
    hardware_acceleration : DeviceType, default=DeviceType.AUTOMATIC
        The type of hardware acceleration to use for feature extraction.
        `AUTOMATIC` will select the first available GPU and fall back to
        CPU if no GPUs are available.
    gpu_ids : set[int], optional
        Set of ids of the GPUs to use for feature extraction when `GPU`
        is selected for hardware acceleration.

    Raises
    ------
    ModelAsssociatedEntityNotFoundError
        If no preprocessed dataset audio files are associated with the
        voice model identified by the provided name.

    """
    model_path = validate_model(model_name, Entity.TRAINING_MODEL)
    sliced_audios16k_path = model_path / "sliced_audios_16k"
    if not sliced_audios16k_path.is_dir() or not any(sliced_audios16k_path.iterdir()):
        raise ModelAsssociatedEntityNotFoundError(
            Entity.PREPROCESSED_AUDIO_DATASET_FILES,
            model_name,
            Step.DATASET_PREPROCESSING,
        )

    # For built-in embedders the enum member serves both as the model to
    # load and as its identifier; both are overridden below for a custom
    # embedder.
    custom_embedder_model_path, combined_file_hash = None, None
    chosen_embedder_model, embedder_model_id = [embedder_model] * 2
    if embedder_model == EmbedderModel.CUSTOM:
        custom_embedder_model_path = validate_model(
            custom_embedder_model,
            Entity.CUSTOM_EMBEDDER_MODEL,
        )
        json_file = custom_embedder_model_path / "config.json"
        bin_path = custom_embedder_model_path / "pytorch_model.bin"

        # The identifier of a custom embedder is derived from the hash
        # of its config and weight files.
        combined_file_hash = get_combined_file_hash([json_file, bin_path])
        chosen_embedder_model = str(custom_embedder_model_path)
        embedder_model_id = f"custom_{combined_file_hash}"

    device_type, device_ids = validate_devices(hardware_acceleration, gpu_ids)

    # One "type:id" entry per selected GPU, or the bare device type when
    # no ids were selected.
    devices = (
        [f"{device_type}:{device_id}" for device_id in device_ids]
        if device_ids
        else [device_type]
    )
    # NOTE The lazy_import function does not work with the package below
    # so we import it here manually
    from rvc_logic.rvc.train.extract import extract  # noqa: PLC0415

    file_infos = extract.initialize_extraction(
        str(model_path),
        f0_method,
        embedder_model_id,
    )
    extract.update_model_info(
        str(model_path),
        chosen_embedder_model,
        combined_file_hash,
    )
    display_progress("[~] Extracting pitch features...")
    extract.run_pitch_extraction(file_infos, devices, f0_method, cpu_cores)
    display_progress("[~] Extracting audio embeddings...")
    extract.run_embedding_extraction(
        file_infos,
        devices,
        embedder_model,
        (
            str(custom_embedder_model_path)
            if custom_embedder_model_path is not None
            else None
        ),
        cpu_cores,
    )
    # NOTE The lazy_import function does not work with the package below
    # so we import it here manually
    from rvc_logic.rvc.train.extract import preparing_files  # noqa: PLC0415

    preparing_files.generate_config(str(model_path))
    preparing_files.generate_filelist(
        str(model_path),
        include_mutes,
        f0_method,
        embedder_model_id,
    )
rvc_logic/core_train/prepare.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Module which exposes functionality for creating and preprocessing
3
+ datasets for training voice conversion models.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import TYPE_CHECKING
9
+
10
+ import lazy_loader as lazy
11
+
12
+ import shutil
13
+ from multiprocessing import cpu_count
14
+
15
+ from rvc_logic.common import TRAINING_MODELS_DIR
16
+ from rvc_logic.core.common import (
17
+ TRAINING_AUDIO_DIR,
18
+ validate_audio_dir_exists,
19
+ validate_audio_file_exists,
20
+ )
21
+ from rvc_logic.core.exceptions import (
22
+ Entity,
23
+ InvalidAudioFormatError,
24
+ NotProvidedError,
25
+ UIMessage,
26
+ )
27
+ from rvc_logic.typing_extra import (
28
+ AudioExt,
29
+ AudioNormalizationMode,
30
+ AudioSplitMethod,
31
+ TrainingSampleRate,
32
+ )
33
+
34
+ if TYPE_CHECKING:
35
+ from collections.abc import Sequence
36
+ from pathlib import Path
37
+
38
+ import static_ffmpeg
39
+
40
+ from rvc_logic.typing_extra import StrPath
41
+ else:
42
+ static_ffmpeg = lazy.load("static_ffmpeg")
43
+
44
+
45
def populate_dataset(name: str, audio_files: Sequence[StrPath]) -> Path:
    """
    Populate the dataset with the provided name with the provided audio
    files.

    If no dataset with the provided name exists, a new dataset with the
    provided name will be created. If any of audio files already exist
    in the dataset, they will be overwritten.

    Parameters
    ----------
    name : str
        The name of the dataset to populate. Surrounding whitespace is
        ignored.
    audio_files : list[StrPath]
        The audio files to populate the dataset with.

    Returns
    -------
    The path to the dataset with the provided name.

    Raises
    ------
    NotProvidedError
        If no dataset name (or a name consisting only of whitespace) or
        no audio files are provided.

    InvalidAudioFormatError
        If any of the provided audio files are not in a valid format,
        including files whose format cannot be determined.

    """
    # A whitespace-only name would otherwise resolve to the dataset root
    # directory itself, so it is treated as not provided.
    dataset_name = name.strip() if name else ""
    if not dataset_name:
        raise NotProvidedError(Entity.DATASET_NAME)

    if not audio_files:
        raise NotProvidedError(Entity.FILES, ui_msg=UIMessage.NO_UPLOADED_FILES)

    static_ffmpeg.add_paths(weak=True)

    import pydub.utils as pydub_utils  # noqa: PLC0415

    accepted_formats = {
        AudioExt.WAV,
        AudioExt.FLAC,
        AudioExt.MP3,
        AudioExt.OGG,
        AudioExt.AAC,
    }

    # Validate every file before copying anything so that a single
    # invalid file does not leave a partially populated dataset.
    audio_paths: list[Path] = []
    for audio_file in audio_files:
        audio_path = validate_audio_file_exists(audio_file, Entity.FILE)
        # The probe may yield no "format_name" entry when the file
        # cannot be read as audio; an empty format is rejected below.
        format_name = pydub_utils.mediainfo(str(audio_file)).get("format_name", "")
        # M4A is checked by substring rather than by equality.
        if format_name not in accepted_formats and AudioExt.M4A not in format_name:
            raise InvalidAudioFormatError(audio_path, [e.value for e in AudioExt])
        audio_paths.append(audio_path)

    dataset_path = TRAINING_AUDIO_DIR / dataset_name

    dataset_path.mkdir(parents=True, exist_ok=True)

    for audio_path in audio_paths:
        shutil.copyfile(audio_path, dataset_path / audio_path.name)

    return dataset_path
110
+
111
+
112
def preprocess_dataset(
    model_name: str,
    dataset: StrPath,
    sample_rate: TrainingSampleRate = TrainingSampleRate.HZ_40K,
    normalization_mode: AudioNormalizationMode = AudioNormalizationMode.POST,
    filter_audio: bool = True,
    clean_audio: bool = False,
    clean_strength: float = 0.7,
    split_method: AudioSplitMethod = AudioSplitMethod.AUTOMATIC,
    chunk_len: float = 3.0,
    overlap_len: float = 0.3,
    cpu_cores: int = cpu_count(),
) -> None:
    """
    Preprocess a dataset of audio files for training a voice model.

    Parameters
    ----------
    model_name : str
        The name of the voice model to train. If no voice model
        with the provided name exists for training, a new voice model
        for training will be created with the provided name. If a voice
        model with the provided name already exists for training, then
        its currently associated dataset will be replaced with the
        provided dataset. Surrounding whitespace is ignored.
    dataset : StrPath
        The path to the dataset to preprocess.
    sample_rate : TrainingSampleRate, default=TrainingSampleRate.HZ_40K
        The target sample rate for the audio files in the provided
        dataset.
    normalization_mode : AudioNormalizationMode, default=POST
        The audio normalization method to use for the audio files in
        the provided dataset.
    filter_audio : bool, default=True
        Whether to remove low-frequency sounds from the audio files in
        the provided dataset by applying a high-pass butterworth filter.
    clean_audio : bool, default=False
        Whether to clean the audio files in the provided dataset using
        noise reduction algorithms.
    clean_strength : float, default=0.7
        The intensity of the cleaning to apply to the audio files in the
        provided dataset.
    split_method : AudioSplitMethod, default=AudioSplitMethod.AUTOMATIC
        The method to use for splitting the audio files in the provided
        dataset. Use the `Skip` method to skip splitting if the audio
        files are already split. Use the `Simple` method if excessive
        silence has already been removed from the audio files.
        Use the `Automatic` method for automatic silence detection and
        splitting around it.
    chunk_len : float, default=3.0
        Length of split audio chunks when using the `Simple` split
        method.
    overlap_len : float, default=0.3
        Length of overlap between split audio chunks when using the
        `Simple` split method.
    cpu_cores : int, default=cpu_count()
        The number of CPU cores to use for preprocessing.

    Raises
    ------
    NotProvidedError
        If no model name (or a name consisting only of whitespace) or
        no dataset is provided.

    """
    # A whitespace-only name would otherwise resolve to the training
    # models root directory itself, so it is treated as not provided.
    stripped_model_name = model_name.strip() if model_name else ""
    if not stripped_model_name:
        raise NotProvidedError(Entity.MODEL_NAME)

    dataset_path = validate_audio_dir_exists(dataset, Entity.DATASET)

    model_path = TRAINING_MODELS_DIR / stripped_model_name
    model_path.mkdir(parents=True, exist_ok=True)

    # NOTE The lazy_import function does not work with the package below
    # so we import it here manually
    from rvc_logic.rvc.train.preprocess import (  # noqa: PLC0415
        preprocess as train_preprocess,
    )

    # Arguments are passed positionally in the order shown here, which
    # differs from the order of this function's own parameters.
    train_preprocess.preprocess_training_set(
        str(dataset_path),
        sample_rate,
        cpu_cores,
        str(model_path),
        split_method,
        filter_audio,
        clean_audio,
        clean_strength,
        chunk_len,
        overlap_len,
        normalization_mode,
    )
rvc_logic/core_train/train.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Module which exposes functionality for training voice conversion
3
+ models.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ import os
10
+ import re
11
+ import signal
12
+
13
+ from rvc_logic.common import PRETRAINED_MODELS_DIR
14
+ from rvc_logic.core.common import (
15
+ TRAINING_MODELS_DIR,
16
+ VOICE_MODELS_DIR,
17
+ copy_files_to_new_dir,
18
+ json_dump,
19
+ json_load,
20
+ validate_model,
21
+ )
22
+ from rvc_logic.core.exceptions import (
23
+ Entity,
24
+ ModelAsssociatedEntityNotFoundError,
25
+ ModelExistsError,
26
+ NotProvidedError,
27
+ PretrainedModelIncompatibleError,
28
+ PretrainedModelNotAvailableError,
29
+ Step,
30
+ )
31
+ from rvc_logic.core.train.common import validate_devices
32
+ from rvc_logic.core.train.typing_extra import ModelInfo, TrainingInfo
33
+ from rvc_logic.typing_extra import (
34
+ DeviceType,
35
+ IndexAlgorithm,
36
+ PrecisionType,
37
+ PretrainedType,
38
+ TrainingSampleRate,
39
+ Vocoder,
40
+ )
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+
45
def _get_pretrained_model(
    pretrained_type: PretrainedType,
    vocoder: Vocoder,
    sample_rate: TrainingSampleRate,
    custom_pretrained: str | None = None,
) -> tuple[str, str]:
    """
    Resolve the generator and discriminator weights to finetune from.

    Parameters
    ----------
    pretrained_type : PretrainedType
        Which kind of pretrained model to use: none, the default one
        for the given vocoder and sample rate, or a custom one.
    vocoder : str
        The vocoder used for audio synthesis during training. Its
        lower-cased value names the directory of default weights.
    sample_rate : int
        The sample rate of the preprocessed dataset associated with the
        voice model to be trained.
    custom_pretrained : str, optional
        The name of the custom pretrained model to use when the
        pretrained type is custom.

    Returns
    -------
    pg : str
        Path to the pretrained generator, or an empty string when no
        pretrained model is used.
    pd : str
        Path to the pretrained discriminator, or an empty string when
        no pretrained model is used.

    Raises
    ------
    ModelAsssociatedEntityNotFoundError
        If a custom pretrained model directory contains no generator
        or no discriminator file.
    PretrainedModelIncompatibleError
        If the sample rate encoded in the custom pretrained model name
        differs from the provided sample rate.
    PretrainedModelNotAvailableError
        If the default generator or discriminator file for the
        provided vocoder and sample rate does not exist.

    """
    if pretrained_type == PretrainedType.NONE:
        pg, pd = "", ""
    elif pretrained_type == PretrainedType.DEFAULT:
        # Default weight files are named after the first two digits of
        # the sample rate.
        rate_tag = str(sample_rate)[:2]
        weights_dir = PRETRAINED_MODELS_DIR / vocoder.lower()
        generator_file = weights_dir / f"f0G{rate_tag}k.pth"
        discriminator_file = weights_dir / f"f0D{rate_tag}k.pth"
        if not (generator_file.is_file() and discriminator_file.is_file()):
            raise PretrainedModelNotAvailableError(
                name=vocoder, sample_rate=sample_rate, download=False
            )
        pg, pd = str(generator_file), str(discriminator_file)
    elif pretrained_type == PretrainedType.CUSTOM:
        model_dir = validate_model(
            custom_pretrained,
            Entity.CUSTOM_PRETRAINED_MODEL,
        )
        # NOTE rebinding to the validated directory name appeases the
        # type checker
        custom_pretrained = model_dir.name

        # TODO need to make this cleaner
        # The sample rate is the last space-separated token of the name.
        declared_rate = int(custom_pretrained.split(" ")[-1])
        if declared_rate != sample_rate:
            raise PretrainedModelIncompatibleError(custom_pretrained, sample_rate)

        def _first_match(pattern: str) -> str | None:
            # Return the first directory entry whose name matches.
            for candidate in model_dir.iterdir():
                if re.match(pattern, candidate.name):
                    return str(candidate)
            return None

        pg = _first_match(r"^(G|f0G).*\.pth$|.*G\.pth$")
        if pg is None:
            raise ModelAsssociatedEntityNotFoundError(
                Entity.GENERATOR,
                custom_pretrained,
            )
        pd = _first_match(r"^(D|f0D).*\.pth$|.*D\.pth$")
        if pd is None:
            raise ModelAsssociatedEntityNotFoundError(
                Entity.DISCRIMINATOR,
                custom_pretrained,
            )

    return pg, pd
145
+
146
+
147
def run_training(
    model_name: str,
    num_epochs: int = 500,
    batch_size: int = 8,
    detect_overtraining: bool = False,
    overtraining_threshold: int = 50,
    vocoder: Vocoder = Vocoder.HIFI_GAN,
    index_algorithm: IndexAlgorithm = IndexAlgorithm.AUTO,
    pretrained_type: PretrainedType = PretrainedType.DEFAULT,
    custom_pretrained: str | None = None,
    save_interval: int = 10,
    save_all_checkpoints: bool = False,
    save_all_weights: bool = False,
    clear_saved_data: bool = False,
    upload_model: bool = False,
    upload_name: str | None = None,
    hardware_acceleration: DeviceType = DeviceType.AUTOMATIC,
    gpu_ids: set[int] | None = None,
    precision: PrecisionType = PrecisionType.FP32,
    preload_dataset: bool = False,
    reduce_memory_usage: bool = False,
) -> list[str] | None:
    """

    Train a voice model using its associated preprocessed dataset and
    extracted features.

    Parameters
    ----------
    model_name : str
        The name of the voice model to train.
    num_epochs : int, default=500
        The number of epochs to train the voice model. A higher number
        can improve voice model performance but may lead to
        overtraining.
    batch_size : int, default=8
        The number of samples to include in each training batch. It is
        advisable to align this value with the available VRAM of your
        GPU. A setting of 4 offers improved accuracy but slower
        processing, while 8 provides faster and standard results.
    detect_overtraining : bool, default=False
        Whether to detect overtraining to prevent the voice model from
        learning the training data too well and losing the ability to
        generalize to new data.
    overtraining_threshold : int, default=50
        The maximum number of epochs to continue training without any
        observed improvement in voice model performance.
    vocoder : Vocoder, default=Vocoder.HIFI_GAN
        The vocoder to use for audio synthesis during training. HiFi-GAN
        provides basic audio fidelity, while RefineGAN provides the
        highest audio fidelity.
    index_algorithm : IndexAlgorithm, default=IndexAlgorithm.AUTO
        The method to use for generating an index file for the trained
        voice model. KMeans is particularly useful for large datasets.
    pretrained_type : PretrainedType, default=PretrainedType.DEFAULT
        The type of pretrained model to finetune the voice model on.
        "None" will train the voice model from scratch, while
        "Default" will use a pretrained model tailored to the specific
        voice model architecture. "Custom" will use a custom pretrained
        model that you provide.
    custom_pretrained : str, optional
        The name of a custom pretrained model to finetune the voice
        model on.
    save_interval : int, default=10
        The epoch interval at which to save voice model weights and
        checkpoints. The best model weights are always saved regardless
        of this setting.
    save_all_checkpoints : bool, default=False
        Whether to save a unique checkpoint at each save interval. If
        not enabled, only the latest checkpoint will be saved at each
        interval.
    save_all_weights : bool, default=False
        Whether to save unique voice model weights at each save
        interval. If not enabled, only the best voice model weights will
        be saved.
    clear_saved_data : bool, default=False
        Whether to delete any existing training data associated
        with the voice model before training commences. Enable this
        setting only if you are training a new voice model from scratch
        or restarting training.
    upload_model : bool, default=False
        Whether to automatically upload the trained voice model so that
        it can be used for audio generation tasks within the Ultimate
        RVC app.
    upload_name : str, optional
        The name to give the uploaded voice model. Leading and trailing
        whitespace is ignored.
    hardware_acceleration : DeviceType, default=DeviceType.AUTOMATIC
        The type of hardware acceleration to use when training the voice
        model. `AUTOMATIC` will select the first available GPU and fall
        back to CPU if no GPUs are available.
    gpu_ids : set[int], optional
        Set of ids of the GPUs to use for training the voice model when
        `GPU` is selected for hardware acceleration.
    precision : PrecisionType, default=PrecisionType.FP32
        The precision type to use when training the voice model. FP16
        and BF16 can reduce VRAM usage and speed up training on
        supported hardware.
    preload_dataset : bool, default=False
        Whether to preload all training data into GPU memory. This can
        improve training speed but requires a lot of VRAM.
    reduce_memory_usage : bool, default=False
        Whether to reduce VRAM usage at the cost of slower training
        speed by enabling activation checkpointing. This is useful for
        GPUs with limited memory (e.g., <6GB VRAM) or when training with
        a batch size larger than what your GPU can normally accommodate.

    Returns
    -------
    list[str] | None
        A list containing the paths to the best weights file and the
        index file for the trained voice model, if they exist.
        Otherwise, None.

    Raises
    ------
    ModelAsssociatedEntityNotFoundError
        If the voice model to be trained does not have an associated
        dataset file list.
    NotProvidedError
        If an upload name is not provided (or consists only of
        whitespace) when the upload parameter is set.
    ModelExistsError
        If a voice model with the provided upload name already exists
        when the upload parameter is set.


    """
    model_path = validate_model(model_name, Entity.TRAINING_MODEL)
    filelist_path = model_path / "filelist.txt"
    if not filelist_path.is_file():
        raise ModelAsssociatedEntityNotFoundError(
            Entity.DATASET_FILE_LIST,
            model_name,
            Step.FEATURE_EXTRACTION,
        )
    upload_model_path = None
    if upload_model:
        # Normalize before validating: a whitespace-only name would
        # otherwise strip down to "" and resolve to VOICE_MODELS_DIR
        # itself, producing a misleading "model exists" error.
        normalized_upload_name = upload_name.strip() if upload_name else ""
        if not normalized_upload_name:
            raise NotProvidedError(Entity.UPLOAD_NAME)
        upload_model_path = VOICE_MODELS_DIR / normalized_upload_name
        if upload_model_path.is_dir():
            raise ModelExistsError(Entity.VOICE_MODEL, upload_name)

    model_info_dict = json_load(model_path / "model_info.json")

    model_info = ModelInfo.model_validate(model_info_dict)
    sample_rate = model_info.sample_rate

    pg, pd = _get_pretrained_model(
        pretrained_type,
        vocoder,
        sample_rate,
        custom_pretrained,
    )

    # Imported lazily, inside the function, as in the original code.
    from rvc_logic.rvc.train.train import main as train_main  # noqa: PLC0415

    device_type, device_ids = validate_devices(hardware_acceleration, gpu_ids)

    train_main(
        model_name,
        sample_rate,
        vocoder,
        num_epochs,
        batch_size,
        save_interval,
        # The trainer receives the inverse of ``save_all_checkpoints``.
        not save_all_checkpoints,
        save_all_weights,
        pg,
        pd,
        detect_overtraining,
        overtraining_threshold,
        clear_saved_data,
        preload_dataset,
        reduce_memory_usage,
        device_type,
        device_ids,
        precision,
    )

    model_file = model_path / f"{model_name}_best.pth"

    # Without a best-weights file there is nothing to index or upload.
    if not model_file.is_file():
        return None

    from rvc_logic.rvc.train.process.extract_index import (  # noqa: PLC0415
        main as extract_index_main,
    )

    extract_index_main(str(model_path), index_algorithm)

    index_file = model_path / f"{model_name}.index"

    if not index_file.is_file():
        return None
    if upload_model_path:
        copy_files_to_new_dir([index_file, model_file], upload_model_path)
    return [str(model_file), str(index_file)]
345
+
346
+
347
def stop_training(model_name: str) -> None:
    """
    Stop the training of a voice model.

    Reads the process ids recorded in the model's ``config.json``,
    sends ``SIGTERM`` to each of them, then clears the recorded list
    and writes the file back. Any failure is logged rather than
    raised.

    Parameters
    ----------
    model_name : str
        The name of the voice model to stop training for.

    """
    training_info_path = TRAINING_MODELS_DIR / model_name / "config.json"
    try:
        training_info_dict = json_load(training_info_path)
        training_info = TrainingInfo.model_validate(training_info_dict)
        for pid in training_info.process_pids:
            try:
                os.kill(pid, signal.SIGTERM)
            except ProcessLookupError:
                # The process has already exited. Keep going so the
                # remaining processes are still signalled and the stale
                # pid list is still cleared below.
                logger.debug("Training process %d is no longer running.", pid)
        training_info.process_pids = []
        updated_training_info_dict = training_info.model_dump()
        json_dump(updated_training_info_dict, training_info_path)
    except Exception as e:  # noqa: BLE001
        logger.error("Error stopping training: %s", e)  # noqa: TRY400
369
+ s", e) # noqa: TRY400
rvc_logic/core_train/typing_extra.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Module which defines extra types used by modules in the
3
+ rvc_logic.core_train package.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from pydantic import BaseModel, ConfigDict
9
+
10
+ from rvc_logic.typing_extra import TrainingSampleRate # noqa: TC002
11
+
12
+
13
class ModelInfo(BaseModel):
    """
    Metadata describing a voice model that is going to be trained.

    Attributes
    ----------
    sample_rate : TrainingSampleRate
        Sample rate of the post-processed audio that the model is
        trained on.

    """

    sample_rate: TrainingSampleRate
    # TODO add more attributes later
27
+
28
+
29
class TrainingInfo(BaseModel):
    """
    State of an in-progress training run for a voice model.

    Fields that are not declared here are accepted and kept
    (``extra="allow"``).

    Attributes
    ----------
    process_pids : list[int], default = []
        Ids of the processes that are running the training.

    """

    model_config = ConfigDict(extra="allow")

    process_pids: list[int] = []
    # TODO add more attributes later
43
+ ow")
rvc_logic/rvc/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """
2
+ The rvc package is a collection of tools for voice cloning using the RVC
3
+ method.
4
+ """
rvc_logic/rvc/common.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """Common constants and functions for the RVC package."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
# Directory that contains this module (the root of the rvc package).
RVC_DIR = Path(__file__).resolve().parent
# Directory holding the per-sample-rate JSON configuration files.
RVC_CONFIGS_DIR = RVC_DIR.joinpath("configs")
# Directory under which training models are stored.
RVC_TRAINING_MODELS_DIR = RVC_DIR.joinpath("train", "models")
rvc_logic/rvc/configs/32000.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "learning_rate": 1e-4,
6
+ "betas": [
7
+ 0.8,
8
+ 0.99
9
+ ],
10
+ "eps": 1e-9,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "c_mel": 45,
14
+ "c_kl": 1.0
15
+ },
16
+ "data": {
17
+ "max_wav_value": 32768.0,
18
+ "sample_rate": 32000,
19
+ "filter_length": 1024,
20
+ "hop_length": 320,
21
+ "win_length": 1024,
22
+ "n_mel_channels": 80,
23
+ "mel_fmin": 0.0,
24
+ "mel_fmax": null
25
+ },
26
+ "model": {
27
+ "inter_channels": 192,
28
+ "hidden_channels": 192,
29
+ "filter_channels": 768,
30
+ "text_enc_hidden_dim": 768,
31
+ "n_heads": 2,
32
+ "n_layers": 6,
33
+ "kernel_size": 3,
34
+ "p_dropout": 0,
35
+ "resblock": "1",
36
+ "resblock_kernel_sizes": [
37
+ 3,
38
+ 7,
39
+ 11
40
+ ],
41
+ "resblock_dilation_sizes": [
42
+ [
43
+ 1,
44
+ 3,
45
+ 5
46
+ ],
47
+ [
48
+ 1,
49
+ 3,
50
+ 5
51
+ ],
52
+ [
53
+ 1,
54
+ 3,
55
+ 5
56
+ ]
57
+ ],
58
+ "upsample_rates": [
59
+ 10,
60
+ 8,
61
+ 2,
62
+ 2
63
+ ],
64
+ "upsample_initial_channel": 512,
65
+ "upsample_kernel_sizes": [
66
+ 20,
67
+ 16,
68
+ 4,
69
+ 4
70
+ ],
71
+ "use_spectral_norm": false,
72
+ "gin_channels": 256,
73
+ "spk_embed_dim": 109
74
+ }
75
+ }
rvc_logic/rvc/configs/40000.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "learning_rate": 1e-4,
6
+ "betas": [
7
+ 0.8,
8
+ 0.99
9
+ ],
10
+ "eps": 1e-9,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "c_mel": 45,
14
+ "c_kl": 1.0
15
+ },
16
+ "data": {
17
+ "max_wav_value": 32768.0,
18
+ "sample_rate": 40000,
19
+ "filter_length": 2048,
20
+ "hop_length": 400,
21
+ "win_length": 2048,
22
+ "n_mel_channels": 125,
23
+ "mel_fmin": 0.0,
24
+ "mel_fmax": null
25
+ },
26
+ "model": {
27
+ "inter_channels": 192,
28
+ "hidden_channels": 192,
29
+ "filter_channels": 768,
30
+ "text_enc_hidden_dim": 768,
31
+ "n_heads": 2,
32
+ "n_layers": 6,
33
+ "kernel_size": 3,
34
+ "p_dropout": 0,
35
+ "resblock": "1",
36
+ "resblock_kernel_sizes": [
37
+ 3,
38
+ 7,
39
+ 11
40
+ ],
41
+ "resblock_dilation_sizes": [
42
+ [
43
+ 1,
44
+ 3,
45
+ 5
46
+ ],
47
+ [
48
+ 1,
49
+ 3,
50
+ 5
51
+ ],
52
+ [
53
+ 1,
54
+ 3,
55
+ 5
56
+ ]
57
+ ],
58
+ "upsample_rates": [
59
+ 10,
60
+ 10,
61
+ 2,
62
+ 2
63
+ ],
64
+ "upsample_initial_channel": 512,
65
+ "upsample_kernel_sizes": [
66
+ 16,
67
+ 16,
68
+ 4,
69
+ 4
70
+ ],
71
+ "use_spectral_norm": false,
72
+ "gin_channels": 256,
73
+ "spk_embed_dim": 109
74
+ }
75
+ }
rvc_logic/rvc/configs/48000.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "learning_rate": 1e-4,
6
+ "betas": [
7
+ 0.8,
8
+ 0.99
9
+ ],
10
+ "eps": 1e-9,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 17280,
13
+ "c_mel": 45,
14
+ "c_kl": 1.0
15
+ },
16
+ "data": {
17
+ "max_wav_value": 32768.0,
18
+ "sample_rate": 48000,
19
+ "filter_length": 2048,
20
+ "hop_length": 480,
21
+ "win_length": 2048,
22
+ "n_mel_channels": 128,
23
+ "mel_fmin": 0.0,
24
+ "mel_fmax": null
25
+ },
26
+ "model": {
27
+ "inter_channels": 192,
28
+ "hidden_channels": 192,
29
+ "filter_channels": 768,
30
+ "text_enc_hidden_dim": 768,
31
+ "n_heads": 2,
32
+ "n_layers": 6,
33
+ "kernel_size": 3,
34
+ "p_dropout": 0,
35
+ "resblock": "1",
36
+ "resblock_kernel_sizes": [
37
+ 3,
38
+ 7,
39
+ 11
40
+ ],
41
+ "resblock_dilation_sizes": [
42
+ [
43
+ 1,
44
+ 3,
45
+ 5
46
+ ],
47
+ [
48
+ 1,
49
+ 3,
50
+ 5
51
+ ],
52
+ [
53
+ 1,
54
+ 3,
55
+ 5
56
+ ]
57
+ ],
58
+ "upsample_rates": [
59
+ 12,
60
+ 10,
61
+ 2,
62
+ 2
63
+ ],
64
+ "upsample_initial_channel": 512,
65
+ "upsample_kernel_sizes": [
66
+ 24,
67
+ 20,
68
+ 4,
69
+ 4
70
+ ],
71
+ "use_spectral_norm": false,
72
+ "gin_channels": 256,
73
+ "spk_embed_dim": 109
74
+ }
75
+ }
rvc_logic/rvc/configs/config.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import pathlib
4
+
5
+ import torch
6
+
7
+ from rvc_logic.rvc.common import RVC_CONFIGS_DIR
8
+
9
# File names (relative to the configs directory) of the configuration
# files that are loaded, one per supported sample rate. Written as plain
# literals: a single-argument ``os.path.join`` returns its argument
# unchanged, so the values are identical to the previous ones.
version_config_paths = [
    "48000.json",
    "40000.json",
    "32000.json",
]
14
+
15
+
16
def singleton(cls):
    """
    Class decorator that makes ``cls`` behave as a singleton.

    The first call to the decorated name constructs the instance with
    the given arguments; every later call returns that same instance
    and ignores its arguments.

    The returned factory carries the class's ``__module__``,
    ``__name__``, ``__qualname__`` and ``__doc__``, and exposes the
    original class as ``__wrapped__``.
    """
    # Imported locally so this block does not depend on a change to the
    # file's top-level import section.
    import functools

    instances = {}

    @functools.wraps(
        cls,
        assigned=("__module__", "__name__", "__qualname__", "__doc__"),
        # Do not copy the class namespace into the function's __dict__.
        updated=(),
    )
    def get_instance(*args, **kwargs):
        if cls not in instances:
            instances[cls] = cls(*args, **kwargs)
        return instances[cls]

    return get_instance
25
+
26
+
27
@singleton
class Config:
    """
    Inference configuration: selected device, loaded JSON configuration
    files and the ``x_pad`` / ``x_query`` / ``x_center`` / ``x_max``
    values derived from the detected GPU memory.

    The class is wrapped by ``singleton``, so ``Config()`` returns the
    same instance on every call.
    """

    def __init__(self):
        """Detect the device, load the JSON configs and derive the x_* values."""
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.gpu_name = (
            torch.cuda.get_device_name(int(self.device.split(":")[-1]))
            if self.device.startswith("cuda")
            else None
        )
        self.json_config = self.load_config_json()
        # Must be initialised before device_config(), which reads it and
        # (on CUDA devices) overwrites it via set_cuda_config().
        self.gpu_mem = None
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    def load_config_json(self) -> dict:
        """
        Load every file listed in ``version_config_paths`` from the
        configs directory.

        Returns:
            dict: Parsed JSON content keyed by configuration file name.

        """
        configs = {}
        for config_file in version_config_paths:
            config_path = pathlib.Path(RVC_CONFIGS_DIR) / config_file
            # Explicit encoding: the platform default is not guaranteed
            # to be UTF-8.
            with config_path.open(encoding="utf-8") as f:
                configs[config_file] = json.load(f)
        return configs

    def device_config(self):
        """
        Determine the ``(x_pad, x_query, x_center, x_max)`` tuple for the
        current device.

        Returns:
            tuple: ``(1, 5, 30, 32)`` when the reported GPU memory is at
            most 4 (GiB, floored), otherwise ``(1, 6, 38, 41)``.

        """
        if self.device.startswith("cuda"):
            self.set_cuda_config()
        else:
            self.device = "cpu"

        # Default values (the original comment labelled these as the
        # "6GB GPU memory" configuration).
        x_pad, x_query, x_center, x_max = (1, 6, 38, 41)
        if self.gpu_mem is not None and self.gpu_mem <= 4:
            # Smaller values for low-memory GPUs (the original comment
            # said "5GB", but the threshold applied here is <= 4).
            x_pad, x_query, x_center, x_max = (1, 5, 30, 32)

        return x_pad, x_query, x_center, x_max

    def set_cuda_config(self):
        """Record the name and total memory (whole GiB, floored) of the CUDA device."""
        i_device = int(self.device.split(":")[-1])
        self.gpu_name = torch.cuda.get_device_name(i_device)

        self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // (
            1024**3
        )
69
+
70
+
71
def max_vram_gpu(gpu):
    """
    Return the total memory of CUDA device ``gpu`` rounded to whole GiB.

    NOTE(review): without CUDA the fallback is the *string* ``"8"``,
    while the CUDA branch returns an ``int``; kept as-is for
    compatibility with existing callers.
    """
    if not torch.cuda.is_available():
        return "8"
    total_bytes = torch.cuda.get_device_properties(gpu).total_memory
    return round(total_bytes / 1024 / 1024 / 1024)
77
+
78
+
79
def get_gpu_info():
    """
    Build a human-readable summary of the visible CUDA devices.

    Returns one ``"<index>: <name> (<mem> GB)"`` line per device, joined
    with newlines, or a fixed message when no device is found.
    """
    device_count = torch.cuda.device_count()
    descriptions = []
    if torch.cuda.is_available() or device_count != 0:
        for index in range(device_count):
            name = torch.cuda.get_device_name(index)
            total_bytes = torch.cuda.get_device_properties(index).total_memory
            # Adding 0.4 before truncating biases the figure upwards.
            size = int(total_bytes / 1024 / 1024 / 1024 + 0.4)
            descriptions.append(f"{index}: {name} ({size} GB)")
    if descriptions:
        return "\n".join(descriptions)
    return (
        "Unfortunately, there is no compatible GPU available to support your"
        " training."
    )
98
+
99
+
100
def get_number_of_gpus():
    """
    Return the CUDA device indices joined by ``"-"`` (e.g. ``"0-1"``),
    or ``"-"`` when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return "-"
    device_count = torch.cuda.device_count()
    return "-".join(str(index) for index in range(device_count))
105
+ "
rvc_logic/rvc/infer/infer.py ADDED
@@ -0,0 +1,528 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TYPE_CHECKING, Unpack
2
+
3
+ import logging
4
+ import os
5
+ import pathlib
6
+ import sys
7
+ import time
8
+ import traceback
9
+
10
+ import soxr
11
+
12
+ import numpy as np
13
+
14
+ import torch
15
+
16
+ import librosa
17
+ import soundfile as sf
18
+ from pedalboard import (
19
+ Bitcrush,
20
+ Chorus,
21
+ Clipping,
22
+ Compressor,
23
+ Delay,
24
+ Distortion,
25
+ Gain,
26
+ Limiter,
27
+ Pedalboard,
28
+ PitchShift,
29
+ Reverb,
30
+ )
31
+
32
+ now_dir = pathlib.Path.cwd()
33
+ sys.path.append(str(now_dir))
34
+ import lazy_loader as lazy
35
+
36
+ from rvc_logic.rvc.configs.config import Config
37
+ from rvc_logic.rvc.infer.pipeline import Pipeline as VC
38
+ from rvc_logic.rvc.infer.typing_extra import ConvertAudioKwArgs
39
+ from rvc_logic.rvc.lib.algorithm.synthesizers import Synthesizer
40
+ from rvc_logic.rvc.lib.tools.split_audio import merge_audio, process_audio
41
+ from rvc_logic.rvc.lib.utils import load_audio_infer, load_embedding
42
+ from rvc_logic.typing_extra import F0Method
43
+
44
+ if TYPE_CHECKING:
45
+ import noisereduce as nr
46
+ else:
47
+ nr = lazy.load("noisereduce")
48
+
49
+ # logging.getLogger("httpx").setLevel(logging.WARNING)
50
+ # logging.getLogger("httpcore").setLevel(logging.WARNING)
51
+ # logging.getLogger("faiss").setLevel(logging.WARNING)
52
+ # logging.getLogger("faiss.loader").setLevel(logging.WARNING)
53
+ logger = logging.getLogger(__name__)
54
+
55
+
56
class VoiceConverter:
    """
    A class for performing voice conversion using the Retrieval-Based Voice Conversion (RVC) method.
    """

    def __init__(self):
        """
        Initializes the VoiceConverter with default configuration, and sets up models and parameters.
        """
        self.config = Config()  # Load configuration
        self.hubert_model = (
            None  # Initialize the Hubert model (for embedding extraction)
        )
        self.last_embedder_model = None  # Last used embedder model
        self.tgt_sr = None  # Target sampling rate for the output audio
        self.net_g = None  # Generator network for voice conversion
        self.vc = None  # Voice conversion pipeline instance
        self.cpt = None  # Checkpoint for loading model weights
        self.version = None  # Model version
        self.n_spk = None  # Number of speakers in the model
        self.use_f0 = None  # Whether the model uses F0
        self.loaded_model = None  # Path of the currently loaded model weights

    def load_hubert(
        self,
        embedder_model: str,
        embedder_model_custom: str | None = None,
    ):
        """
        Loads the HuBERT model for speaker embedding extraction.

        Args:
            embedder_model (str): Path to the pre-trained HuBERT model.
            embedder_model_custom (str): Path to the custom HuBERT model.

        """
        self.hubert_model = load_embedding(embedder_model, embedder_model_custom)
        self.hubert_model = self.hubert_model.to(self.config.device).float()
        self.hubert_model.eval()

    @staticmethod
    def remove_audio_noise(data, sr, reduction_strength=0.7):
        """
        Removes noise from an audio file using the NoiseReduce library.

        Args:
            data (numpy.ndarray): The audio data as a NumPy array.
            sr (int): The sample rate of the audio data.
            reduction_strength (float): Strength of the noise reduction. Default is 0.7.

        Returns:
            The noise-reduced audio, or None if noise reduction failed.

        """
        try:
            return nr.reduce_noise(
                y=data,
                sr=sr,
                prop_decrease=reduction_strength,
            )
        except Exception as error:
            # Best effort: the caller falls back to the unprocessed audio.
            logger.exception("An error occurred removing audio noise: %s", error)
            return None

    @staticmethod
    def convert_audio_format(input_path, output_path, output_format):
        """
        Converts an audio file to a specified output format.

        Args:
            input_path (str): Path to the input audio file.
            output_path (str): Path to the output audio file.
            output_format (str): Desired audio format (e.g., "WAV", "MP3").

        Returns:
            The output path, or None if the conversion failed. For "WAV"
            nothing is written and ``output_path`` is returned as-is.

        """
        try:
            if output_format != "WAV":
                logger.info("Saving audio as %s...", output_format)
                audio, sample_rate = librosa.load(input_path, sr=None)
                common_sample_rates = [
                    8000,
                    11025,
                    12000,
                    16000,
                    22050,
                    24000,
                    32000,
                    44100,
                    48000,
                ]
                # Snap to the closest common sample rate before writing.
                target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
                audio = librosa.resample(
                    audio,
                    orig_sr=sample_rate,
                    target_sr=target_sr,
                    res_type="soxr_vhq",
                )
                sf.write(output_path, audio, target_sr, format=output_format.lower())
            return output_path
        except Exception as error:
            logger.exception(
                "An error occurred converting the audio format: %s",
                error,
            )
            return None

    @staticmethod
    def post_process_audio(
        audio_input,
        sample_rate,
        **kwargs,
    ):
        """
        Applies the effects enabled in ``kwargs`` to the audio.

        Each effect is enabled by a truthy flag (``reverb``, ``pitch_shift``,
        ``limiter``, ``gain``, ``distortion``, ``chorus``, ``bitcrush``,
        ``clipping``, ``compressor``, ``delay``) and configured by the
        matching ``<effect>_*`` keys; effects are appended to the board in
        that order.

        Args:
            audio_input: The audio data to process.
            sample_rate (int): The sample rate of the audio data.
            **kwargs: Effect flags and their settings.

        Returns:
            The processed audio returned by the Pedalboard.

        """
        board = Pedalboard()
        if kwargs.get("reverb"):
            reverb = Reverb(
                room_size=kwargs.get("reverb_room_size", 0.5),
                damping=kwargs.get("reverb_damping", 0.5),
                wet_level=kwargs.get("reverb_wet_level", 0.33),
                dry_level=kwargs.get("reverb_dry_level", 0.4),
                width=kwargs.get("reverb_width", 1.0),
                freeze_mode=kwargs.get("reverb_freeze_mode", 0),
            )
            board.append(reverb)
        if kwargs.get("pitch_shift"):
            pitch_shift = PitchShift(semitones=kwargs.get("pitch_shift_semitones", 0))
            board.append(pitch_shift)
        if kwargs.get("limiter"):
            limiter = Limiter(
                threshold_db=kwargs.get("limiter_threshold", -6),
                release_ms=kwargs.get("limiter_release", 0.05),
            )
            board.append(limiter)
        if kwargs.get("gain"):
            gain = Gain(gain_db=kwargs.get("gain_db", 0))
            board.append(gain)
        if kwargs.get("distortion"):
            distortion = Distortion(drive_db=kwargs.get("distortion_gain", 25))
            board.append(distortion)
        if kwargs.get("chorus"):
            chorus = Chorus(
                rate_hz=kwargs.get("chorus_rate", 1.0),
                depth=kwargs.get("chorus_depth", 0.25),
                centre_delay_ms=kwargs.get("chorus_delay", 7),
                feedback=kwargs.get("chorus_feedback", 0.0),
                mix=kwargs.get("chorus_mix", 0.5),
            )
            board.append(chorus)
        if kwargs.get("bitcrush"):
            bitcrush = Bitcrush(bit_depth=kwargs.get("bitcrush_bit_depth", 8))
            board.append(bitcrush)
        if kwargs.get("clipping"):
            clipping = Clipping(threshold_db=kwargs.get("clipping_threshold", 0))
            board.append(clipping)
        if kwargs.get("compressor"):
            compressor = Compressor(
                threshold_db=kwargs.get("compressor_threshold", 0),
                ratio=kwargs.get("compressor_ratio", 1),
                attack_ms=kwargs.get("compressor_attack", 1.0),
                release_ms=kwargs.get("compressor_release", 100),
            )
            board.append(compressor)
        if kwargs.get("delay"):
            delay = Delay(
                delay_seconds=kwargs.get("delay_seconds", 0.5),
                feedback=kwargs.get("delay_feedback", 0.0),
                mix=kwargs.get("delay_mix", 0.5),
            )
            board.append(delay)
        return board(audio_input, sample_rate)

    def convert_audio(
        self,
        audio_input_path: str,
        audio_output_path: str,
        model_path: str,
        index_path: str,
        pitch: int = 0,
        f0_method: F0Method = "rmvpe",
        index_rate: float = 0.75,
        volume_envelope: float = 1,
        protect: float = 0.5,
        split_audio: bool = False,
        f0_autotune: bool = False,
        f0_autotune_strength: float = 1,
        embedder_model: str = "contentvec",
        embedder_model_custom: str | None = None,
        clean_audio: bool = False,
        clean_strength: float = 0.5,
        export_format: str = "WAV",
        post_process: bool = False,
        resample_sr: int = 0,
        sid: int = 0,
        proposed_pitch: bool = False,
        proposed_pitch_threshold: float = 155.0,
        **kwargs: Unpack[ConvertAudioKwArgs],
    ):
        """
        Performs voice conversion on the input audio.

        Args:
            audio_input_path (str): Path to the input audio file.
            audio_output_path (str): Path to the output audio file.
            model_path (str): Path to the voice conversion model.
            index_path (str): Path to the index file.
            pitch (int): Key for F0 up-sampling.
            f0_method (str): Method for F0 extraction.
            index_rate (float): Rate for index matching.
            volume_envelope (float): RMS mix rate.
            protect (float): Protection rate for certain audio segments.
            split_audio (bool): Whether to split the audio for processing.
            f0_autotune (bool): Whether to use F0 autotune.
            f0_autotune_strength (float): Strength of the F0 autotune.
            embedder_model (str): Path to the embedder model.
            embedder_model_custom (str): Path to the custom embedder model.
            clean_audio (bool): Whether to clean the audio.
            clean_strength (float): Strength of the audio cleaning.
            export_format (str): Format for exporting the audio.
            post_process (bool): Whether to apply the effects configured in
                ``kwargs`` to the converted audio.
            resample_sr (int, optional): Resample sampling rate. Default is 0.
            sid (int, optional): Speaker ID. Default is 0.
            proposed_pitch (bool): Forwarded to the conversion pipeline.
            proposed_pitch_threshold (float): Forwarded to the conversion
                pipeline.
            **kwargs: Additional keyword arguments, forwarded to the audio
                loader and to ``post_process_audio``.

        Raises:
            FileNotFoundError: If no model could be loaded from
                ``model_path``.

        """
        if not model_path:
            logger.info("No model path provided. Aborting conversion.")
            return

        self.get_vc(model_path, sid)
        if self.vc is None:
            # get_vc() leaves the pipeline unset when the weights file does
            # not exist; fail with a clear message instead of an
            # AttributeError on `self.vc.pipeline` further down.
            raise FileNotFoundError(
                f"Voice model weights not found: '{model_path}'",
            )
        start_time = time.time()
        logger.info("Converting audio '%s'...", audio_input_path)

        audio = load_audio_infer(
            audio_input_path,
            16000,
            **kwargs,
        )
        audio_max = np.abs(audio).max() / 0.95

        # Scale down only when the peak exceeds 0.95.
        if audio_max > 1:
            audio /= audio_max

        if not self.hubert_model or embedder_model != self.last_embedder_model:
            self.load_hubert(embedder_model, embedder_model_custom)
            self.last_embedder_model = embedder_model

        file_index = (
            index_path.strip()
            .strip('"')
            .strip("\n")
            .strip('"')
            .strip()
            .replace("trained", "added")
        )

        # Chained comparison: true when resample_sr differs from the
        # current target rate AND resample_sr >= 16000.
        # NOTE(review): only the recorded target rate changes here; no
        # resampling is visible in this method - confirm it happens
        # downstream.
        if self.tgt_sr != resample_sr >= 16000:
            self.tgt_sr = resample_sr

        if split_audio:
            chunks, intervals = process_audio(audio, 16000)
            logger.info("Audio split into %d chunks for processing.", len(chunks))
        else:
            chunks = [audio]

        converted_chunks = []
        for c in chunks:
            audio_opt = self.vc.pipeline(
                model=self.hubert_model,
                net_g=self.net_g,
                sid=sid,
                audio=c,
                pitch=pitch,
                f0_method=f0_method or F0Method.RMVPE,
                file_index=file_index,
                index_rate=index_rate,
                pitch_guidance=self.use_f0,
                volume_envelope=volume_envelope,
                version=self.version,
                protect=protect,
                f0_autotune=f0_autotune,
                f0_autotune_strength=f0_autotune_strength,
                proposed_pitch=proposed_pitch,
                proposed_pitch_threshold=proposed_pitch_threshold,
            )
            converted_chunks.append(audio_opt)
            if split_audio:
                logger.info("Converted audio chunk %d", len(converted_chunks))

        if split_audio:
            audio_opt = merge_audio(
                chunks,
                converted_chunks,
                intervals,
                16000,
                self.tgt_sr,
            )
        else:
            audio_opt = converted_chunks[0]

        if clean_audio:
            cleaned_audio = self.remove_audio_noise(
                audio_opt,
                self.tgt_sr,
                clean_strength,
            )
            # Keep the uncleaned audio if noise reduction failed.
            if cleaned_audio is not None:
                audio_opt = cleaned_audio

        if post_process:
            audio_opt = self.post_process_audio(
                audio_input=audio_opt,
                sample_rate=self.tgt_sr,
                **kwargs,
            )

        # Always write a WAV first; other formats are produced from it.
        sf.write(audio_output_path, audio_opt, self.tgt_sr, format="WAV")
        output_path_format = audio_output_path.replace(
            ".wav",
            f".{export_format.lower()}",
        )
        audio_output_path = self.convert_audio_format(
            audio_output_path,
            output_path_format,
            export_format,
        )

        elapsed_time = time.time() - start_time
        logger.info(
            "Conversion completed at '%s' in %.2f seconds.",
            audio_output_path,
            elapsed_time,
        )

    def convert_audio_batch(
        self,
        audio_input_paths: str,
        audio_output_path: str,
        **kwargs,
    ):
        """
        Performs voice conversion on a batch of input audio files.

        The id of the current process is written to ``assets/infer_pid.txt``
        (under the working directory captured at import time) for the
        duration of the batch and the file is removed afterwards. Files
        whose output already exists are skipped. Errors are logged, not
        raised.

        Args:
            audio_input_paths (str): Path to the directory containing the
                input audio files.
            audio_output_path (str): Path to the directory receiving the
                converted files.
            **kwargs: Additional keyword arguments, forwarded to
                ``convert_audio``.

        """
        pid_file_path = pathlib.Path(now_dir) / "assets" / "infer_pid.txt"
        audio_extensions = (
            "wav",
            "mp3",
            "flac",
            "ogg",
            "opus",
            "m4a",
            "mp4",
            "aac",
            "alac",
            "wma",
            "aiff",
            "webm",
            "ac3",
        )
        try:
            with pid_file_path.open("w") as pid_file:
                pid_file.write(str(os.getpid()))
            start_time = time.time()
            logger.info("Converting audio batch '%s'...", audio_input_paths)
            audio_files = [
                f
                for f in os.listdir(audio_input_paths)
                if f.lower().endswith(audio_extensions)
            ]
            logger.info("Detected %d audio files for inference.", len(audio_files))
            for a in audio_files:
                new_input = os.path.join(audio_input_paths, a)
                new_output = os.path.splitext(a)[0] + "_output.wav"
                new_output = os.path.join(audio_output_path, new_output)
                if pathlib.Path(new_output).exists():
                    continue
                self.convert_audio(
                    audio_input_path=new_input,
                    audio_output_path=new_output,
                    **kwargs,
                )
                logger.info("Conversion completed at '%s'.", audio_input_paths)
            elapsed_time = time.time() - start_time
            logger.info("Batch conversion completed in %.2f seconds.", elapsed_time)
        except Exception as error:
            logger.exception(
                "An error occurred during audio batch conversion: %s",
                error,
            )
        finally:
            # The pid file may never have been created (e.g. the assets
            # directory is missing); do not let cleanup raise in that case.
            pid_file_path.unlink(missing_ok=True)

    def get_vc(self, weight_root, sid):
        """
        Loads the voice conversion model and sets up the pipeline.

        The model is (re)loaded only when no model is loaded or a different
        weights path is requested. If the weights cannot be loaded, the
        pipeline is left unset.

        Args:
            weight_root (str): Path to the model weights.
            sid (int): Speaker ID. An empty string or empty list triggers a
                cleanup of the current model first.

        """
        if sid == "" or sid == []:
            self.cleanup_model()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        if not self.loaded_model or self.loaded_model != weight_root:
            self.load_model(weight_root)
            if self.cpt is not None:
                self.setup_network()
                self.setup_vc_instance()
                self.loaded_model = weight_root
            else:
                self.vc = None
                self.loaded_model = None

    def cleanup_model(self):
        """
        Cleans up the model and releases resources.

        Attributes are reset to None rather than deleted so that later
        reads of ``self.net_g`` do not raise AttributeError, and
        ``loaded_model`` is cleared so the next ``get_vc`` call reloads the
        weights.
        """
        if self.hubert_model is not None:
            self.hubert_model = self.n_spk = self.vc = self.tgt_sr = None

        self.net_g = None
        self.cpt = None
        self.loaded_model = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def load_model(self, weight_root):
        """
        Loads the model weights from the specified path.

        Sets ``self.cpt`` to the loaded checkpoint, or to None when the path
        is not an existing file.

        Args:
            weight_root (str): Path to the model weights.

        """
        # SECURITY NOTE: weights_only=False unpickles arbitrary objects;
        # only load checkpoints from trusted sources.
        self.cpt = (
            torch.load(weight_root, map_location="cpu", weights_only=False)
            if pathlib.Path(weight_root).is_file()
            else None
        )

    def setup_network(self):
        """
        Sets up the network configuration based on the loaded checkpoint.
        """
        if self.cpt is not None:
            self.tgt_sr = self.cpt["config"][-1]
            # Overwrite config[-3] with the first dimension of the speaker
            # embedding table stored in the checkpoint.
            self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]
            self.use_f0 = self.cpt.get("f0", 1)

            self.version = self.cpt.get("version", "v1")
            self.text_enc_hidden_dim = 768 if self.version == "v2" else 256
            self.vocoder = self.cpt.get("vocoder", "HiFi-GAN")
            self.net_g = Synthesizer(
                *self.cpt["config"],
                use_f0=self.use_f0,
                text_enc_hidden_dim=self.text_enc_hidden_dim,
                vocoder=self.vocoder,
            )
            # enc_q is dropped before loading weights, hence strict=False.
            del self.net_g.enc_q
            self.net_g.load_state_dict(self.cpt["weight"], strict=False)
            self.net_g = self.net_g.to(self.config.device).float()
            self.net_g.eval()

    def setup_vc_instance(self):
        """
        Sets up the voice conversion pipeline instance based on the target sampling rate and configuration.
        """
        if self.cpt is not None:
            self.vc = VC(self.tgt_sr, self.config)
            self.n_spk = self.cpt["config"][-3]
528
+ f.cpt["config"][-3]
rvc_logic/rvc/infer/pipeline.py ADDED
@@ -0,0 +1,581 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pathlib
2
+ import sys
3
+
4
+ import numpy as np
5
+ from scipy import signal
6
+
7
+ import faiss
8
+ import torch
9
+ import torch.nn.functional as F
10
+
11
+ import librosa
12
+
13
+ now_dir = pathlib.Path.cwd()
14
+ sys.path.append(str(now_dir))
15
+
16
+ import logging
17
+
18
+ from rvc_logic.rvc.lib.predictors.f0 import CREPE, FCPE, RMVPE
19
+
20
+ # logging.getLogger("faiss").setLevel(logging.WARNING)
21
+ logger = logging.getLogger(__name__)
22
+
23
# High-pass Butterworth filter coefficients, designed once at import time.
FILTER_ORDER = 5
CUTOFF_FREQUENCY = 48  # Hz
SAMPLE_RATE = 16000  # Hz
bh, ah = signal.butter(
    FILTER_ORDER,
    CUTOFF_FREQUENCY,
    btype="high",
    fs=SAMPLE_RATE,
)
33
+
34
+
35
class AudioProcessor:
    """
    A class for processing audio signals, specifically for adjusting RMS levels.
    """

    @staticmethod
    def change_rms(
        source_audio: np.ndarray,
        source_rate: int,
        target_audio: np.ndarray,
        target_rate: int,
        rate: float,
    ) -> np.ndarray:
        """
        Adjust the RMS level of target_audio to match the RMS of source_audio, with a given blending rate.

        Declared as a static method so it can be called both on the class
        (``AudioProcessor.change_rms(...)``) and on an instance.

        Args:
            source_audio: The source audio signal as a NumPy array.
            source_rate: The sampling rate of the source audio.
            target_audio: The target audio signal to adjust.
            target_rate: The sampling rate of the target audio.
            rate: The blending rate between the source and target RMS levels.
                The target is scaled by ``rms_source**(1 - rate) * rms_target**(rate - 1)``,
                so ``rate=1`` leaves the target unchanged and ``rate=0`` scales it
                by the full source/target RMS ratio.

        Returns:
            The gain-adjusted target audio as a NumPy array.

        """
        # Calculate RMS of both audio data (one-second frames, half-second hop)
        rms1 = librosa.feature.rms(
            y=source_audio,
            frame_length=source_rate // 2 * 2,
            hop_length=source_rate // 2,
        )
        rms2 = librosa.feature.rms(
            y=target_audio,
            frame_length=target_rate // 2 * 2,
            hop_length=target_rate // 2,
        )

        # Interpolate both RMS envelopes to one value per target sample
        rms1 = F.interpolate(
            torch.from_numpy(rms1).float().unsqueeze(0),
            size=target_audio.shape[0],
            mode="linear",
        ).squeeze()
        rms2 = F.interpolate(
            torch.from_numpy(rms2).float().unsqueeze(0),
            size=target_audio.shape[0],
            mode="linear",
        ).squeeze()
        # Floor the target RMS so the negative power below cannot divide by zero
        rms2 = torch.maximum(rms2, torch.zeros_like(rms2) + 1e-6)

        # Adjust target audio RMS based on the source audio RMS
        adjusted_audio = (
            target_audio
            * (torch.pow(rms1, 1 - rate) * torch.pow(rms2, rate - 1)).numpy()
        )
        return adjusted_audio
89
+
90
+
91
class Autotune:
    """
    A class for applying autotune to a given fundamental frequency (F0) contour.
    """

    def __init__(self):
        """
        Initializes the Autotune class with a set of reference frequencies.
        """
        self.note_dict = [
            49.00,  # G1
            51.91,  # G#1 / Ab1
            55.00,  # A1
            58.27,  # A#1 / Bb1
            61.74,  # B1
            65.41,  # C2
            69.30,  # C#2 / Db2
            73.42,  # D2
            77.78,  # D#2 / Eb2
            82.41,  # E2
            87.31,  # F2
            92.50,  # F#2 / Gb2
            98.00,  # G2
            103.83,  # G#2 / Ab2
            110.00,  # A2
            116.54,  # A#2 / Bb2
            123.47,  # B2
            130.81,  # C3
            138.59,  # C#3 / Db3
            146.83,  # D3
            155.56,  # D#3 / Eb3
            164.81,  # E3
            174.61,  # F3
            185.00,  # F#3 / Gb3
            196.00,  # G3
            207.65,  # G#3 / Ab3
            220.00,  # A3
            233.08,  # A#3 / Bb3
            246.94,  # B3
            261.63,  # C4
            277.18,  # C#4 / Db4
            293.66,  # D4
            311.13,  # D#4 / Eb4
            329.63,  # E4
            349.23,  # F4
            369.99,  # F#4 / Gb4
            392.00,  # G4
            415.30,  # G#4 / Ab4
            440.00,  # A4
            466.16,  # A#4 / Bb4
            493.88,  # B4
            523.25,  # C5
            554.37,  # C#5 / Db5
            587.33,  # D5
            622.25,  # D#5 / Eb5
            659.25,  # E5
            698.46,  # F5
            739.99,  # F#5 / Gb5
            783.99,  # G5
            830.61,  # G#5 / Ab5
            880.00,  # A5
            932.33,  # A#5 / Bb5
            987.77,  # B5
            1046.50,  # C6
        ]

    def autotune_f0(self, f0, f0_autotune_strength):
        """
        Autotunes a given F0 contour by pulling each voiced frequency towards the closest reference frequency.

        Frames whose F0 is not positive are treated as unvoiced and returned
        unchanged; snapping them would turn them into a 49 Hz (lowest note)
        voiced frame.

        Args:
            f0: The input F0 contour as a NumPy array.
            f0_autotune_strength: Blend factor; 0 keeps the original pitch,
                1 snaps fully onto the closest reference frequency.

        Returns:
            A new array with the same shape and dtype as ``f0``; the input is
            not modified.

        """
        f0 = np.asarray(f0)
        autotuned_f0 = np.array(f0, copy=True)
        voiced = f0 > 0
        if not voiced.any():
            return autotuned_f0

        notes = np.asarray(self.note_dict, dtype=np.float64)
        freqs = f0[voiced].astype(np.float64)
        # argmin returns the first minimum, i.e. the lower note on an exact tie
        closest = notes[np.abs(freqs[:, None] - notes[None, :]).argmin(axis=1)]
        autotuned_f0[voiced] = freqs + (closest - freqs) * f0_autotune_strength
        return autotuned_f0
170
+
171
+
172
class Pipeline:
    """
    The main pipeline class for performing voice conversion, including preprocessing, F0 estimation,
    voice conversion using a model, and post-processing.
    """

    def __init__(self, tgt_sr, config):
        """
        Initializes the Pipeline class with target sampling rate and configuration parameters.

        Args:
            tgt_sr: The target sampling rate for the output audio.
            config: A configuration object; ``x_pad``, ``x_query``, ``x_center``,
                ``x_max`` and ``device`` are read from it.

        """
        self.x_pad = config.x_pad
        self.x_query = config.x_query
        self.x_center = config.x_center
        self.x_max = config.x_max
        self.sample_rate = 16000
        self.tgt_sr = tgt_sr
        self.window = 160
        # The x_* settings are multiplied by a sampling rate to get sample counts
        self.t_pad = self.sample_rate * self.x_pad
        self.t_pad_tgt = tgt_sr * self.x_pad
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sample_rate * self.x_query
        self.t_center = self.sample_rate * self.x_center
        self.t_max = self.sample_rate * self.x_max
        self.time_step = self.window / self.sample_rate * 1000
        self.f0_min = 50
        self.f0_max = 1100
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
        self.device = config.device
        self.autotune = Autotune()

    def get_f0(
        self,
        x,
        p_len,
        f0_method: str = "rmvpe",
        pitch: int = 0,
        f0_autotune: bool = False,
        f0_autotune_strength: float = 1.0,
        proposed_pitch: bool = False,
        proposed_pitch_threshold: float = 155.0,
    ):
        """
        Estimates the fundamental frequency (F0) of a given audio signal using various methods.

        Args:
            x: The input audio signal as a NumPy array.
            p_len: Desired length of the F0 output.
            f0_method: Method to use for F0 estimation: "crepe", "crepe-tiny",
                "rmvpe" or "fcpe".
            pitch: Key (in semitones) to adjust the pitch of the F0 contour.
            f0_autotune: Whether to apply autotune to the F0 contour.
            f0_autotune_strength: Blend factor passed to the autotuner.
            proposed_pitch: whether to apply proposed pitch adjustment
            proposed_pitch_threshold: target frequency, 155.0 for male, 255.0 for female

        Returns:
            A tuple ``(f0_coarse, f0)``: the F0 contour quantized to integer
            buckets in [1, 255] and the adjusted F0 contour itself.

        Raises:
            ValueError: If ``f0_method`` is not one of the supported methods.

        """
        if f0_method == "crepe":
            model = CREPE(
                device=self.device, sample_rate=self.sample_rate, hop_size=self.window
            )
            f0 = model.get_f0(x, self.f0_min, self.f0_max, p_len, "full")
            del model
        elif f0_method == "crepe-tiny":
            model = CREPE(
                device=self.device, sample_rate=self.sample_rate, hop_size=self.window
            )
            f0 = model.get_f0(x, self.f0_min, self.f0_max, p_len, "tiny")
            del model
        elif f0_method == "rmvpe":
            model = RMVPE(
                device=self.device, sample_rate=self.sample_rate, hop_size=self.window
            )
            f0 = model.get_f0(x, filter_radius=0.03)
            del model
        elif f0_method == "fcpe":
            model = FCPE(
                device=self.device, sample_rate=self.sample_rate, hop_size=self.window
            )
            f0 = model.get_f0(x, p_len, filter_radius=0.006)
            del model
        else:
            # Without this branch an unknown method left `f0` unbound below.
            raise ValueError(f"Unsupported f0_method: {f0_method!r}")

        # f0 adjustments
        # NOTE(review): when autotune is enabled the `pitch` key shift is not
        # applied at all; confirm this is intended.
        if f0_autotune is True:
            f0 = self.autotune.autotune_f0(f0, f0_autotune_strength)
        elif proposed_pitch is True:
            limit = 12
            # calculate median f0 of the audio
            valid_f0 = np.where(f0 > 0)[0]
            if len(valid_f0) < 2:
                # no valid f0 detected
                up_key = 0
            else:
                median_f0 = float(
                    np.median(np.interp(np.arange(len(f0)), valid_f0, f0[valid_f0]))
                )
                if median_f0 <= 0 or np.isnan(median_f0):
                    up_key = 0
                else:
                    # calculate proposed shift, clamped to +/- one octave
                    up_key = max(
                        -limit,
                        min(
                            limit,
                            int(
                                np.round(
                                    12 * np.log2(proposed_pitch_threshold / median_f0)
                                )
                            ),
                        ),
                    )
            logger.info("calculated pitch offset: %d", up_key)
            f0 *= pow(2, (pitch + up_key) / 12)
        else:
            f0 *= pow(2, pitch / 12)
        # quantizing f0 to 255 buckets to make coarse f0
        f0bak = f0.copy()
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
            self.f0_mel_max - self.f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        f0_coarse = np.rint(f0_mel).astype(int)

        return f0_coarse, f0bak

    def voice_conversion(
        self,
        model,
        net_g,
        sid,
        audio0,
        pitch,
        pitchf,
        index,
        big_npy,
        index_rate,
        version,
        protect,
    ):
        """
        Performs voice conversion on a given audio segment.

        Args:
            model: The feature extractor model.
            net_g: The generative model for synthesizing speech.
            sid: Speaker ID for the target voice.
            audio0: The input audio segment.
            pitch: Quantized F0 contour for pitch guidance, or None.
            pitchf: Original F0 contour for pitch guidance, or None.
            index: FAISS index for speaker embedding retrieval.
            big_npy: Speaker embeddings stored in a NumPy array.
            index_rate: Blending rate for speaker embedding retrieval.
            version: Model version (Keep to support old models).
            protect: Protection level for preserving the original pitch.

        Returns:
            The converted audio segment as a float NumPy array.

        """
        with torch.no_grad():
            pitch_guidance = pitch is not None and pitchf is not None
            # prepare source audio
            feats = torch.from_numpy(audio0).float()
            feats = feats.mean(-1) if feats.dim() == 2 else feats
            assert feats.dim() == 1, feats.dim()
            feats = feats.view(1, -1).to(self.device)
            # extract features
            feats = model(feats)["last_hidden_state"]
            feats = (
                model.final_proj(feats[0]).unsqueeze(0) if version == "v1" else feats
            )
            # make a copy for pitch guidance and protection
            feats0 = feats.clone() if pitch_guidance else None
            if (
                index
            ):  # set by parent function, only true if index is available, loaded, and index rate > 0
                feats = self._retrieve_speaker_embeddings(
                    feats,
                    index,
                    big_npy,
                    index_rate,
                )
            # feature upsampling
            feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(
                0,
                2,
                1,
            )
            # adjust the length if the audio is short
            p_len = min(audio0.shape[0] // self.window, feats.shape[1])
            if pitch_guidance:
                feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                    0,
                    2,
                    1,
                )
                pitch, pitchf = pitch[:, :p_len], pitchf[:, :p_len]
                # Pitch protection blending: frames with pitchf > 0 keep the
                # (retrieved) features, the rest are blended with the raw ones
                if protect < 0.5:
                    pitchff = pitchf.clone()
                    pitchff[pitchf > 0] = 1
                    pitchff[pitchf < 1] = protect
                    feats = feats * pitchff.unsqueeze(-1) + feats0 * (
                        1 - pitchff.unsqueeze(-1)
                    )
                    feats = feats.to(feats0.dtype)
            else:
                pitch, pitchf = None, None
            p_len = torch.tensor([p_len], device=self.device).long()
            # pitchf is None without pitch guidance; calling .float() on it
            # unconditionally raised AttributeError for those models.
            pitchf_in = pitchf.float() if pitchf is not None else None
            audio1 = (
                (net_g.infer(feats.float(), p_len, pitch, pitchf_in, sid)[0][0, 0])
                .data.cpu()
                .float()
                .numpy()
            )
            # clean up
            del feats, feats0, p_len
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        return audio1

    def _retrieve_speaker_embeddings(self, feats, index, big_npy, index_rate):
        """
        Blend extracted features with their nearest neighbours from the index.

        Each frame is replaced by the inverse-square-score weighted mean of its
        8 nearest entries of ``big_npy`` and then mixed with the original
        features using ``index_rate``.
        """
        npy = feats[0].cpu().numpy()
        score, ix = index.search(npy, k=8)
        # Clamp the scores: an exact match (score 0) would otherwise give an
        # infinite weight and NaN features after normalisation.
        weight = np.square(1 / np.maximum(score, 1e-9))
        weight /= weight.sum(axis=1, keepdims=True)
        npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
        feats = (
            torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
            + (1 - index_rate) * feats
        )
        return feats

    def pipeline(
        self,
        model,
        net_g,
        sid,
        audio,
        pitch,
        f0_method,
        file_index,
        index_rate,
        pitch_guidance,
        volume_envelope,
        version,
        protect,
        f0_autotune,
        f0_autotune_strength,
        proposed_pitch,
        proposed_pitch_threshold,
    ):
        """
        The main pipeline function for performing voice conversion.

        Args:
            model: The feature extractor model.
            net_g: The generative model for synthesizing speech.
            sid: Speaker ID for the target voice.
            audio: The input audio signal.
            pitch: Key to adjust the pitch of the F0 contour.
            f0_method: Method to use for F0 estimation.
            file_index: Path to the FAISS index file for speaker embedding retrieval.
            index_rate: Blending rate for speaker embedding retrieval.
            pitch_guidance: Whether to use pitch guidance during voice conversion.
            volume_envelope: RMS blending rate; 1 skips the RMS adjustment.
            version: Model version.
            protect: Protection level for preserving the original pitch.
            f0_autotune: Whether to apply autotune to the F0 contour.
            f0_autotune_strength: Blend factor passed to the autotuner.
            proposed_pitch: Whether to apply the proposed pitch adjustment.
            proposed_pitch_threshold: Target frequency for the proposed pitch adjustment.

        Returns:
            The converted audio as a NumPy array.

        """
        index = big_npy = None
        if file_index != "" and pathlib.Path(file_index).exists() and index_rate > 0:
            try:
                index = faiss.read_index(file_index)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except Exception as error:
                # Best effort: conversion continues without the index
                logger.warning("An error occurred reading the FAISS index: %s", error)
                index = big_npy = None

        audio = signal.filtfilt(bh, ah, audio)
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
        if audio_pad.shape[0] > self.t_max:
            # Long input: look for quiet points near every t_center samples to
            # cut the audio into separately converted segments.
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]
            for t in range(self.t_center, audio.shape[0], self.t_center):
                search = np.abs(audio_sum[t - self.t_query : t + self.t_query])
                # argmin yields the first position of the minimum
                opt_ts.append(t - self.t_query + int(np.argmin(search)))
        s = 0
        audio_opt = []
        t = None
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window
        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        if pitch_guidance:
            pitch, pitchf = self.get_f0(
                audio_pad,
                p_len,
                f0_method,
                pitch,
                f0_autotune,
                f0_autotune_strength,
                proposed_pitch,
                proposed_pitch_threshold,
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            if self.device == "mps":
                pitchf = pitchf.astype(np.float32)
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()

        def convert(segment, seg_pitch, seg_pitchf):
            # Convert one padded segment and trim the padding from the output
            converted = self.voice_conversion(
                model,
                net_g,
                sid,
                segment,
                seg_pitch,
                seg_pitchf,
                index,
                big_npy,
                index_rate,
                version,
                protect,
            )
            return converted[self.t_pad_tgt : -self.t_pad_tgt]

        for t in opt_ts:
            t = t // self.window * self.window
            seg_pitch = seg_pitchf = None
            if pitch_guidance:
                lo = s // self.window
                hi = (t + self.t_pad2) // self.window
                seg_pitch = pitch[:, lo:hi]
                seg_pitchf = pitchf[:, lo:hi]
            audio_opt.append(
                convert(
                    audio_pad[s : t + self.t_pad2 + self.window],
                    seg_pitch,
                    seg_pitchf,
                ),
            )
            s = t

        # Remaining audio after the last cut (the whole input when there were no cuts)
        seg_pitch = seg_pitchf = None
        if pitch_guidance:
            seg_pitch = pitch[:, t // self.window :] if t is not None else pitch
            seg_pitchf = pitchf[:, t // self.window :] if t is not None else pitchf
        audio_opt.append(convert(audio_pad[t:], seg_pitch, seg_pitchf))

        audio_opt = np.concatenate(audio_opt)
        if volume_envelope != 1:
            audio_opt = AudioProcessor.change_rms(
                audio,
                self.sample_rate,
                audio_opt,
                self.tgt_sr,
                volume_envelope,
            )
        # Scale down only when the peak would exceed 0.99
        audio_max = np.abs(audio_opt).max() / 0.99
        if audio_max > 1:
            audio_opt /= audio_max
        if pitch_guidance:
            del pitch, pitchf
        del sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio_opt
581
+ t
rvc_logic/rvc/infer/typing_extra.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Extra type definitions for the `rvc_logic.rvc.infer` package."""
2
+
3
+ from typing import TypedDict
4
+
5
+
6
class ConvertAudioKwArgs(TypedDict, total=False):
    """Keyword arguments for the `convert_audio` function."""

    # Declared with total=False, so every key below is optional.

    # pre-processing arguments (formant shifting)
    formant_shifting: bool
    formant_qfrency: float
    formant_timbre: float
    # reverb post-processing arguments
    reverb: bool
    reverb_room_size: float
    reverb_damping: float
    reverb_wet_level: float
    reverb_dry_level: float
    reverb_width: float
    reverb_freeze_mode: int
    # pitch shift post-processing arguments
    pitch_shift: bool
    pitch_shift_semitones: int
    # limiter post-processing arguments
    limiter: bool
    limiter_threshold: float
    limiter_release: float
    # gain post-processing arguments
    gain: bool
    gain_db: int
    # distortion post-processing arguments
    distortion: bool
    distortion_gain: int
    # chorus post-processing arguments
    chorus: bool
    chorus_rate: float
    chorus_depth: float
    chorus_delay: int
    chorus_feedback: float
    chorus_mix: float
    # bitcrush post-processing arguments
    bitcrush: bool
    bitcrush_bit_depth: int
    # clipping post-processing arguments
    clipping: bool
    clipping_threshold: int
    # compressor post-processing arguments
    compressor: bool
    compressor_threshold: int
    compressor_ratio: int
    compressor_attack: float
    compressor_release: int
    # delay post-processing arguments
    delay: bool
    delay_seconds: float
    delay_feedback: float
    delay_mix: float
+ delay_mix: float
58
+ t
rvc_logic/rvc/lib/algorithm/__init__.py ADDED
File without changes
rvc_logic/rvc/lib/algorithm/attentions.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import torch
4
+
5
+ from rvc_logic.rvc.lib.algorithm.commons import convert_pad_shape
6
+
7
+
8
class MultiHeadAttention(torch.nn.Module):
    """
    Multi-head attention with optional relative positional encoding and proximal bias.

    Args:
        channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        n_heads (int): Number of attention heads.
        p_dropout (float, optional): Dropout probability. Defaults to 0.0.
        window_size (int, optional): Window size for relative positional encoding. Defaults to None.
        heads_share (bool, optional): Whether to share relative positional embeddings across heads. Defaults to True.
        block_length (int, optional): Block length for local attention. Defaults to None.
        proximal_bias (bool, optional): Whether to use proximal bias in self-attention. Defaults to False.
        proximal_init (bool, optional): Whether to initialize the key projection weights the same as query projection weights. Defaults to False.

    """

    def __init__(
        self,
        channels: int,
        out_channels: int,
        n_heads: int,
        p_dropout: float = 0.0,
        window_size: int = None,
        heads_share: bool = True,
        block_length: int = None,
        proximal_bias: bool = False,
        proximal_init: bool = False,
    ):
        super().__init__()
        assert (
            channels % n_heads == 0
        ), "Channels must be divisible by the number of heads."

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.k_channels = channels // n_heads
        self.window_size = window_size
        self.block_length = block_length
        self.proximal_bias = proximal_bias

        # 1x1 convolutions act as the query / key / value / output projections
        conv1x1 = torch.nn.Conv1d
        self.conv_q = conv1x1(channels, channels, 1)
        self.conv_k = conv1x1(channels, channels, 1)
        self.conv_v = conv1x1(channels, channels, 1)
        self.conv_o = conv1x1(channels, out_channels, 1)

        self.drop = torch.nn.Dropout(p_dropout)

        # Learned relative-position tables for keys and values
        if window_size:
            rel_shape = (
                1 if heads_share else n_heads,
                2 * window_size + 1,
                self.k_channels,
            )
            rel_stddev = self.k_channels**-0.5
            self.emb_rel_k = torch.nn.Parameter(torch.randn(*rel_shape) * rel_stddev)
            self.emb_rel_v = torch.nn.Parameter(torch.randn(*rel_shape) * rel_stddev)

        # Xavier initialisation of every projection, in declaration order
        for projection in (self.conv_q, self.conv_k, self.conv_v, self.conv_o):
            torch.nn.init.xavier_uniform_(projection.weight)

        if proximal_init:
            # Start the key projection as an exact copy of the query projection
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) over ``c`` (keys/values); the attention weights are kept on ``self.attn``."""
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        attended, self.attn = self.attention(q, k, v, mask=attn_mask)

        return self.conv_o(attended)

    def attention(self, query, key, value, mask=None):
        """Scaled dot-product attention; returns the attended tensor and the attention weights."""
        b, d, t_s = key.size()
        t_t = query.size(2)

        # Split the channel axis into heads and move time in front of the head channels
        head_shape = (b, self.n_heads, self.k_channels)
        query = query.view(*head_shape, t_t).transpose(2, 3)
        key = key.view(*head_shape, t_s).transpose(2, 3)
        value = value.view(*head_shape, t_s).transpose(2, 3)

        scale = math.sqrt(self.k_channels)
        scores = torch.matmul(query / scale, key.transpose(-2, -1))

        if self.window_size:
            assert t_s == t_t, "Relative attention only supports self-attention."
            scores = scores + self._compute_relative_scores(query, t_s)

        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias only supports self-attention."
            proximal = self._attention_bias_proximal(t_s)
            scores = scores + proximal.to(scores.device, scores.dtype)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            # The block-local band is only applied when a mask was supplied
            if self.block_length:
                band = torch.ones_like(scores)
                band = band.triu(-self.block_length).tril(self.block_length)
                scores = scores.masked_fill(band == 0, -1e4)

        p_attn = torch.nn.functional.softmax(scores, dim=-1)
        p_attn = self.drop(p_attn)

        output = torch.matmul(p_attn, value)

        if self.window_size:
            output = output + self._apply_relative_values(p_attn, t_s)

        # Merge the heads back into a single channel axis
        merged = output.transpose(2, 3).contiguous().view(b, d, t_t)
        return merged, p_attn

    def _compute_relative_scores(self, query, length):
        key_table = self._get_relative_embeddings(self.emb_rel_k, length)
        logits = self._matmul_with_relative_keys(
            query / math.sqrt(self.k_channels),
            key_table,
        )
        return self._relative_position_to_absolute_position(logits)

    def _apply_relative_values(self, p_attn, length):
        weights = self._absolute_position_to_relative_position(p_attn)
        value_table = self._get_relative_embeddings(self.emb_rel_v, length)
        return self._matmul_with_relative_values(weights, value_table)

    # Helper methods
    def _matmul_with_relative_values(self, x, y):
        return torch.matmul(x, y.unsqueeze(0))

    def _matmul_with_relative_keys(self, x, y):
        return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))

    def _get_relative_embeddings(self, embeddings, length):
        # Pad the table when the sequence is longer than the window covers,
        # otherwise slice out the part that is needed.
        reach = self.window_size + 1
        pad_length = max(length - reach, 0)
        start = max(reach - length, 0)
        end = start + 2 * length - 1

        if pad_length > 0:
            embeddings = torch.nn.functional.pad(
                embeddings,
                convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
            )
        return embeddings[:, start:end]

    def _relative_position_to_absolute_position(self, x):
        batch, heads, length, _ = x.size()
        padded = torch.nn.functional.pad(
            x,
            convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]),
        )
        flat = padded.view(batch, heads, length * 2 * length)
        flat = torch.nn.functional.pad(
            flat,
            convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]),
        )
        skewed = flat.view(batch, heads, length + 1, 2 * length - 1)
        return skewed[:, :, :length, length - 1 :]

    def _absolute_position_to_relative_position(self, x):
        batch, heads, length, _ = x.size()
        padded = torch.nn.functional.pad(
            x,
            convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]),
        )
        flat = padded.view(batch, heads, length**2 + length * (length - 1))
        flat = torch.nn.functional.pad(
            flat,
            convert_pad_shape([[0, 0], [0, 0], [length, 0]]),
        )
        skewed = flat.view(batch, heads, length, 2 * length)
        return skewed[:, :, :, 1:]

    def _attention_bias_proximal(self, length):
        # Bias of -log(1 + |i - j|): zero on the diagonal, more negative further away
        positions = torch.arange(length, dtype=torch.float32)
        distance = torch.abs(positions.unsqueeze(0) - positions.unsqueeze(1))
        return -torch.log1p(distance).unsqueeze(0).unsqueeze(0)
197
+
198
+
199
class FFN(torch.nn.Module):
    """
    Feed-forward network module.

    Two 1-D convolutions with an activation and dropout in between. The input
    is padded before each convolution so the output keeps the input length,
    for odd and even kernel sizes alike.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        filter_channels (int): Number of filter channels in the convolution layers.
        kernel_size (int): Kernel size of the convolution layers.
        p_dropout (float, optional): Dropout probability. Defaults to 0.0.
        activation (str, optional): Activation function to use; "gelu" selects a
            sigmoid-based GELU approximation, anything else ReLU. Defaults to None.
        causal (bool, optional): Whether to use causal padding in the convolution layers. Defaults to False.

    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        filter_channels: int,
        kernel_size: int,
        p_dropout: float = 0.0,
        activation: str = None,
        causal: bool = False,
    ):
        super().__init__()
        self.padding_fn = self._causal_padding if causal else self._same_padding

        self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = torch.nn.Dropout(p_dropout)

        self.activation = activation

    def forward(self, x, x_mask):
        """Apply the feed-forward block; ``x_mask`` is multiplied in before each convolution and on the output."""
        x = self.conv_1(self.padding_fn(x * x_mask))
        x = self._apply_activation(x)
        x = self.drop(x)
        x = self.conv_2(self.padding_fn(x * x_mask))
        return x * x_mask

    def _apply_activation(self, x):
        if self.activation == "gelu":
            # sigmoid approximation of GELU
            return x * torch.sigmoid(1.702 * x)
        return torch.relu(x)

    def _causal_padding(self, x):
        # All padding on the left of the last axis, so no future frames are seen
        pad_l = self.conv_1.kernel_size[0] - 1
        return torch.nn.functional.pad(x, (pad_l, 0))

    def _same_padding(self, x):
        # Split kernel_size - 1 between both sides; for even kernels the extra
        # element goes to the right. A symmetric (k - 1) // 2 on both sides
        # would make the output one frame shorter for even kernel sizes.
        kernel_size = self.conv_1.kernel_size[0]
        pad_l = (kernel_size - 1) // 2
        pad_r = kernel_size // 2
        return torch.nn.functional.pad(x, (pad_l, pad_r))
258
+ )
rvc_logic/rvc/lib/algorithm/commons.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import torch
4
+
5
+
6
def init_weights(m, mean=0.0, std=0.01):
    """
    Re-initialize the weights of convolution-like modules from a normal distribution.

    Modules whose class name does not contain "Conv" are left untouched.

    Args:
        m: The module to initialize.
        mean: The mean of the normal distribution.
        std: The standard deviation of the normal distribution.

    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
19
+
20
+
21
def get_padding(kernel_size, dilation=1):
    """
    Compute the per-side padding for a (dilated) convolution.

    Args:
        kernel_size: The size of the kernel.
        dilation: The dilation of the convolution.

    """
    span = kernel_size * dilation - dilation
    return int(span / 2)
31
+
32
+
33
def convert_pad_shape(pad_shape):
    """
    Flatten a per-dimension pad specification, last dimension first.

    Args:
        pad_shape: A sequence of per-dimension pad entries.

    Returns:
        A flat list of the pad values with the dimension order reversed.

    """
    flattened = []
    for entry in pad_shape[::-1]:
        flattened.extend(entry)
    return flattened
44
+
45
+
46
def slice_segments(
    x: torch.Tensor,
    ids_str: torch.Tensor,
    segment_size: int = 4,
    dim: int = 2,
):
    """
    Slice segments from a tensor, handling tensors with different numbers of dimensions.

    For every batch element ``i`` a window of ``segment_size`` items starting at
    ``ids_str[i]`` is taken from the last axis.

    Args:
        x (torch.Tensor): The tensor to slice.
        ids_str (torch.Tensor): The starting indices of the segments.
        segment_size (int, optional): The size of each segment. Defaults to 4.
        dim (int, optional): The number of dimensions of ``x`` (2 or 3). Defaults to 2.

    Returns:
        torch.Tensor: The stacked segments, one per batch element.

    Raises:
        ValueError: If ``dim`` is neither 2 nor 3.

    """
    if dim == 2:
        ret = torch.zeros_like(x[:, :segment_size])
    elif dim == 3:
        ret = torch.zeros_like(x[:, :, :segment_size])
    else:
        # Without this branch `ret` stayed unbound and failed later on
        raise ValueError(f"dim must be 2 or 3, got {dim}")

    for i in range(x.size(0)):
        idx_str = ids_str[i].item()
        idx_end = idx_str + segment_size
        if dim == 2:
            ret[i] = x[i, idx_str:idx_end]
        else:
            ret[i] = x[i, :, idx_str:idx_end]

    return ret
76
+
77
+
78
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """
    Cut one randomly positioned segment out of every batch element.

    Args:
        x: The tensor to slice.
        x_lengths: The lengths of the sequences; the full time axis of ``x``
            is used when omitted.
        segment_size: The size of each segment.

    Returns:
        A tuple of the sliced segments and the start indices that were drawn.

    """
    batch_size, _, total_len = x.size()
    lengths = total_len if x_lengths is None else x_lengths
    # Number of valid start positions per sequence
    max_start = lengths - segment_size + 1
    draws = torch.rand([batch_size], device=x.device)
    ids_str = (draws * max_start).to(dtype=torch.long)
    return slice_segments(x, ids_str, segment_size, dim=3), ids_str
95
+
96
+
97
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """
    Gated activation: add both inputs, then multiply tanh of the first
    channel group with sigmoid of the remaining channels.

    Args:
        input_a: The first input tensor.
        input_b: The second input tensor.
        n_channels: Tensor whose first element is the number of channels fed to tanh.

    """
    split = n_channels[0]
    summed = input_a + input_b
    tanh_part = torch.tanh(summed[:, :split, :])
    sigmoid_part = torch.sigmoid(summed[:, split:, :])
    return tanh_part * sigmoid_part
114
+
115
+
116
+ def sequence_mask(length: torch.Tensor, max_length: int | None = None):
117
+ """
118
+ Generate a sequence mask.
119
+
120
+ Args:
121
+ length: The lengths of the sequences.
122
+ max_length: The maximum length of the sequences.
123
+
124
+ """
125
+ if max_length is None:
126
+ max_length = length.max()
127
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
128
+ return x.unsqueeze(0) < length.unsqueeze(1)
129
+
130
+
131
def grad_norm(parameters, norm_type: float = 2.0):
    """
    Compute the overall norm of the gradients of the given parameters.

    Args:
        parameters: A tensor or an iterable of tensors; entries without a
            gradient are ignored.
        norm_type: The type of norm to use.

    Returns:
        The norm as a Python float, or 0.0 when no parameter has a gradient.

    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]

    with_grad = [p for p in parameters if p.grad is not None]
    if len(with_grad) == 0:
        return 0.0

    # Norm of the per-parameter gradient norms
    per_param = torch.stack([p.grad.norm(norm_type) for p in with_grad])
    total = torch.linalg.vector_norm(per_param, ord=norm_type)
    return total.item()
rvc_logic/rvc/lib/algorithm/discriminators.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch.nn.utils.parametrizations import spectral_norm, weight_norm
4
+ from torch.utils.checkpoint import checkpoint
5
+
6
+ from rvc_logic.rvc.lib.algorithm.commons import get_padding
7
+ from rvc_logic.rvc.lib.algorithm.residuals import LRELU_SLOPE
8
+
9
+
10
+ class MultiPeriodDiscriminator(torch.nn.Module):
11
+ """
12
+ Multi-period discriminator.
13
+
14
+ This class implements a multi-period discriminator, which is used to
15
+ discriminate between real and fake audio signals. The discriminator
16
+ is composed of a series of convolutional layers that are applied to
17
+ the input signal at different periods.
18
+
19
+ Args:
20
+ use_spectral_norm (bool): Whether to use spectral normalization.
21
+ Defaults to False.
22
+
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ use_spectral_norm: bool = False,
28
+ checkpointing: bool = False,
29
+ version: str = "v2",
30
+ ):
31
+ super().__init__()
32
+
33
+ if version == "v1":
34
+ periods = [2, 3, 5, 7, 11, 17]
35
+ resolutions = []
36
+ elif version == "v2":
37
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
38
+ resolutions = []
39
+ elif version == "v3":
40
+ periods = [2, 3, 5, 7, 11]
41
+ resolutions = [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]]
42
+
43
+ self.checkpointing = checkpointing
44
+ self.discriminators = torch.nn.ModuleList(
45
+ [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
46
+ + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods]
47
+ + [
48
+ DiscriminatorR(r, use_spectral_norm=use_spectral_norm)
49
+ for r in resolutions
50
+ ],
51
+ )
52
+
53
+ def forward(self, y, y_hat):
54
+ y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
55
+ for d in self.discriminators:
56
+ if self.training and self.checkpointing:
57
+ y_d_r, fmap_r = checkpoint(d, y, use_reentrant=False)
58
+ y_d_g, fmap_g = checkpoint(d, y_hat, use_reentrant=False)
59
+ else:
60
+ y_d_r, fmap_r = d(y)
61
+ y_d_g, fmap_g = d(y_hat)
62
+ y_d_rs.append(y_d_r)
63
+ y_d_gs.append(y_d_g)
64
+ fmap_rs.append(fmap_r)
65
+ fmap_gs.append(fmap_g)
66
+
67
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
68
+
69
+
70
class DiscriminatorS(torch.nn.Module):
    """
    Discriminator built from a stack of 1-D convolutions.

    The input passes through six normalized ``Conv1d`` layers, each followed
    by a LeakyReLU, and then through a final single-channel ``Conv1d``. The
    output of every layer is collected as a feature map.

    Args:
        use_spectral_norm (bool): Wrap the convolutions with spectral
            normalization instead of weight normalization. Defaults to False.

    """

    def __init__(self, use_spectral_norm: bool = False):
        super().__init__()

        wrap = spectral_norm if use_spectral_norm else weight_norm
        # (in_channels, out_channels, kernel_size, stride, groups, padding)
        layer_specs = (
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        )
        self.convs = torch.nn.ModuleList(
            [
                wrap(
                    torch.nn.Conv1d(
                        c_in,
                        c_out,
                        kernel,
                        step,
                        groups=groups,
                        padding=pad,
                    ),
                )
                for c_in, c_out, kernel, step, groups, pad in layer_specs
            ],
        )
        self.conv_post = wrap(torch.nn.Conv1d(1024, 1, 3, 1, padding=1))
        self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)

    def forward(self, x):
        """Return the flattened score and the list of per-layer feature maps."""
        feature_maps = []
        for layer in self.convs:
            x = self.lrelu(layer(x))
            feature_maps.append(x)
        x = self.conv_post(x)
        feature_maps.append(x)
        return torch.flatten(x, 1, -1), feature_maps
105
+
106
+
107
class DiscriminatorP(torch.nn.Module):
    """
    Discriminator that views the signal folded by a fixed period.

    The input of shape ``(batch, channels, time)`` is padded so that its
    length is a multiple of ``period``, reshaped to
    ``(batch, channels, time // period, period)`` and passed through a stack
    of 2-D convolutions whose kernels span only the folded time axis.

    Args:
        period (int): Period used to fold the time axis.
        kernel_size (int): Kernel size along the folded time axis for the
            five main convolutions. Defaults to 5.
        stride (int): Accepted for interface compatibility; this value is not
            read, the layer strides are fixed at ``(3, 3, 3, 3, 1)``.
            Defaults to 3.
        use_spectral_norm (bool): Whether to use spectral normalization
            instead of weight normalization. Defaults to False.

    """

    def __init__(
        self,
        period: int,
        kernel_size: int = 5,
        stride: int = 3,
        use_spectral_norm: bool = False,
    ):
        super().__init__()
        self.period = period
        wrap = spectral_norm if use_spectral_norm else weight_norm

        # (in_channels, out_channels, stride along the folded time axis)
        layer_specs = (
            (1, 32, 3),
            (32, 128, 3),
            (128, 512, 3),
            (512, 1024, 3),
            (1024, 1024, 1),
        )
        time_padding = get_padding(kernel_size, 1)

        self.convs = torch.nn.ModuleList(
            [
                wrap(
                    torch.nn.Conv2d(
                        c_in,
                        c_out,
                        (kernel_size, 1),
                        (step, 1),
                        padding=(time_padding, 0),
                    ),
                )
                for c_in, c_out, step in layer_specs
            ],
        )

        self.conv_post = wrap(torch.nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
        self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)

    def forward(self, x):
        """Return the flattened score and the list of per-layer feature maps."""
        feature_maps = []
        batch, channels, length = x.shape

        # Right-pad (reflect) so the time axis folds evenly into the period.
        remainder = length % self.period
        if remainder != 0:
            x = torch.nn.functional.pad(x, (0, self.period - remainder), "reflect")
        x = x.view(batch, channels, -1, self.period)

        for layer in self.convs:
            x = self.lrelu(layer(x))
            feature_maps.append(x)
        x = self.conv_post(x)
        feature_maps.append(x)
        return torch.flatten(x, 1, -1), feature_maps
174
+
175
+
176
+ class DiscriminatorR(torch.nn.Module):
177
+ def __init__(self, resolution, use_spectral_norm=False):
178
+ super().__init__()
179
+
180
+ self.resolution = resolution
181
+ self.lrelu_slope = 0.1
182
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
183
+
184
+ self.convs = torch.nn.ModuleList(
185
+ [
186
+ norm_f(
187
+ torch.nn.Conv2d(
188
+ 1,
189
+ 32,
190
+ (3, 9),
191
+ padding=(1, 4),
192
+ )
193
+ ),
194
+ norm_f(
195
+ torch.nn.Conv2d(
196
+ 32,
197
+ 32,
198
+ (3, 9),
199
+ stride=(1, 2),
200
+ padding=(1, 4),
201
+ )
202
+ ),
203
+ norm_f(
204
+ torch.nn.Conv2d(
205
+ 32,
206
+ 32,
207
+ (3, 9),
208
+ stride=(1, 2),
209
+ padding=(1, 4),
210
+ )
211
+ ),
212
+ norm_f(
213
+ torch.nn.Conv2d(
214
+ 32,
215
+ 32,
216
+ (3, 9),
217
+ stride=(1, 2),
218
+ padding=(1, 4),
219
+ )
220
+ ),
221
+ norm_f(
222
+ torch.nn.Conv2d(
223
+ 32,
224
+ 32,
225
+ (3, 3),
226
+ padding=(1, 1),
227
+ )
228
+ ),
229
+ ]
230
+ )
231
+ self.conv_post = norm_f(torch.nn.Conv2d(32, 1, (3, 3), padding=(1, 1)))
232
+
233
+ def forward(self, x):
234
+ fmap = []
235
+
236
+ x = self.spectrogram(x).unsqueeze(1)
237
+
238
+ for layer in self.convs:
239
+ x = F.leaky_relu(layer(x), self.lrelu_slope)
240
+ fmap.append(x)
241
+ x = self.conv_post(x)
242
+ fmap.append(x)
243
+
244
+ return torch.flatten(x, 1, -1), fmap
245
+
246
+ def spectrogram(self, x):
247
+ n_fft, hop_length, win_length = self.resolution
248
+ pad = int((n_fft - hop_length) / 2)
249
+ x = F.pad(
250
+ x,
251
+ (pad, pad),
252
+ mode="reflect",
253
+ ).squeeze(1)
254
+ x = torch.stft(
255
+ x,
256
+ n_fft=n_fft,
257
+ hop_length=hop_length,
258
+ win_length=win_length,
259
+ window=torch.ones(win_length, device=x.device),
260
+ center=False,
261
+ return_complex=True,
262
+ )
263
+
264
+ mag = torch.norm(torch.view_as_real(x), p=2, dim=-1) # [B, F, TT]
265
+
266
+ return mag
267
+ mag
rvc_logic/rvc/lib/algorithm/encoders.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import math
3
+
4
+ import torch
5
+
6
+ from rvc_logic.rvc.lib.algorithm.attentions import FFN, MultiHeadAttention
7
+ from rvc_logic.rvc.lib.algorithm.commons import sequence_mask
8
+ from rvc_logic.rvc.lib.algorithm.modules import WaveNet
9
+ from rvc_logic.rvc.lib.algorithm.normalization import LayerNorm
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class Encoder(torch.nn.Module):
    """
    Stack of self-attention and feed-forward layers with residual LayerNorm.

    Args:
        hidden_channels (int): Number of hidden channels in the encoder.
        filter_channels (int): Number of filter channels in the feed-forward network.
        n_heads (int): Number of attention heads.
        n_layers (int): Number of encoder layers.
        kernel_size (int, optional): Kernel size passed to the feed-forward network. Defaults to 1.
        p_dropout (float, optional): Dropout probability. Defaults to 0.0.
        window_size (int, optional): Window size passed to the attention layers. Defaults to 10.

    """

    def __init__(
        self,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int = 1,
        p_dropout: float = 0.0,
        window_size: int = 10,
    ):
        super().__init__()

        self.hidden_channels = hidden_channels
        self.n_layers = n_layers
        self.drop = torch.nn.Dropout(p_dropout)

        attention_blocks = [
            MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                window_size=window_size,
            )
            for _ in range(n_layers)
        ]
        self.attn_layers = torch.nn.ModuleList(attention_blocks)

        post_attention_norms = [LayerNorm(hidden_channels) for _ in range(n_layers)]
        self.norm_layers_1 = torch.nn.ModuleList(post_attention_norms)

        feed_forward_blocks = [
            FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
            )
            for _ in range(n_layers)
        ]
        self.ffn_layers = torch.nn.ModuleList(feed_forward_blocks)

        post_ffn_norms = [LayerNorm(hidden_channels) for _ in range(n_layers)]
        self.norm_layers_2 = torch.nn.ModuleList(post_ffn_norms)

    def forward(self, x, x_mask):
        """Apply every encoder layer to ``x`` and return the masked result."""
        # Pairwise mask: the outer product of the mask with itself.
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask

        layers = zip(
            self.attn_layers,
            self.norm_layers_1,
            self.ffn_layers,
            self.norm_layers_2,
        )
        for attention, norm_attention, feed_forward, norm_ffn in layers:
            residual = self.drop(attention(x, x, attn_mask))
            x = norm_attention(x + residual)

            residual = self.drop(feed_forward(x, x_mask))
            x = norm_ffn(x + residual)

        return x * x_mask
90
+
91
+
92
class TextEncoder(torch.nn.Module):
    """
    Encoder that maps phone features (and optional pitch ids) to statistics.

    Args:
        out_channels (int): Output channels of the encoder.
        hidden_channels (int): Hidden channels of the encoder.
        filter_channels (int): Filter channels of the encoder.
        n_heads (int): Number of attention heads.
        n_layers (int): Number of encoder layers.
        kernel_size (int): Kernel size of the convolutional layers.
        p_dropout (float): Dropout probability.
        embedding_dim (int): Size of the incoming phone features (v1 = 256, v2 = 768).
        f0 (bool, optional): Whether to create the pitch embedding. Defaults to True.

    """

    def __init__(
        self,
        out_channels: int,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int,
        p_dropout: float,
        embedding_dim: int,
        f0: bool = True,
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.emb_phone = torch.nn.Linear(embedding_dim, hidden_channels)
        self.lrelu = torch.nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = torch.nn.Embedding(256, hidden_channels)
        else:
            self.emb_pitch = None
        logger.info("hidden_channels: %d", hidden_channels)

        self.encoder = Encoder(
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size=kernel_size,
            p_dropout=p_dropout,
        )
        # Twice the output channels: split into two tensors in forward().
        self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(
        self,
        phone: torch.Tensor,
        pitch: torch.Tensor | None,
        lengths: torch.Tensor,
    ):
        """
        Encode phone features and return ``(m, logs, x_mask)``.

        ``m`` and ``logs`` are the two halves of the projected encoder output
        along dimension 1; ``x_mask`` is the sequence mask built from
        ``lengths``.

        """
        hidden = self.emb_phone(phone)
        if pitch is not None and self.emb_pitch is not None:
            hidden += self.emb_pitch(pitch)

        hidden *= math.sqrt(self.hidden_channels)
        hidden = self.lrelu(hidden)
        hidden = hidden.transpose(1, -1)  # [B, H, T]

        x_mask = sequence_mask(lengths, hidden.size(2)).unsqueeze(1).to(hidden.dtype)
        encoded = self.encoder(hidden, x_mask)
        stats = self.proj(encoded) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
159
+
160
+
161
class PosteriorEncoder(torch.nn.Module):
    """
    Posterior encoder that samples a latent from projected statistics.

    Args:
        in_channels (int): Number of channels in the input.
        out_channels (int): Number of channels in the output.
        hidden_channels (int): Number of hidden channels in the encoder.
        kernel_size (int): Kernel size passed to the WaveNet stack.
        dilation_rate (int): Dilation rate passed to the WaveNet stack.
        n_layers (int): Number of layers in the WaveNet stack.
        gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.

    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        hidden_channels: int,
        kernel_size: int,
        dilation_rate: int,
        n_layers: int,
        gin_channels: int = 0,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.pre = torch.nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = WaveNet(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        # Twice the output channels: split into mean and log-scale in forward().
        self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(
        self,
        x: torch.Tensor,
        x_lengths: torch.Tensor,
        g: torch.Tensor | None = None,
    ):
        """
        Encode ``x`` and return ``(z, m, logs, x_mask)``.

        ``z`` is drawn as ``m + noise * exp(logs)`` with standard normal
        noise and then masked.

        """
        x_mask = sequence_mask(x_lengths, x.size(2)).unsqueeze(1).to(x.dtype)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)

        stats = self.proj(hidden) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)

        noise = torch.randn_like(m)
        z = (m + noise * torch.exp(logs)) * x_mask

        return z, m, logs, x_mask

    def remove_weight_norm(self):
        """Delegate weight-norm removal to the WaveNet stack."""
        self.enc.remove_weight_norm()

    def __prepare_scriptable__(self):
        """Remove weight norm from ``self.enc`` when a matching hook is found."""
        for hook in self.enc._forward_pre_hooks.values():
            is_weight_norm_hook = (
                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                and hook.__class__.__name__ == "WeightNorm"
            )
            if is_weight_norm_hook:
                torch.nn.utils.remove_weight_norm(self.enc)
        return self
rvc_logic/rvc/lib/algorithm/generators/__init__.py ADDED
File without changes
rvc_logic/rvc/lib/algorithm/generators/hifigan.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import numpy as np
4
+
5
+ import torch
6
+ from torch.nn.utils import remove_weight_norm
7
+ from torch.nn.utils.parametrizations import weight_norm
8
+
9
+ from rvc_logic.rvc.lib.algorithm.commons import init_weights
10
+ from rvc_logic.rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock
11
+
12
+
13
+ class HiFiGANGenerator(torch.nn.Module):
14
+ """
15
+ HiFi-GAN Generator module for audio synthesis.
16
+
17
+ This module implements the generator part of the HiFi-GAN architecture,
18
+ which uses transposed convolutions for upsampling and residual blocks for
19
+ refining the audio output. It can also incorporate global conditioning.
20
+
21
+ Args:
22
+ initial_channel (int): Number of input channels to the initial convolutional layer.
23
+ resblock_kernel_sizes (list): List of kernel sizes for the residual blocks.
24
+ resblock_dilation_sizes (list): List of lists of dilation rates for the residual blocks, corresponding to each kernel size.
25
+ upsample_rates (list): List of upsampling factors for each upsampling layer.
26
+ upsample_initial_channel (int): Number of output channels from the initial convolutional layer, which is also the input to the first upsampling layer.
27
+ upsample_kernel_sizes (list): List of kernel sizes for the transposed convolutional layers used for upsampling.
28
+ gin_channels (int, optional): Number of input channels for the global conditioning. If 0, no global conditioning is used. Defaults to 0.
29
+
30
+ """
31
+
32
+ def __init__(
33
+ self,
34
+ initial_channel: int,
35
+ resblock_kernel_sizes: list,
36
+ resblock_dilation_sizes: list,
37
+ upsample_rates: list,
38
+ upsample_initial_channel: int,
39
+ upsample_kernel_sizes: list,
40
+ gin_channels: int = 0,
41
+ ):
42
+ super().__init__()
43
+ self.num_kernels = len(resblock_kernel_sizes)
44
+ self.num_upsamples = len(upsample_rates)
45
+ self.conv_pre = torch.nn.Conv1d(
46
+ initial_channel,
47
+ upsample_initial_channel,
48
+ 7,
49
+ 1,
50
+ padding=3,
51
+ )
52
+
53
+ self.ups = torch.nn.ModuleList()
54
+ self.resblocks = torch.nn.ModuleList()
55
+
56
+ for i, (u, k) in enumerate(
57
+ zip(upsample_rates, upsample_kernel_sizes, strict=False),
58
+ ):
59
+ self.ups.append(
60
+ weight_norm(
61
+ torch.nn.ConvTranspose1d(
62
+ upsample_initial_channel // (2**i),
63
+ upsample_initial_channel // (2 ** (i + 1)),
64
+ k,
65
+ u,
66
+ padding=(k - u) // 2,
67
+ ),
68
+ ),
69
+ )
70
+ ch = upsample_initial_channel // (2 ** (i + 1))
71
+ for j, (k, d) in enumerate(
72
+ zip(resblock_kernel_sizes, resblock_dilation_sizes, strict=False),
73
+ ):
74
+ self.resblocks.append(ResBlock(ch, k, d))
75
+
76
+ self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
77
+ self.ups.apply(init_weights)
78
+
79
+ if gin_channels != 0:
80
+ self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)
81
+
82
+ def forward(self, x: torch.Tensor, g: torch.Tensor | None = None):
83
+ # new tensor
84
+ x = self.conv_pre(x)
85
+
86
+ if g is not None:
87
+ x = x + self.cond(g)
88
+
89
+ for i in range(self.num_upsamples):
90
+ x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
91
+ x = self.ups[i](x)
92
+ xs = None
93
+ for j in range(self.num_kernels):
94
+ if xs is None:
95
+ xs = self.resblocks[i * self.num_kernels + j](x)
96
+ else:
97
+ xs += self.resblocks[i * self.num_kernels + j](x)
98
+ x = xs / self.num_kernels
99
+ # in-place call
100
+ x = torch.nn.functional.leaky_relu(x)
101
+ x = self.conv_post(x)
102
+ # in-place call
103
+ x = torch.tanh(x)
104
+
105
+ return x
106
+
107
+ def __prepare_scriptable__(self):
108
+ for l in self.ups_and_resblocks:
109
+ for hook in l._forward_pre_hooks.values():
110
+ if (
111
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
112
+ and hook.__class__.__name__ == "WeightNorm"
113
+ ):
114
+ torch.nn.utils.remove_weight_norm(l)
115
+ return self
116
+
117
+ def remove_weight_norm(self):
118
+ for l in self.ups:
119
+ remove_weight_norm(l)
120
+ for l in self.resblocks:
121
+ l.remove_weight_norm()
122
+
123
+
124
+ class SineGenerator(torch.nn.Module):
125
+ """
126
+ Sine wave generator with optional harmonic overtones and noise.
127
+
128
+ This module generates sine waves for a fundamental frequency and its harmonics.
129
+ It can also add Gaussian noise and apply a voiced/unvoiced mask.
130
+
131
+ Args:
132
+ sampling_rate (int): The sampling rate of the audio in Hz.
133
+ num_harmonics (int, optional): The number of harmonic overtones to generate. Defaults to 0.
134
+ sine_amplitude (float, optional): The amplitude of the sine wave components. Defaults to 0.1.
135
+ noise_stddev (float, optional): The standard deviation of the additive Gaussian noise. Defaults to 0.003.
136
+ voiced_threshold (float, optional): The threshold for the fundamental frequency (F0) to determine if a frame is voiced. Defaults to 0.0.
137
+
138
+ """
139
+
140
+ def __init__(
141
+ self,
142
+ sampling_rate: int,
143
+ num_harmonics: int = 0,
144
+ sine_amplitude: float = 0.1,
145
+ noise_stddev: float = 0.003,
146
+ voiced_threshold: float = 0.0,
147
+ ):
148
+ super().__init__()
149
+ self.sampling_rate = sampling_rate
150
+ self.num_harmonics = num_harmonics
151
+ self.sine_amplitude = sine_amplitude
152
+ self.noise_stddev = noise_stddev
153
+ self.voiced_threshold = voiced_threshold
154
+ self.waveform_dim = self.num_harmonics + 1 # fundamental + harmonics
155
+
156
+ def _compute_voiced_unvoiced(self, f0: torch.Tensor):
157
+ """
158
+ Generates a binary mask indicating voiced/unvoiced frames based on the fundamental frequency.
159
+
160
+ Args:
161
+ f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length).
162
+
163
+ """
164
+ uv_mask = (f0 > self.voiced_threshold).float()
165
+ return uv_mask
166
+
167
+ def _generate_sine_wave(self, f0: torch.Tensor, upsampling_factor: int):
168
+ """
169
+ Generates sine waves for the fundamental frequency and its harmonics.
170
+
171
+ Args:
172
+ f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length, 1).
173
+ upsampling_factor (int): The factor by which to upsample the sine wave.
174
+
175
+ """
176
+ batch_size, length, _ = f0.shape
177
+
178
+ # Create an upsampling grid
179
+ upsampling_grid = torch.arange(
180
+ 1,
181
+ upsampling_factor + 1,
182
+ dtype=f0.dtype,
183
+ device=f0.device,
184
+ )
185
+
186
+ # Calculate phase increments
187
+ phase_increments = (f0 / self.sampling_rate) * upsampling_grid
188
+ phase_remainder = torch.fmod(phase_increments[:, :-1, -1:] + 0.5, 1.0) - 0.5
189
+ cumulative_phase = phase_remainder.cumsum(dim=1).fmod(1.0).to(f0.dtype)
190
+ phase_increments += torch.nn.functional.pad(
191
+ cumulative_phase,
192
+ (0, 0, 1, 0),
193
+ mode="constant",
194
+ )
195
+
196
+ # Reshape to match the sine wave shape
197
+ phase_increments = phase_increments.reshape(batch_size, -1, 1)
198
+
199
+ # Scale for harmonics
200
+ harmonic_scale = torch.arange(
201
+ 1,
202
+ self.waveform_dim + 1,
203
+ dtype=f0.dtype,
204
+ device=f0.device,
205
+ ).reshape(1, 1, -1)
206
+ phase_increments *= harmonic_scale
207
+
208
+ # Add random phase offset (except for the fundamental)
209
+ random_phase = torch.rand(1, 1, self.waveform_dim, device=f0.device)
210
+ random_phase[..., 0] = 0 # Fundamental frequency has no random offset
211
+ phase_increments += random_phase
212
+
213
+ # Generate sine waves
214
+ sine_waves = torch.sin(2 * np.pi * phase_increments)
215
+ return sine_waves
216
+
217
+ def forward(self, f0: torch.Tensor, upsampling_factor: int):
218
+ with torch.no_grad():
219
+ # Expand `f0` to include waveform dimensions
220
+ f0 = f0.unsqueeze(-1)
221
+
222
+ # Generate sine waves
223
+ sine_waves = (
224
+ self._generate_sine_wave(f0, upsampling_factor) * self.sine_amplitude
225
+ )
226
+
227
+ # Compute voiced/unvoiced mask
228
+ voiced_mask = self._compute_voiced_unvoiced(f0)
229
+
230
+ # Upsample voiced/unvoiced mask
231
+ voiced_mask = torch.nn.functional.interpolate(
232
+ voiced_mask.transpose(2, 1),
233
+ scale_factor=float(upsampling_factor),
234
+ mode="nearest",
235
+ ).transpose(2, 1)
236
+
237
+ # Compute noise amplitude
238
+ noise_amplitude = voiced_mask * self.noise_stddev + (1 - voiced_mask) * (
239
+ self.sine_amplitude / 3
240
+ )
241
+
242
+ # Add Gaussian noise
243
+ noise = noise_amplitude * torch.randn_like(sine_waves)
244
+
245
+ # Combine sine waves and noise
246
+ sine_waveforms = sine_waves * voiced_mask + noise
247
+
248
+ return sine_waveforms, voiced_mask, noise
249
+ oise
rvc_logic/rvc/lib/algorithm/generators/hifigan_mrf.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import math
4
+
5
+ import numpy as np
6
+
7
+ import torch
8
+ from torch.nn.utils import remove_weight_norm
9
+ from torch.nn.utils.parametrizations import weight_norm
10
+ from torch.utils.checkpoint import checkpoint
11
+
12
+ LRELU_SLOPE = 0.1
13
+
14
+
15
+ class MRFLayer(torch.nn.Module):
16
+ """
17
+ A single layer of the Multi-Receptive Field (MRF) block.
18
+
19
+ This layer consists of two 1D convolutional layers with weight normalization
20
+ and Leaky ReLU activation in between. The first convolution has a dilation,
21
+ while the second has a dilation of 1. A skip connection is added from the input
22
+ to the output.
23
+
24
+ Args:
25
+ channels (int): The number of input and output channels.
26
+ kernel_size (int): The kernel size of the convolutional layers.
27
+ dilation (int): The dilation rate for the first convolutional layer.
28
+
29
+ """
30
+
31
+ def __init__(self, channels, kernel_size, dilation):
32
+ super().__init__()
33
+ self.conv1 = weight_norm(
34
+ torch.nn.Conv1d(
35
+ channels,
36
+ channels,
37
+ kernel_size,
38
+ padding=(kernel_size * dilation - dilation) // 2,
39
+ dilation=dilation,
40
+ ),
41
+ )
42
+ self.conv2 = weight_norm(
43
+ torch.nn.Conv1d(
44
+ channels,
45
+ channels,
46
+ kernel_size,
47
+ padding=kernel_size // 2,
48
+ dilation=1,
49
+ ),
50
+ )
51
+
52
+ def forward(self, x: torch.Tensor):
53
+ y = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
54
+ y = self.conv1(y)
55
+ y = torch.nn.functional.leaky_relu(y, LRELU_SLOPE)
56
+ y = self.conv2(y)
57
+ return x + y
58
+
59
+ def remove_weight_norm(self):
60
+ remove_weight_norm(self.conv1)
61
+ remove_weight_norm(self.conv2)
62
+
63
+
64
class MRFBlock(torch.nn.Module):
    """
    A Multi-Receptive Field (MRF) block.

    Chains one ``MRFLayer`` per dilation rate; the input flows through the
    layers in the order the dilations are given.

    Args:
        channels (int): The number of input and output channels for the MRFLayers.
        kernel_size (int): The kernel size for the convolutional layers in the MRFLayers.
        dilations (list[int]): A list of dilation rates for the MRFLayers.

    """

    def __init__(self, channels, kernel_size, dilations):
        super().__init__()
        self.layers = torch.nn.ModuleList(
            [MRFLayer(channels, kernel_size, rate) for rate in dilations],
        )

    def forward(self, x: torch.Tensor):
        """Pass ``x`` through every layer in sequence."""
        out = x
        for mrf_layer in self.layers:
            out = mrf_layer(out)
        return out

    def remove_weight_norm(self):
        """Remove weight normalization from every contained layer."""
        for mrf_layer in self.layers:
            mrf_layer.remove_weight_norm()
92
+
93
+
94
+ class SineGenerator(torch.nn.Module):
95
+ """
96
+ Definition of sine generator
97
+
98
+ Generates sine waveforms with optional harmonics and additive noise.
99
+ Can be used to create harmonic noise source for neural vocoders.
100
+
101
+ Args:
102
+ samp_rate (int): Sampling rate in Hz.
103
+ harmonic_num (int): Number of harmonic overtones (default 0).
104
+ sine_amp (float): Amplitude of sine-waveform (default 0.1).
105
+ noise_std (float): Standard deviation of Gaussian noise (default 0.003).
106
+ voiced_threshold (float): F0 threshold for voiced/unvoiced classification (default 0).
107
+
108
+ """
109
+
110
+ def __init__(
111
+ self,
112
+ samp_rate: int,
113
+ harmonic_num: int = 0,
114
+ sine_amp: float = 0.1,
115
+ noise_std: float = 0.003,
116
+ voiced_threshold: float = 0,
117
+ ):
118
+ super().__init__()
119
+ self.sine_amp = sine_amp
120
+ self.noise_std = noise_std
121
+ self.harmonic_num = harmonic_num
122
+ self.dim = self.harmonic_num + 1
123
+ self.sampling_rate = samp_rate
124
+ self.voiced_threshold = voiced_threshold
125
+
126
+ def _f02uv(self, f0: torch.Tensor):
127
+ """
128
+ Generates voiced/unvoiced (UV) signal based on the fundamental frequency (F0).
129
+
130
+ Args:
131
+ f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length, 1).
132
+
133
+ """
134
+ # generate uv signal
135
+ uv = torch.ones_like(f0)
136
+ uv = uv * (f0 > self.voiced_threshold)
137
+ return uv
138
+
139
+ def _f02sine(self, f0_values: torch.Tensor):
140
+ """
141
+ Generates sine waveforms based on the fundamental frequency (F0) and its harmonics.
142
+
143
+ Args:
144
+ f0_values (torch.Tensor): Tensor of fundamental frequency and its harmonics,
145
+ shape (batch_size, length, dim), where dim indicates
146
+ the fundamental tone and overtones.
147
+
148
+ """
149
+ # convert to F0 in rad. The integer part n can be ignored
150
+ # because 2 * np.pi * n doesn't affect phase
151
+ rad_values = (f0_values / self.sampling_rate) % 1
152
+
153
+ # initial phase noise (no noise for fundamental component)
154
+ rand_ini = torch.rand(
155
+ f0_values.shape[0],
156
+ f0_values.shape[2],
157
+ device=f0_values.device,
158
+ )
159
+ rand_ini[:, 0] = 0
160
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
161
+
162
+ # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
163
+ tmp_over_one = torch.cumsum(rad_values, 1) % 1
164
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
165
+ cumsum_shift = torch.zeros_like(rad_values)
166
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
167
+
168
+ sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
169
+
170
+ return sines
171
+
172
+ def forward(self, f0: torch.Tensor):
173
+ with torch.no_grad():
174
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
175
+ # fundamental component
176
+ f0_buf[:, :, 0] = f0[:, :, 0]
177
+ for idx in np.arange(self.harmonic_num):
178
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)
179
+
180
+ sine_waves = self._f02sine(f0_buf) * self.sine_amp
181
+
182
+ uv = self._f02uv(f0)
183
+
184
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
185
+ noise = noise_amp * torch.randn_like(sine_waves)
186
+
187
+ sine_waves = sine_waves * uv + noise
188
+ return sine_waves, uv, noise
189
+
190
+
191
class SourceModuleHnNSF(torch.nn.Module):
    """
    Source module that merges sine harmonics into one excitation signal.

    A ``SineGenerator`` produces one sine waveform per harmonic; a linear
    layer followed by ``tanh`` collapses them into a single channel.

    Args:
        sampling_rate (int): Sampling rate in Hz.
        harmonic_num (int, optional): Number of harmonics above F0. Defaults to 0.
        sine_amp (float, optional): Amplitude of sine source signal. Defaults to 0.1.
        add_noise_std (float, optional): Standard deviation of additive Gaussian noise. Defaults to 0.003.
        voiced_threshold (float, optional): Threshold to set voiced/unvoiced given F0. Defaults to 0.

    """

    def __init__(
        self,
        sampling_rate: int,
        harmonic_num: int = 0,
        sine_amp: float = 0.1,
        add_noise_std: float = 0.003,
        voiced_threshold: float = 0,
    ):
        super().__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # Produces the per-harmonic sine waveforms.
        self.l_sin_gen = SineGenerator(
            sampling_rate,
            harmonic_num,
            sine_amp,
            add_noise_std,
            voiced_threshold,
        )

        # Collapse the harmonics (fundamental + overtones) into one channel.
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x: torch.Tensor):
        """
        Return ``(sine_merge, None, None)`` for the F0 input ``x``.

        Only the merged excitation is returned; the second and third tuple
        slots are always None.

        """
        harmonics, _, _ = self.l_sin_gen(x)
        # Match the linear layer's dtype before merging.
        harmonics = harmonics.to(dtype=self.l_linear.weight.dtype)
        merged = self.l_linear(harmonics)
        sine_merge = self.l_tanh(merged)

        return sine_merge, None, None
239
+
240
+
241
class HiFiGANMRFGenerator(torch.nn.Module):
    """
    HiFi-GAN generator with Multi-Receptive Field (MRF) blocks.

    This generator takes an input feature sequence and fundamental frequency (F0)
    as input and generates an audio waveform. It utilizes transposed convolutions
    for upsampling and MRF blocks for feature refinement. It can also condition
    on global conditioning features.

    Args:
        in_channel (int): Number of input channels.
        upsample_initial_channel (int): Number of channels after the initial convolution.
        upsample_rates (list[int]): List of upsampling rates for the transposed convolutions.
        upsample_kernel_sizes (list[int]): List of kernel sizes for the transposed convolutions.
        resblock_kernel_sizes (list[int]): List of kernel sizes for the convolutional layers in the MRF blocks.
        resblock_dilations (list[list[int]]): List of lists of dilation rates for the MRF blocks.
        gin_channels (int): Number of global conditioning input channels (0 if no global conditioning).
        sample_rate (int): Sampling rate of the audio.
        harmonic_num (int): Number of harmonics to generate.
        checkpointing (bool): Whether to use checkpointing to save memory during training (default: False).

    """

    def __init__(
        self,
        in_channel: int,
        upsample_initial_channel: int,
        upsample_rates: list[int],
        upsample_kernel_sizes: list[int],
        resblock_kernel_sizes: list[int],
        resblock_dilations: list[list[int]],
        gin_channels: int,
        sample_rate: int,
        harmonic_num: int,
        checkpointing: bool = False,
    ):
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.checkpointing = checkpointing

        # F0 arrives at frame rate and is stretched to sample rate before
        # being handed to the harmonic source.
        self.f0_upsample = torch.nn.Upsample(scale_factor=math.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(sample_rate, harmonic_num)

        self.conv_pre = weight_norm(
            torch.nn.Conv1d(
                in_channel,
                upsample_initial_channel,
                kernel_size=7,
                stride=1,
                padding=3,
            ),
        )
        self.upsamples = torch.nn.ModuleList()
        self.noise_convs = torch.nn.ModuleList()

        # Stride that brings the sample-rate excitation down to the time
        # resolution reached after upsampling stage i.
        stride_f0s = [
            math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1
            for i in range(len(upsample_rates))
        ]

        for i, (u, k) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes, strict=False),
        ):
            # handling odd upsampling rates
            if u % 2 == 0:
                # old method
                padding = (k - u) // 2
            else:
                padding = u // 2 + u % 2

            self.upsamples.append(
                weight_norm(
                    torch.nn.ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        upsample_initial_channel // (2 ** (i + 1)),
                        kernel_size=k,
                        stride=u,
                        padding=padding,
                        output_padding=u % 2,
                    ),
                ),
            )
            # handling odd upsampling rates
            #  s   k   p
            # 40  80  20
            # 32  64  16
            #  4   8   2
            #  2   3   1
            # 63 125  31
            #  9  17   4
            #  3   5   1
            #  1   1   0
            stride = stride_f0s[i]
            kernel = 1 if stride == 1 else stride * 2 - stride % 2
            padding = 0 if stride == 1 else (kernel - stride) // 2

            self.noise_convs.append(
                torch.nn.Conv1d(
                    1,
                    upsample_initial_channel // (2 ** (i + 1)),
                    kernel_size=kernel,
                    stride=stride,
                    padding=padding,
                ),
            )

        # Default keeps conv_post well-defined when there are no upsampling
        # stages (previously `channel` was unbound in that case).
        channel = upsample_initial_channel
        self.mrfs = torch.nn.ModuleList()
        for i in range(len(self.upsamples)):
            channel = upsample_initial_channel // (2 ** (i + 1))
            self.mrfs.append(
                torch.nn.ModuleList(
                    [
                        MRFBlock(channel, kernel_size=k, dilations=d)
                        for k, d in zip(
                            resblock_kernel_sizes,
                            resblock_dilations,
                            strict=False,
                        )
                    ],
                ),
            )
        self.conv_post = weight_norm(
            torch.nn.Conv1d(channel, 1, kernel_size=7, stride=1, padding=3),
        )
        if gin_channels != 0:
            self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(
        self,
        x: torch.Tensor,
        f0: torch.Tensor,
        g: torch.Tensor | None = None,
    ):
        """
        Synthesize a waveform.

        Args:
            x (torch.Tensor): Input features fed to ``conv_pre``.
            f0 (torch.Tensor): Frame-level F0; a channel axis is inserted at
                dim 1 before upsampling.
            g (torch.Tensor, optional): Global conditioning, added after
                ``conv_pre`` when given (requires ``gin_channels != 0``).

        Returns:
            torch.Tensor: Output of ``conv_post`` squashed by tanh.
        """
        f0 = self.f0_upsample(f0[:, None, :]).transpose(-1, -2)
        har_source, _, _ = self.m_source(f0)
        har_source = har_source.transpose(-1, -2)
        x = self.conv_pre(x)

        if g is not None:
            x = x + self.cond(g)

        for ups, mrf, noise_conv in zip(
            self.upsamples,
            self.mrfs,
            self.noise_convs,
            strict=False,
        ):
            x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)

            if self.training and self.checkpointing:
                x = checkpoint(ups, x, use_reentrant=False)
                x = x + noise_conv(har_source)
                xs = sum([checkpoint(layer, x, use_reentrant=False) for layer in mrf])
            else:
                x = ups(x)
                x = x + noise_conv(har_source)
                xs = sum([layer(x) for layer in mrf])
            # Average the parallel MRF branches.
            x = xs / self.num_kernels

        x = torch.nn.functional.leaky_relu(x)
        x = torch.tanh(self.conv_post(x))

        return x

    @staticmethod
    def _strip_weight_norm(module: torch.nn.Module) -> None:
        """Remove weight norm from *module*, whichever torch API applied it."""
        from torch.nn.utils import parametrize

        if parametrize.is_parametrized(module, "weight"):
            # Applied through torch.nn.utils.parametrizations.weight_norm.
            parametrize.remove_parametrizations(module, "weight")
        else:
            # Applied through the legacy hook-based torch.nn.utils.weight_norm.
            remove_weight_norm(module)

    def remove_weight_norm(self):
        """Strip weight normalization from every weight-normalised layer."""
        self._strip_weight_norm(self.conv_pre)
        for up in self.upsamples:
            self._strip_weight_norm(up)
        # self.mrfs is a ModuleList of ModuleLists; the inner containers have
        # no remove_weight_norm(), so descend to the individual MRF blocks.
        for stage in self.mrfs:
            for block in stage:
                block.remove_weight_norm()
        self._strip_weight_norm(self.conv_post)
rvc_logic/rvc/lib/algorithm/generators/hifigan_nsf.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import math
4
+
5
+ import torch
6
+ from torch.nn.utils import remove_weight_norm
7
+ from torch.nn.utils.parametrizations import weight_norm
8
+ from torch.utils.checkpoint import checkpoint
9
+
10
+ from rvc_logic.rvc.lib.algorithm.commons import init_weights
11
+ from rvc_logic.rvc.lib.algorithm.generators.hifigan import SineGenerator
12
+ from rvc_logic.rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock
13
+
14
+
15
class SourceModuleHnNSF(torch.nn.Module):
    """
    Excitation source that merges the sine generator output into one channel.

    A SineGenerator produces the harmonic/noise signal from F0; a linear layer
    followed by tanh reduces its ``harmonic_num + 1`` channels to one.

    Args:
        sample_rate (int): Sampling rate of the audio in Hz.
        harmonic_num (int, optional): Number of harmonic overtones above F0. Defaults to 0.
        sine_amp (float, optional): Amplitude of the sine wave components. Defaults to 0.1.
        add_noise_std (float, optional): Standard deviation of the additive Gaussian noise. Defaults to 0.003.
        voiced_threshod (float, optional): F0 threshold passed to the sine generator for the voiced/unvoiced decision. Defaults to 0.

    """

    def __init__(
        self,
        sample_rate: int,
        harmonic_num: int = 0,
        sine_amp: float = 0.1,
        add_noise_std: float = 0.003,
        voiced_threshod: float = 0,
    ):
        super().__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        generator_args = (
            sample_rate,
            harmonic_num,
            sine_amp,
            add_noise_std,
            voiced_threshod,
        )
        self.l_sin_gen = SineGenerator(*generator_args)
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x: torch.Tensor, upsample_factor: int = 1):
        """Return ``(excitation, None, None)``; only the first item carries data."""
        harmonics, _, _ = self.l_sin_gen(x, upsample_factor)
        # Match the linear layer's dtype before projecting.
        harmonics = harmonics.to(dtype=self.l_linear.weight.dtype)
        excitation = self.l_tanh(self.l_linear(harmonics))
        return excitation, None, None
59
+
60
+
61
class HiFiGANNSFGenerator(torch.nn.Module):
    """
    Generator module based on the Neural Source Filter (NSF) architecture.

    This generator synthesizes audio by first generating a source excitation signal
    (harmonic and noise) and then filtering it through a series of upsampling and
    residual blocks. Global conditioning can be applied to influence the generation.

    Args:
        initial_channel (int): Number of input channels to the initial convolutional layer.
        resblock_kernel_sizes (list): List of kernel sizes for the residual blocks.
        resblock_dilation_sizes (list): List of lists of dilation rates for the residual blocks, corresponding to each kernel size.
        upsample_rates (list): List of upsampling factors for each upsampling layer.
        upsample_initial_channel (int): Number of output channels from the initial convolutional layer, which is also the input to the first upsampling layer.
        upsample_kernel_sizes (list): List of kernel sizes for the transposed convolutional layers used for upsampling.
        gin_channels (int): Number of input channels for the global conditioning. If 0, no global conditioning is used.
        sr (int): Sampling rate of the audio.
        checkpointing (bool, optional): Whether to use gradient checkpointing to save memory during training. Defaults to False.

    """

    def __init__(
        self,
        initial_channel: int,
        resblock_kernel_sizes: list,
        resblock_dilation_sizes: list,
        upsample_rates: list,
        upsample_initial_channel: int,
        upsample_kernel_sizes: list,
        gin_channels: int,
        sr: int,
        checkpointing: bool = False,
    ):
        super().__init__()

        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.checkpointing = checkpointing
        self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(sample_rate=sr, harmonic_num=0)

        self.conv_pre = torch.nn.Conv1d(
            initial_channel,
            upsample_initial_channel,
            7,
            1,
            padding=3,
        )

        self.ups = torch.nn.ModuleList()
        self.noise_convs = torch.nn.ModuleList()

        # Channel count after each upsampling stage (halved every stage).
        channels = [
            upsample_initial_channel // (2 ** (i + 1))
            for i in range(len(upsample_rates))
        ]
        # Stride that brings the sample-rate excitation down to the time
        # resolution reached after upsampling stage i.
        stride_f0s = [
            math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1
            for i in range(len(upsample_rates))
        ]

        for i, (u, k) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes, strict=False),
        ):
            # handling odd upsampling rates
            if u % 2 == 0:
                # old method
                padding = (k - u) // 2
            else:
                padding = u // 2 + u % 2

            self.ups.append(
                weight_norm(
                    torch.nn.ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        channels[i],
                        k,
                        u,
                        padding=padding,
                        output_padding=u % 2,
                    ),
                ),
            )
            # handling odd upsampling rates
            #  s   k   p
            # 40  80  20
            # 32  64  16
            #  4   8   2
            #  2   3   1
            # 63 125  31
            #  9  17   4
            #  3   5   1
            #  1   1   0
            stride = stride_f0s[i]
            kernel = 1 if stride == 1 else stride * 2 - stride % 2
            padding = 0 if stride == 1 else (kernel - stride) // 2

            self.noise_convs.append(
                torch.nn.Conv1d(
                    1,
                    channels[i],
                    kernel_size=kernel,
                    stride=stride,
                    padding=padding,
                ),
            )

        # Flat list: num_kernels consecutive residual blocks per upsampling stage.
        self.resblocks = torch.nn.ModuleList(
            [
                ResBlock(channels[i], k, d)
                for i in range(len(self.ups))
                for k, d in zip(
                    resblock_kernel_sizes,
                    resblock_dilation_sizes,
                    strict=False,
                )
            ],
        )

        self.conv_post = torch.nn.Conv1d(channels[-1], 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)

        self.upp = math.prod(upsample_rates)
        self.lrelu_slope = LRELU_SLOPE

    def forward(
        self,
        x: torch.Tensor,
        f0: torch.Tensor,
        g: torch.Tensor | None = None,
    ):
        """
        Synthesize a waveform.

        Args:
            x (torch.Tensor): Input features fed to ``conv_pre``.
            f0 (torch.Tensor): F0 track handed to the source module together
                with the total upsampling factor ``self.upp``.
            g (torch.Tensor, optional): Global conditioning, added after
                ``conv_pre`` when given (requires ``gin_channels != 0``).

        Returns:
            torch.Tensor: Output of ``conv_post`` squashed by tanh.
        """
        har_source, _, _ = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        # new tensor
        x = self.conv_pre(x)

        if g is not None:
            x = x + self.cond(g)

        for i, (ups, noise_convs) in enumerate(
            zip(self.ups, self.noise_convs, strict=False),
        ):
            x = torch.nn.functional.leaky_relu(x, self.lrelu_slope)
            # Residual blocks belonging to stage i occupy this index window.
            # NOTE(review): the enumerate/filter form is kept as-is; it looks
            # chosen for TorchScript friendliness - confirm before slicing.
            stage = range(i * self.num_kernels, (i + 1) * self.num_kernels)
            # Apply upsampling layer
            if self.training and self.checkpointing:
                x = checkpoint(ups, x, use_reentrant=False)
                x = x + noise_convs(har_source)
                xs = sum(
                    [
                        checkpoint(resblock, x, use_reentrant=False)
                        for j, resblock in enumerate(self.resblocks)
                        if j in stage
                    ],
                )
            else:
                x = ups(x)
                x = x + noise_convs(har_source)
                xs = sum(
                    [
                        resblock(x)
                        for j, resblock in enumerate(self.resblocks)
                        if j in stage
                    ],
                )
            # Average the parallel residual branches.
            x = xs / self.num_kernels

        x = torch.nn.functional.leaky_relu(x)
        x = torch.tanh(self.conv_post(x))

        return x

    @staticmethod
    def _strip_weight_norm(module: torch.nn.Module, *, missing_ok: bool = False) -> None:
        """
        Remove weight norm from *module*, whichever torch API applied it.

        With ``missing_ok=True`` a module that carries no weight norm is
        silently skipped instead of raising ``ValueError``.
        """
        from torch.nn.utils import parametrize

        if parametrize.is_parametrized(module, "weight"):
            # Applied through torch.nn.utils.parametrizations.weight_norm.
            parametrize.remove_parametrizations(module, "weight")
            return
        try:
            # Applied through the legacy hook-based torch.nn.utils.weight_norm.
            remove_weight_norm(module)
        except ValueError:
            if not missing_ok:
                raise

    def remove_weight_norm(self):
        """Strip weight normalization from the upsampling layers and residual blocks."""
        for l in self.ups:
            self._strip_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()

    def __prepare_scriptable__(self):
        """
        Strip weight normalization in place and return ``self``.

        NOTE(review): the name matches the hook torch.jit.script looks up
        before compiling a module - confirm this is the intended use.
        """
        # Snapshot the module lists first: removing a parametrization mutates
        # the module tree, which must not happen while iterating over it.
        candidates = list(self.ups.modules()) + list(self.resblocks.modules())
        for module in candidates:
            self._strip_weight_norm(module, missing_ok=True)
        return self
rvc_logic/rvc/lib/algorithm/generators/refinegan.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ import torch
4
+ import torchaudio
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+ from torch.nn.utils import remove_weight_norm
8
+ from torch.nn.utils.parametrizations import weight_norm
9
+ from torch.utils.checkpoint import checkpoint
10
+
11
+ from rvc_logic.rvc.lib.algorithm.commons import get_padding, init_weights
12
+
13
+
14
class ResBlock(nn.Module):
    """
    Residual block with multiple dilated convolutions.

    For every dilation rate the block applies LeakyReLU -> dilated conv ->
    LeakyReLU -> undilated conv and adds the result back onto its input.

    Args:
        channels (int): Number of input and output channels of every convolution.
        kernel_size (int, optional): Kernel size for the convolutional layers. Defaults to 7.
        dilation (tuple[int], optional): Tuple of dilation rates for the convolutional layers. Defaults to (1, 3, 5).
        leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation. Defaults to 0.2.

    """

    def __init__(
        self,
        channels: int,
        kernel_size: int = 7,
        dilation: tuple[int, ...] = (1, 3, 5),
        leaky_relu_slope: float = 0.2,
    ):
        super().__init__()

        self.leaky_relu_slope = leaky_relu_slope

        # First conv of each pair: dilated.
        self.convs1 = nn.ModuleList(
            [
                weight_norm(
                    nn.Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        stride=1,
                        dilation=d,
                        padding=get_padding(kernel_size, d),
                    ),
                )
                for d in dilation
            ],
        )
        self.convs1.apply(init_weights)

        # Second conv of each pair: always dilation 1, one per dilation rate.
        self.convs2 = nn.ModuleList(
            [
                weight_norm(
                    nn.Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        stride=1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1),
                    ),
                )
                for _ in dilation
            ],
        )
        self.convs2.apply(init_weights)

    def forward(self, x: torch.Tensor):
        """Apply the residual conv pairs in sequence and return the result."""
        for c1, c2 in zip(self.convs1, self.convs2, strict=False):
            xt = F.leaky_relu(x, self.leaky_relu_slope)
            xt = c1(xt)
            xt = F.leaky_relu(xt, self.leaky_relu_slope)
            xt = c2(xt)
            x = xt + x

        return x

    @staticmethod
    def _strip_weight_norm(module: nn.Module) -> None:
        """Remove weight norm from *module*, whichever torch API applied it."""
        from torch.nn.utils import parametrize

        if parametrize.is_parametrized(module, "weight"):
            # Applied through torch.nn.utils.parametrizations.weight_norm,
            # which the hook-based remove_weight_norm cannot undo.
            parametrize.remove_parametrizations(module, "weight")
        else:
            remove_weight_norm(module)

    def remove_weight_norm(self):
        """Strip weight normalization from every convolution in the block."""
        for c1, c2 in zip(self.convs1, self.convs2, strict=False):
            self._strip_weight_norm(c1)
            self._strip_weight_norm(c2)
89
+
90
+
91
class AdaIN(nn.Module):
    """
    Learnable noise injection followed by Leaky ReLU.

    Standard-normal noise is scaled per channel by a learnable weight
    (initialised to 1e-4), added to the input, and the sum is passed through
    a Leaky ReLU.

    Args:
        channels (int): Number of input channels.
        leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation applied after the noise is added. Defaults to 0.2.

    """

    def __init__(
        self,
        *,
        channels: int,
        leaky_relu_slope: float = 0.2,
    ):
        super().__init__()

        initial_scale = torch.ones(channels) * 1e-4
        self.weight = nn.Parameter(initial_scale)
        self.activation = nn.LeakyReLU(leaky_relu_slope)

    def forward(self, x: torch.Tensor):
        """Return ``activation(x + noise * weight)``; weight is indexed along dim 1."""
        per_channel_scale = self.weight.unsqueeze(0).unsqueeze(-1)
        noise = torch.randn_like(x) * per_channel_scale
        noisy = x + noise

        return self.activation(noisy)
119
+
120
+
121
class ParallelResBlock(nn.Module):
    """
    Parallel residual block that applies multiple residual blocks with different kernel sizes in parallel.

    The input first goes through a shared convolution; each branch then runs
    AdaIN -> ResBlock -> AdaIN and the branch outputs are averaged.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_sizes (tuple[int], optional): Tuple of kernel sizes for the parallel residual blocks. Defaults to (3, 7, 11).
        dilation (tuple[int], optional): Tuple of dilation rates for the convolutional layers within the residual blocks. Defaults to (1, 3, 5).
        leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation. Defaults to 0.2.

    """

    def __init__(
        self,
        *,
        in_channels: int,
        out_channels: int,
        kernel_sizes: tuple[int, ...] = (3, 7, 11),
        dilation: tuple[int, ...] = (1, 3, 5),
        leaky_relu_slope: float = 0.2,
    ):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels

        # Shared projection; note it is a plain Conv1d without weight norm.
        self.input_conv = nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=7,
            stride=1,
            padding=3,
        )

        self.input_conv.apply(init_weights)

        self.blocks = nn.ModuleList(
            [
                nn.Sequential(
                    AdaIN(channels=out_channels),
                    ResBlock(
                        out_channels,
                        kernel_size=kernel_size,
                        dilation=dilation,
                        leaky_relu_slope=leaky_relu_slope,
                    ),
                    AdaIN(channels=out_channels),
                )
                for kernel_size in kernel_sizes
            ],
        )

    def forward(self, x: torch.Tensor):
        """Project the input, run every branch on it and return the branch mean."""
        x = self.input_conv(x)
        return torch.stack([block(x) for block in self.blocks], dim=0).mean(dim=0)

    def remove_weight_norm(self):
        """Strip weight normalization from the residual blocks of every branch."""
        from torch.nn.utils import parametrize

        # input_conv is created without weight norm, so unconditionally
        # calling remove_weight_norm on it raised ValueError. Only strip it
        # if something actually wrapped it.
        if parametrize.is_parametrized(self.input_conv, "weight"):
            parametrize.remove_parametrizations(self.input_conv, "weight")
        else:
            try:
                remove_weight_norm(self.input_conv)
            except ValueError:
                pass  # nothing to remove on a plain convolution
        for block in self.blocks:
            # Index 1 of each branch is the ResBlock (see __init__).
            block[1].remove_weight_norm()
182
+
183
+
184
class SineGenerator(nn.Module):
    """
    Sine-wave excitation generator.

    Builds a sine for the fundamental and each overtone, masks unvoiced
    frames, adds Gaussian noise and merges all tones into one channel with a
    bias-free linear layer followed by tanh.

    Args:
        samp_rate (int): Sampling rate in Hz.
        harmonic_num (int): Number of harmonic overtones (default 0).
        sine_amp (float): Amplitude of sine-waveform (default 0.1).
        noise_std (float): Standard deviation of Gaussian noise (default 0.003).
        voiced_threshold (float): F0 threshold for voiced/unvoiced classification (default 0).

    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
    ):
        super().__init__()
        self.sampling_rate = samp_rate
        self.harmonic_num = harmonic_num
        self.dim = harmonic_num + 1
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.voiced_threshold = voiced_threshold

        mix_down = nn.Linear(self.dim, 1, bias=False)
        self.merge = nn.Sequential(mix_down, nn.Tanh())

    def _f02uv(self, f0):
        """Return a mask that is 1 where ``f0`` exceeds the voiced threshold, else 0."""
        is_voiced = f0 > self.voiced_threshold
        return torch.ones_like(f0) * is_voiced

    def _f02sine(self, f0_values):
        """
        f0_values: (batchsize, length, dim)
        where dim indicates fundamental tone and overtones
        """
        # Per-sample phase increment in cycles; whole cycles do not change
        # the phase, so only the fractional part is kept.
        phase_step = (f0_values / self.sampling_rate) % 1

        # Random start phase per overtone; the fundamental (index 0) gets none.
        start_phase = torch.rand(
            f0_values.shape[0],
            f0_values.shape[2],
            device=f0_values.device,
        )
        start_phase[:, 0] = 0
        phase_step[:, 0, :] = phase_step[:, 0, :] + start_phase

        # Wherever the wrapped running phase drops, a full cycle was crossed;
        # subtract it so the second cumulative sum stays small.
        wrapped = torch.cumsum(phase_step, 1) % 1
        crossed_cycle = (wrapped[:, 1:, :] - wrapped[:, :-1, :]) < 0
        correction = torch.zeros_like(phase_step)
        correction[:, 1:, :] = crossed_cycle * -1.0

        phase = torch.cumsum(phase_step + correction, dim=1)
        return torch.sin(phase * 2 * np.pi)

    def forward(self, f0):
        """Return the merged single-channel excitation for ``f0`` (indexed as ``f0[:, :, 0]``)."""
        with torch.no_grad():
            tones = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
            # Slot 0 carries the fundamental; slot k carries (k + 1) * F0.
            tones[:, :, 0] = f0[:, :, 0]
            for overtone in range(1, self.harmonic_num + 1):
                tones[:, :, overtone] = tones[:, :, 0] * (overtone + 1)

            clean_sines = self._f02sine(tones) * self.sine_amp

            uv = self._f02uv(f0)

            # Voiced frames get noise_std; unvoiced frames get sine_amp / 3.
            noise_scale = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_scale * torch.randn_like(clean_sines)

            mixed = clean_sines * uv + noise

        # The merge runs outside no_grad so its weights receive gradients.
        return self.merge(mixed)
274
+
275
+
276
class RefineGANGenerator(nn.Module):
    """
    RefineGAN generator for audio synthesis.

    This generator uses a combination of downsampling, residual blocks, and parallel residual blocks
    to refine an input mel-spectrogram and fundamental frequency (F0) into an audio waveform.
    It can also incorporate global conditioning.

    Args:
        sample_rate (int, optional): Sampling rate of the audio. Defaults to 44100.
        downsample_rates (tuple[int], optional): Unused; kept for interface compatibility. Defaults to (2, 2, 8, 8).
        upsample_rates (tuple[int], optional): Upsampling rates for the upsampling blocks. Defaults to (8, 8, 2, 2).
        leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation. Defaults to 0.2.
        num_mels (int, optional): Number of mel-frequency bins in the input mel-spectrogram. Defaults to 128.
        start_channels (int, optional): Channel count of the excitation branch before the first
            downsampling stage; it doubles at every stage, so ``start_channels * 2 ** len(upsample_rates)``
            has to equal ``upsample_initial_channel // 2``. Defaults to 16.
        gin_channels (int, optional): Number of channels for the global conditioning input (0 disables it). Defaults to 256.
        checkpointing (bool, optional): Whether to use checkpointing for memory efficiency. Defaults to False.
        upsample_initial_channel (int, optional): Channel count entering the first upsampling stage. Defaults to 512.

    """

    def __init__(
        self,
        *,
        sample_rate: int = 44100,
        downsample_rates: tuple[int, ...] = (2, 2, 8, 8),  # unused
        upsample_rates: tuple[int, ...] = (8, 8, 2, 2),
        leaky_relu_slope: float = 0.2,
        num_mels: int = 128,
        start_channels: int = 16,
        gin_channels: int = 256,
        checkpointing: bool = False,
        upsample_initial_channel=512,
    ):
        super().__init__()
        self.upsample_rates = upsample_rates
        self.leaky_relu_slope = leaky_relu_slope
        self.checkpointing = checkpointing

        # Plain int so it can be used directly as an interpolation size.
        self.upp = int(np.prod(upsample_rates))
        self.m_source = SineGenerator(sample_rate)

        # expanded f0 sinegen -> match mel_conv
        # (8, 1, 17280) -> (8, 16, 17280)
        # Output width follows start_channels (was hard-coded to 16, which
        # broke any non-default start_channels at the first downsample conv).
        self.pre_conv = weight_norm(
            nn.Conv1d(1, start_channels, 7, 1, padding=3),
        )

        # (8, 16, 17280) = 4th upscale
        # (8, 32, 8640) = 3rd upscale
        # (8, 64, 4320) = 2nd upscale
        # (8, 128, 432) = 1st upscale
        # (8, 256, 36) merged to mel

        # f0 downsampling and upchanneling
        channels = start_channels
        size = self.upp
        self.downsample_blocks = nn.ModuleList([])
        # [old_size, new_size] pairs: resampling ratio of every stage.
        self.df0 = []
        # Undo the upsampling rates last-to-first so the stored activations
        # line up with the upsampling stages when replayed in reverse.
        for rate in reversed(upsample_rates):
            new_size = int(size / rate)
            # T dimension factors for torchaudio.functional.resample
            self.df0.append([size, new_size])
            size = new_size

            new_channels = channels * 2
            self.downsample_blocks.append(
                weight_norm(
                    nn.Conv1d(
                        channels,
                        new_channels,
                        7,
                        1,
                        padding=3,
                    ),
                ),
            )
            channels = new_channels

        # mel handling
        channels = upsample_initial_channel

        self.mel_conv = weight_norm(
            nn.Conv1d(num_mels, channels // 2, 7, 1, padding=3),
        )

        self.mel_conv.apply(init_weights)

        if gin_channels != 0:
            # Input width follows gin_channels (was hard-coded to 256).
            self.cond = nn.Conv1d(gin_channels, channels // 2, 1)

        self.upsample_blocks = nn.ModuleList([])
        self.upsample_conv_blocks = nn.ModuleList([])

        for rate in upsample_rates:
            new_channels = channels // 2

            self.upsample_blocks.append(nn.Upsample(scale_factor=rate, mode="linear"))

            # Input = upsampled features (channels) + skip tensor (channels // 4).
            self.upsample_conv_blocks.append(
                ParallelResBlock(
                    in_channels=channels + channels // 4,
                    out_channels=new_channels,
                    kernel_sizes=(3, 7, 11),
                    dilation=(1, 3, 5),
                    leaky_relu_slope=leaky_relu_slope,
                ),
            )

            channels = new_channels

        self.conv_post = weight_norm(
            nn.Conv1d(channels, 1, 7, 1, padding=3, bias=False),
        )
        self.conv_post.apply(init_weights)

    def forward(
        self,
        mel: torch.Tensor,
        f0: torch.Tensor,
        g: torch.Tensor | None = None,
    ):
        """
        Synthesize a waveform.

        Args:
            mel (torch.Tensor): Mel-spectrogram; its last dimension is the frame count.
            f0 (torch.Tensor): Frame-level F0; a channel axis is inserted at dim 1.
            g (torch.Tensor, optional): Global conditioning, added to the mel
                features when given (requires ``gin_channels != 0``).

        Returns:
            torch.Tensor: Output of ``conv_post`` squashed by tanh.
        """
        f0_size = mel.shape[-1]
        # change f0 helper to full size
        f0 = F.interpolate(
            f0.unsqueeze(1),
            size=f0_size * self.upp,
            mode="linear",
        )
        # get f0 turned into sines harmonics
        har_source = self.m_source(f0.transpose(1, 2)).transpose(1, 2)
        # prepare for fusion to mel
        x = self.pre_conv(har_source)
        # downsampled/upchanneled versions for each upscale
        downs = []
        for block, (old_size, new_size) in zip(
            self.downsample_blocks,
            self.df0,
            strict=False,
        ):
            x = F.leaky_relu(x, self.leaky_relu_slope)
            downs.append(x)
            # attempt to cancel spectral aliasing
            x = torchaudio.functional.resample(
                x.contiguous(),
                orig_freq=int(f0_size * old_size),
                new_freq=int(f0_size * new_size),
                lowpass_filter_width=64,
                rolloff=0.9475937167399596,
                resampling_method="sinc_interp_kaiser",
                beta=14.769656459379492,
            )
            x = block(x)

        # expanding spectrogram from 192 to 256 channels
        mel = self.mel_conv(mel)

        if g is not None:
            # adding expanded speaker embedding
            mel = mel + self.cond(g)
        x = torch.cat([mel, x], dim=1)

        for ups, res, down in zip(
            self.upsample_blocks,
            self.upsample_conv_blocks,
            reversed(downs),
            strict=False,
        ):
            x = F.leaky_relu(x, self.leaky_relu_slope)

            if self.training and self.checkpointing:
                x = checkpoint(ups, x, use_reentrant=False)
                x = torch.cat([x, down], dim=1)
                x = checkpoint(res, x, use_reentrant=False)
            else:
                x = ups(x)
                x = torch.cat([x, down], dim=1)
                x = res(x)

        x = F.leaky_relu(x, self.leaky_relu_slope)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    @staticmethod
    def _strip_weight_norm(module: nn.Module) -> None:
        """Remove weight norm from *module*, whichever torch API applied it."""
        from torch.nn.utils import parametrize

        if parametrize.is_parametrized(module, "weight"):
            # Applied through torch.nn.utils.parametrizations.weight_norm,
            # which the hook-based remove_weight_norm cannot undo.
            parametrize.remove_parametrizations(module, "weight")
        else:
            remove_weight_norm(module)

    def remove_weight_norm(self):
        """Strip weight normalization from every weight-normalised layer."""
        self._strip_weight_norm(self.pre_conv)
        self._strip_weight_norm(self.mel_conv)
        self._strip_weight_norm(self.conv_post)

        # These are bare weight-normalised Conv1d layers, which have no
        # remove_weight_norm() method of their own.
        for block in self.downsample_blocks:
            self._strip_weight_norm(block)

        for block in self.upsample_conv_blocks:
            block.remove_weight_norm()
rvc_logic/rvc/lib/algorithm/modules.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from rvc_logic.rvc.lib.algorithm.commons import fused_add_tanh_sigmoid_multiply
4
+
5
+
6
+ class WaveNet(torch.nn.Module):
7
+ """
8
+ WaveNet residual blocks as used in WaveGlow.
9
+
10
+ Args:
11
+ hidden_channels (int): Number of hidden channels.
12
+ kernel_size (int): Size of the convolutional kernel.
13
+ dilation_rate (int): Dilation rate of the convolution.
14
+ n_layers (int): Number of convolutional layers.
15
+ gin_channels (int, optional): Number of conditioning channels. Defaults to 0.
16
+ p_dropout (float, optional): Dropout probability. Defaults to 0.
17
+
18
+ """
19
+
20
+ def __init__(
21
+ self,
22
+ hidden_channels: int,
23
+ kernel_size: int,
24
+ dilation_rate,
25
+ n_layers: int,
26
+ gin_channels: int = 0,
27
+ p_dropout: int = 0,
28
+ ):
29
+ super().__init__()
30
+ assert kernel_size % 2 == 1, "Kernel size must be odd for proper padding."
31
+
32
+ self.hidden_channels = hidden_channels
33
+ self.kernel_size = (kernel_size,)
34
+ self.dilation_rate = dilation_rate
35
+ self.n_layers = n_layers
36
+ self.gin_channels = gin_channels
37
+ self.p_dropout = p_dropout
38
+ self.n_channels_tensor = torch.IntTensor([hidden_channels]) # Static tensor
39
+
40
+ self.in_layers = torch.nn.ModuleList()
41
+ self.res_skip_layers = torch.nn.ModuleList()
42
+ self.drop = torch.nn.Dropout(p_dropout)
43
+
44
+ # Conditional layer for global conditioning
45
+ if gin_channels:
46
+ self.cond_layer = torch.nn.utils.parametrizations.weight_norm(
47
+ torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1),
48
+ name="weight",
49
+ )
50
+
51
+ # Precompute dilations and paddings
52
+ dilations = [dilation_rate**i for i in range(n_layers)]
53
+ paddings = [(kernel_size * d - d) // 2 for d in dilations]
54
+
55
+ # Initialize layers
56
+ for i in range(n_layers):
57
+ self.in_layers.append(
58
+ torch.nn.utils.parametrizations.weight_norm(
59
+ torch.nn.Conv1d(
60
+ hidden_channels,
61
+ 2 * hidden_channels,
62
+ kernel_size,
63
+ dilation=dilations[i],
64
+ padding=paddings[i],
65
+ ),
66
+ name="weight",
67
+ ),
68
+ )
69
+
70
+ res_skip_channels = (
71
+ hidden_channels if i == n_layers - 1 else 2 * hidden_channels
72
+ )
73
+ self.res_skip_layers.append(
74
+ torch.nn.utils.parametrizations.weight_norm(
75
+ torch.nn.Conv1d(hidden_channels, res_skip_channels, 1),
76
+ name="weight",
77
+ ),
78
+ )
79
+
80
+ def forward(self, x, x_mask, g=None):
81
+ output = x.clone().zero_()
82
+
83
+ # Apply conditional layer if global conditioning is provided
84
+ g = self.cond_layer(g) if g is not None else None
85
+
86
+ for i in range(self.n_layers):
87
+ x_in = self.in_layers[i](x)
88
+ g_l = (
89
+ g[
90
+ :,
91
+ i * 2 * self.hidden_channels : (i + 1) * 2 * self.hidden_channels,
92
+ :,
93
+ ]
94
+ if g is not None
95
+ else 0
96
+ )
97
+
98
+ # Activation with fused Tanh-Sigmoid
99
+ acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, self.n_channels_tensor)
100
+ acts = self.drop(acts)
101
+
102
+ # Residual and skip connections
103
+ res_skip_acts = self.res_skip_layers[i](acts)
104
+ if i < self.n_layers - 1:
105
+ res_acts = res_skip_acts[:, : self.hidden_channels, :]
106
+ x = (x + res_acts) * x_mask
107
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
108
+ else:
109
+ output = output + res_skip_acts
110
+
111
+ return output * x_mask
112
+
113
+ def remove_weight_norm(self):
114
+ if self.gin_channels:
115
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
116
+ for layer in self.in_layers:
117
+ torch.nn.utils.remove_weight_norm(layer)
118
+ for layer in self.res_skip_layers:
119
+ torch.nn.utils.remove_weight_norm(layer)
120
+ )
rvc_logic/rvc/lib/algorithm/normalization.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+
4
class LayerNorm(torch.nn.Module):
    """
    Layer normalization applied over dim 1 of the input.

    The input is transposed so that dim 1 becomes the last axis, normalized
    with ``torch.nn.functional.layer_norm`` over that axis, and transposed
    back.

    Args:
        channels (int): Size of the normalized axis; also the length of the
            learnable scale and shift vectors.
        eps (float, optional): Epsilon passed to ``layer_norm``.
            Defaults to 1e-5.

    """

    def __init__(self, channels: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        # Learnable scale (initialized to ones) and shift (initialized to zeros).
        self.gamma = torch.nn.Parameter(torch.ones(channels))
        self.beta = torch.nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        # Move dim 1 to the end so layer_norm normalizes over it.
        swapped = x.transpose(1, -1)
        normalized = torch.nn.functional.layer_norm(
            swapped,
            (swapped.size(-1),),
            weight=self.gamma,
            bias=self.beta,
            eps=self.eps,
        )
        # Restore the original axis order.
        return normalized.transpose(1, -1)
rvc_logic/rvc/lib/algorithm/residuals.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple
2
+
3
+ from itertools import chain
4
+
5
+ import torch
6
+ from torch.nn.utils import remove_weight_norm
7
+ from torch.nn.utils.parametrizations import weight_norm
8
+
9
+ from rvc_logic.rvc.lib.algorithm.commons import get_padding, init_weights
10
+ from rvc_logic.rvc.lib.algorithm.modules import WaveNet
11
+
12
+ LRELU_SLOPE = 0.1
13
+
14
+
15
def create_conv1d_layer(channels, kernel_size, dilation):
    """
    Build a weight-normalized 1D convolution with equal input and output width.

    Args:
        channels: Number of input and output channels.
        kernel_size: Convolution kernel size.
        dilation: Dilation rate; also passed to ``get_padding`` to compute
            the padding.

    Returns:
        The ``Conv1d`` module (stride 1) wrapped by ``weight_norm``.
    """
    conv = torch.nn.Conv1d(
        in_channels=channels,
        out_channels=channels,
        kernel_size=kernel_size,
        stride=1,
        padding=get_padding(kernel_size, dilation),
        dilation=dilation,
    )
    return weight_norm(conv)
26
+
27
+
28
+ def apply_mask(tensor: torch.Tensor, mask: torch.Tensor | None):
29
+ return tensor * mask if mask else tensor
30
+
31
+
32
+ def apply_mask_(tensor: torch.Tensor, mask: torch.Tensor | None):
33
+ return tensor.mul_(mask) if mask else tensor
34
+
35
+
36
+ class ResBlock(torch.nn.Module):
37
+ """
38
+ A residual block module that applies a series of 1D convolutional layers with residual connections.
39
+ """
40
+
41
+ def __init__(
42
+ self,
43
+ channels: int,
44
+ kernel_size: int = 3,
45
+ dilations: tuple[int] = (1, 3, 5),
46
+ ):
47
+ """
48
+ Initializes the ResBlock.
49
+
50
+ Args:
51
+ channels (int): Number of input and output channels for the convolution layers.
52
+ kernel_size (int): Size of the convolution kernel. Defaults to 3.
53
+ dilations (Tuple[int]): Tuple of dilation rates for the convolution layers in the first set.
54
+
55
+ """
56
+ super().__init__()
57
+ # Create convolutional layers with specified dilations and initialize weights
58
+ self.convs1 = self._create_convs(channels, kernel_size, dilations)
59
+ self.convs2 = self._create_convs(channels, kernel_size, [1] * len(dilations))
60
+
61
+ @staticmethod
62
+ def _create_convs(channels: int, kernel_size: int, dilations: tuple[int]):
63
+ """
64
+ Creates a list of 1D convolutional layers with specified dilations.
65
+
66
+ Args:
67
+ channels (int): Number of input and output channels for the convolution layers.
68
+ kernel_size (int): Size of the convolution kernel.
69
+ dilations (Tuple[int]): Tuple of dilation rates for each convolution layer.
70
+
71
+ """
72
+ layers = torch.nn.ModuleList(
73
+ [create_conv1d_layer(channels, kernel_size, d) for d in dilations],
74
+ )
75
+ layers.apply(init_weights)
76
+ return layers
77
+
78
+ def forward(self, x: torch.Tensor, x_mask: torch.Tensor = None):
79
+ for conv1, conv2 in zip(self.convs1, self.convs2, strict=False):
80
+ x_residual = x
81
+ x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
82
+ x = apply_mask(x, x_mask)
83
+ x = torch.nn.functional.leaky_relu(conv1(x), LRELU_SLOPE)
84
+ x = apply_mask(x, x_mask)
85
+ x = conv2(x)
86
+ x = x + x_residual
87
+ return apply_mask(x, x_mask)
88
+
89
+ def remove_weight_norm(self):
90
+ for conv in chain(self.convs1, self.convs2):
91
+ remove_weight_norm(conv)
92
+
93
+
94
class Flip(torch.nn.Module):
    """
    Flip module for flow-based models.

    Reverses the order of the input along dim 1. In the forward direction a
    zero log-determinant with one entry per batch item is returned as well.
    """

    def forward(self, x, *args, reverse=False, **kwargs):
        flipped = x.flip(1)
        if reverse:
            return flipped
        # One zero per batch item, matching the input dtype and device.
        zero_logdet = flipped.new_zeros(flipped.size(0))
        return flipped, zero_logdet
107
+
108
+
109
+ class ResidualCouplingBlock(torch.nn.Module):
110
+ """
111
+ Residual Coupling Block for normalizing flow.
112
+
113
+ Args:
114
+ channels (int): Number of channels in the input.
115
+ hidden_channels (int): Number of hidden channels in the coupling layer.
116
+ kernel_size (int): Kernel size of the convolutional layers.
117
+ dilation_rate (int): Dilation rate of the convolutional layers.
118
+ n_layers (int): Number of layers in the coupling layer.
119
+ n_flows (int, optional): Number of coupling layers in the block. Defaults to 4.
120
+ gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
121
+
122
+ """
123
+
124
+ def __init__(
125
+ self,
126
+ channels: int,
127
+ hidden_channels: int,
128
+ kernel_size: int,
129
+ dilation_rate: int,
130
+ n_layers: int,
131
+ n_flows: int = 4,
132
+ gin_channels: int = 0,
133
+ ):
134
+ super().__init__()
135
+ self.channels = channels
136
+ self.hidden_channels = hidden_channels
137
+ self.kernel_size = kernel_size
138
+ self.dilation_rate = dilation_rate
139
+ self.n_layers = n_layers
140
+ self.n_flows = n_flows
141
+ self.gin_channels = gin_channels
142
+
143
+ self.flows = torch.nn.ModuleList()
144
+ for _ in range(n_flows):
145
+ self.flows.append(
146
+ ResidualCouplingLayer(
147
+ channels,
148
+ hidden_channels,
149
+ kernel_size,
150
+ dilation_rate,
151
+ n_layers,
152
+ gin_channels=gin_channels,
153
+ mean_only=True,
154
+ ),
155
+ )
156
+ self.flows.append(Flip())
157
+
158
+ def forward(
159
+ self,
160
+ x: torch.Tensor,
161
+ x_mask: torch.Tensor,
162
+ g: torch.Tensor | None = None,
163
+ reverse: bool = False,
164
+ ):
165
+ if not reverse:
166
+ for flow in self.flows:
167
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
168
+ else:
169
+ for flow in reversed(self.flows):
170
+ x = flow.forward(x, x_mask, g=g, reverse=reverse)
171
+ return x
172
+
173
+ def remove_weight_norm(self):
174
+ for i in range(self.n_flows):
175
+ self.flows[i * 2].remove_weight_norm()
176
+
177
+ def __prepare_scriptable__(self):
178
+ for i in range(self.n_flows):
179
+ for hook in self.flows[i * 2]._forward_pre_hooks.values():
180
+ if (
181
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
182
+ and hook.__class__.__name__ == "WeightNorm"
183
+ ):
184
+ torch.nn.utils.remove_weight_norm(self.flows[i * 2])
185
+
186
+ return self
187
+
188
+
189
+ class ResidualCouplingLayer(torch.nn.Module):
190
+ """
191
+ Residual coupling layer for flow-based models.
192
+
193
+ Args:
194
+ channels (int): Number of channels.
195
+ hidden_channels (int): Number of hidden channels.
196
+ kernel_size (int): Size of the convolutional kernel.
197
+ dilation_rate (int): Dilation rate of the convolution.
198
+ n_layers (int): Number of convolutional layers.
199
+ p_dropout (float, optional): Dropout probability. Defaults to 0.
200
+ gin_channels (int, optional): Number of conditioning channels. Defaults to 0.
201
+ mean_only (bool, optional): Whether to use mean-only coupling. Defaults to False.
202
+
203
+ """
204
+
205
+ def __init__(
206
+ self,
207
+ channels: int,
208
+ hidden_channels: int,
209
+ kernel_size: int,
210
+ dilation_rate: int,
211
+ n_layers: int,
212
+ p_dropout: float = 0,
213
+ gin_channels: int = 0,
214
+ mean_only: bool = False,
215
+ ):
216
+ assert channels % 2 == 0, "channels should be divisible by 2"
217
+ super().__init__()
218
+ self.channels = channels
219
+ self.hidden_channels = hidden_channels
220
+ self.kernel_size = kernel_size
221
+ self.dilation_rate = dilation_rate
222
+ self.n_layers = n_layers
223
+ self.half_channels = channels // 2
224
+ self.mean_only = mean_only
225
+
226
+ self.pre = torch.nn.Conv1d(self.half_channels, hidden_channels, 1)
227
+ self.enc = WaveNet(
228
+ hidden_channels,
229
+ kernel_size,
230
+ dilation_rate,
231
+ n_layers,
232
+ p_dropout=p_dropout,
233
+ gin_channels=gin_channels,
234
+ )
235
+ self.post = torch.nn.Conv1d(
236
+ hidden_channels,
237
+ self.half_channels * (2 - mean_only),
238
+ 1,
239
+ )
240
+ self.post.weight.data.zero_()
241
+ self.post.bias.data.zero_()
242
+
243
+ def forward(
244
+ self,
245
+ x: torch.Tensor,
246
+ x_mask: torch.Tensor,
247
+ g: torch.Tensor | None = None,
248
+ reverse: bool = False,
249
+ ):
250
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
251
+ h = self.pre(x0) * x_mask
252
+ h = self.enc(h, x_mask, g=g)
253
+ stats = self.post(h) * x_mask
254
+ if not self.mean_only:
255
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
256
+ else:
257
+ m = stats
258
+ logs = torch.zeros_like(m)
259
+
260
+ if not reverse:
261
+ x1 = m + x1 * torch.exp(logs) * x_mask
262
+ x = torch.cat([x0, x1], 1)
263
+ logdet = torch.sum(logs, [1, 2])
264
+ return x, logdet
265
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
266
+ x = torch.cat([x0, x1], 1)
267
+ return x
268
+
269
+ def remove_weight_norm(self):
270
+ self.enc.remove_weight_norm()
271
+ rm()
rvc_logic/rvc/lib/algorithm/synthesizers.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import logging
4
+
5
+ import torch
6
+
7
+ from rvc_logic.rvc.lib.algorithm.commons import rand_slice_segments, slice_segments
8
+ from rvc_logic.rvc.lib.algorithm.encoders import PosteriorEncoder, TextEncoder
9
+ from rvc_logic.rvc.lib.algorithm.generators.hifigan import HiFiGANGenerator
10
+ from rvc_logic.rvc.lib.algorithm.generators.hifigan_mrf import HiFiGANMRFGenerator
11
+ from rvc_logic.rvc.lib.algorithm.generators.hifigan_nsf import HiFiGANNSFGenerator
12
+ from rvc_logic.rvc.lib.algorithm.generators.refinegan import RefineGANGenerator
13
+ from rvc_logic.rvc.lib.algorithm.residuals import ResidualCouplingBlock
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ class Synthesizer(torch.nn.Module):
19
+ """
20
+ Base Synthesizer model.
21
+
22
+ Args:
23
+ spec_channels (int): Number of channels in the spectrogram.
24
+ segment_size (int): Size of the audio segment.
25
+ inter_channels (int): Number of channels in the intermediate layers.
26
+ hidden_channels (int): Number of channels in the hidden layers.
27
+ filter_channels (int): Number of channels in the filter layers.
28
+ n_heads (int): Number of attention heads.
29
+ n_layers (int): Number of layers in the encoder.
30
+ kernel_size (int): Size of the convolution kernel.
31
+ p_dropout (float): Dropout probability.
32
+ resblock (str): Type of residual block.
33
+ resblock_kernel_sizes (list): Kernel sizes for the residual blocks.
34
+ resblock_dilation_sizes (list): Dilation sizes for the residual blocks.
35
+ upsample_rates (list): Upsampling rates for the decoder.
36
+ upsample_initial_channel (int): Number of channels in the initial upsampling layer.
37
+ upsample_kernel_sizes (list): Kernel sizes for the upsampling layers.
38
+ spk_embed_dim (int): Dimension of the speaker embedding.
39
+ gin_channels (int): Number of channels in the global conditioning vector.
40
+ sr (int): Sampling rate of the audio.
41
+ use_f0 (bool): Whether to use F0 information.
42
+ text_enc_hidden_dim (int): Hidden dimension for the text encoder.
43
+ kwargs: Additional keyword arguments.
44
+
45
+ """
46
+
47
+ def __init__(
48
+ self,
49
+ spec_channels: int,
50
+ segment_size: int,
51
+ inter_channels: int,
52
+ hidden_channels: int,
53
+ filter_channels: int,
54
+ n_heads: int,
55
+ n_layers: int,
56
+ kernel_size: int,
57
+ p_dropout: float,
58
+ resblock: str,
59
+ resblock_kernel_sizes: list,
60
+ resblock_dilation_sizes: list,
61
+ upsample_rates: list,
62
+ upsample_initial_channel: int,
63
+ upsample_kernel_sizes: list,
64
+ spk_embed_dim: int,
65
+ gin_channels: int,
66
+ sr: int,
67
+ use_f0: bool,
68
+ text_enc_hidden_dim: int = 768,
69
+ vocoder: str = "HiFi-GAN",
70
+ randomized: bool = True,
71
+ checkpointing: bool = False,
72
+ **kwargs,
73
+ ):
74
+ super().__init__()
75
+ self.segment_size = segment_size
76
+ self.use_f0 = use_f0
77
+ self.randomized = randomized
78
+
79
+ self.enc_p = TextEncoder(
80
+ inter_channels,
81
+ hidden_channels,
82
+ filter_channels,
83
+ n_heads,
84
+ n_layers,
85
+ kernel_size,
86
+ p_dropout,
87
+ text_enc_hidden_dim,
88
+ f0=use_f0,
89
+ )
90
+ logger.info("Using %s vocoder", vocoder)
91
+ if use_f0:
92
+ if vocoder == "MRF HiFi-GAN":
93
+ self.dec = HiFiGANMRFGenerator(
94
+ in_channel=inter_channels,
95
+ upsample_initial_channel=upsample_initial_channel,
96
+ upsample_rates=upsample_rates,
97
+ upsample_kernel_sizes=upsample_kernel_sizes,
98
+ resblock_kernel_sizes=resblock_kernel_sizes,
99
+ resblock_dilations=resblock_dilation_sizes,
100
+ gin_channels=gin_channels,
101
+ sample_rate=sr,
102
+ harmonic_num=8,
103
+ checkpointing=checkpointing,
104
+ )
105
+ elif vocoder == "RefineGAN":
106
+ self.dec = RefineGANGenerator(
107
+ sample_rate=sr,
108
+ downsample_rates=upsample_rates[::-1],
109
+ upsample_rates=upsample_rates,
110
+ start_channels=16,
111
+ num_mels=inter_channels,
112
+ checkpointing=checkpointing,
113
+ )
114
+ else:
115
+ self.dec = HiFiGANNSFGenerator(
116
+ inter_channels,
117
+ resblock_kernel_sizes,
118
+ resblock_dilation_sizes,
119
+ upsample_rates,
120
+ upsample_initial_channel,
121
+ upsample_kernel_sizes,
122
+ gin_channels=gin_channels,
123
+ sr=sr,
124
+ checkpointing=checkpointing,
125
+ )
126
+ elif vocoder == "MRF HiFi-GAN":
127
+ print("MRF HiFi-GAN does not support training without pitch guidance.")
128
+ self.dec = None
129
+ elif vocoder == "RefineGAN":
130
+ print("RefineGAN does not support training without pitch guidance.")
131
+ self.dec = None
132
+ else:
133
+ self.dec = HiFiGANGenerator(
134
+ inter_channels,
135
+ resblock_kernel_sizes,
136
+ resblock_dilation_sizes,
137
+ upsample_rates,
138
+ upsample_initial_channel,
139
+ upsample_kernel_sizes,
140
+ gin_channels=gin_channels,
141
+ )
142
+ self.enc_q = PosteriorEncoder(
143
+ spec_channels,
144
+ inter_channels,
145
+ hidden_channels,
146
+ 5,
147
+ 1,
148
+ 16,
149
+ gin_channels=gin_channels,
150
+ )
151
+ self.flow = ResidualCouplingBlock(
152
+ inter_channels,
153
+ hidden_channels,
154
+ 5,
155
+ 1,
156
+ 3,
157
+ gin_channels=gin_channels,
158
+ )
159
+ self.emb_g = torch.nn.Embedding(spk_embed_dim, gin_channels)
160
+
161
+ def _remove_weight_norm_from(self, module):
162
+ for hook in module._forward_pre_hooks.values():
163
+ if getattr(hook, "__class__", None).__name__ == "WeightNorm":
164
+ torch.nn.utils.remove_weight_norm(module)
165
+
166
+ def remove_weight_norm(self):
167
+ for module in [self.dec, self.flow, self.enc_q]:
168
+ self._remove_weight_norm_from(module)
169
+
170
+ def __prepare_scriptable__(self):
171
+ self.remove_weight_norm()
172
+ return self
173
+
174
+ def forward(
175
+ self,
176
+ phone: torch.Tensor,
177
+ phone_lengths: torch.Tensor,
178
+ pitch: torch.Tensor | None = None,
179
+ pitchf: torch.Tensor | None = None,
180
+ y: torch.Tensor | None = None,
181
+ y_lengths: torch.Tensor | None = None,
182
+ ds: torch.Tensor | None = None,
183
+ ):
184
+ g = self.emb_g(ds).unsqueeze(-1)
185
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
186
+
187
+ if y is not None:
188
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
189
+ z_p = self.flow(z, y_mask, g=g)
190
+ # regular old training method using random slices
191
+ if self.randomized:
192
+ z_slice, ids_slice = rand_slice_segments(
193
+ z,
194
+ y_lengths,
195
+ self.segment_size,
196
+ )
197
+ if self.use_f0:
198
+ pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2)
199
+ o = self.dec(z_slice, pitchf, g=g)
200
+ else:
201
+ o = self.dec(z_slice, g=g)
202
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
203
+ # future use for finetuning using the entire dataset each pass
204
+ if self.use_f0:
205
+ o = self.dec(z, pitchf, g=g)
206
+ else:
207
+ o = self.dec(z, g=g)
208
+ return o, None, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
209
+ return None, None, x_mask, None, (None, None, m_p, logs_p, None, None)
210
+
211
+ @torch.jit.export
212
+ def infer(
213
+ self,
214
+ phone: torch.Tensor,
215
+ phone_lengths: torch.Tensor,
216
+ pitch: torch.Tensor | None = None,
217
+ nsff0: torch.Tensor | None = None,
218
+ sid: torch.Tensor = None,
219
+ rate: torch.Tensor | None = None,
220
+ ):
221
+ """
222
+ Inference of the model.
223
+
224
+ Args:
225
+ phone (torch.Tensor): Phoneme sequence.
226
+ phone_lengths (torch.Tensor): Lengths of the phoneme sequences.
227
+ pitch (torch.Tensor, optional): Pitch sequence.
228
+ nsff0 (torch.Tensor, optional): Fine-grained pitch sequence.
229
+ sid (torch.Tensor): Speaker embedding.
230
+ rate (torch.Tensor, optional): Rate for time-stretching.
231
+
232
+ """
233
+ g = self.emb_g(sid).unsqueeze(-1)
234
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
235
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
236
+
237
+ if rate is not None:
238
+ head = int(z_p.shape[2] * (1.0 - rate.item()))
239
+ z_p, x_mask = z_p[:, :, head:], x_mask[:, :, head:]
240
+ if self.use_f0 and nsff0 is not None:
241
+ nsff0 = nsff0[:, head:]
242
+
243
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
244
+ o = (
245
+ self.dec(z * x_mask, nsff0, g=g)
246
+ if self.use_f0
247
+ else self.dec(z * x_mask, g=g)
248
+ )
249
+
250
+ return o, x_mask, (z, z_p, m_p, logs_p)
251
+ , z_p, m_p, logs_p)
rvc_logic/rvc/lib/predictors/F0Extractor.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import os
3
+ import pathlib
4
+
5
+ import resampy
6
+ import torchfcpe
7
+
8
+ import numpy as np
9
+
10
+ import torch
11
+ import torchcrepe
12
+
13
+ import librosa
14
+
15
+ from rvc_logic.common import RVC_MODELS_DIR
16
+ from rvc_logic.rvc.configs.config import Config
17
+
18
+ # from tools.anyf0.rmvpe import RMVPE
19
+ from rvc_logic.rvc.lib.predictors.RMVPE import RMVPE0Predictor
20
+
21
+ config = Config()
22
+
23
+
24
@dataclasses.dataclass
class F0Extractor:
    """
    Load an audio file and extract its F0 contour with a selectable method.

    The audio is loaded with ``librosa.load`` right after construction.
    ``extract_f0`` supports the methods "crepe", "fcpe" and "rmvpe" and
    returns the contour converted to cents.
    """

    # Path of the audio file to load.
    wav_path: pathlib.Path
    # Requested sample rate; overwritten with the rate returned by librosa.load.
    sample_rate: int = 44100
    # Hop length; used for ``hop_size`` and the FCPE target length.
    hop_length: int = 512
    # Frequency bounds passed to the crepe and fcpe predictors.
    f0_min: int = 50
    f0_max: int = 1600
    # One of "crepe", "fcpe" or "rmvpe".
    method: str = "rmvpe"
    # Loaded audio samples; populated in __post_init__.
    x: np.ndarray = dataclasses.field(init=False)

    def __post_init__(self):
        self.x, self.sample_rate = librosa.load(self.wav_path, sr=self.sample_rate)

    @property
    def hop_size(self):
        """Hop length divided by the sample rate."""
        return self.hop_length / self.sample_rate

    @property
    def wav16k(self):
        """The loaded audio resampled to 16000 Hz (recomputed on every access)."""
        return resampy.resample(self.x, self.sample_rate, 16000)

    def extract_f0(self):
        """
        Extract F0 with ``self.method`` and convert it to cents.

        Returns:
            The contour in cents relative to ``librosa.midi_to_hz(0)``;
            frames whose F0 is 0 become NaN.

        Raises:
            ValueError: If ``self.method`` is not a supported method.
        """
        method = self.method
        if method == "crepe":
            wav16k_torch = torch.FloatTensor(self.wav16k).unsqueeze(0).to(config.device)
            f0 = torchcrepe.predict(
                wav16k_torch,
                sample_rate=16000,
                hop_length=160,
                batch_size=512,
                fmin=self.f0_min,
                fmax=self.f0_max,
                device=config.device,
            )
            f0 = f0[0].cpu().numpy()
        elif method == "fcpe":
            audio = librosa.to_mono(self.x)
            audio_length = len(audio)
            f0_target_length = (audio_length // self.hop_length) + 1
            audio = (
                torch.from_numpy(audio)
                .float()
                .unsqueeze(0)
                .unsqueeze(-1)
                .to(config.device)
            )
            model = torchfcpe.spawn_bundled_infer_model(device=config.device)

            f0 = model.infer(
                audio,
                sr=self.sample_rate,
                decoder_mode="local_argmax",
                threshold=0.006,
                f0_min=self.f0_min,
                f0_max=self.f0_max,
                interp_uv=False,
                output_interp_target_length=f0_target_length,
            )
            f0 = f0.squeeze().cpu().numpy()
        elif method == "rmvpe":
            model_rmvpe = RMVPE0Predictor(
                os.path.join(str(RVC_MODELS_DIR), "predictors", "rmvpe.pt"),
                device=config.device,
            )
            f0 = model_rmvpe.infer_from_audio(self.wav16k, thred=0.03)

        else:
            raise ValueError(f"Unknown method: {self.method}")
        return self.hz_to_cents(f0, librosa.midi_to_hz(0))

    def plot_f0(self, f0):
        """Plot an F0 contour with matplotlib, titled with the method name."""
        from matplotlib import pyplot as plt

        plt.figure(figsize=(10, 4))
        plt.plot(f0)
        plt.title(self.method)
        plt.xlabel("Time (frames)")
        plt.ylabel("F0 (cents)")
        plt.show()

    @staticmethod
    def hz_to_cents(F, F_ref=55.0):
        """
        Convert frequencies to cents relative to ``F_ref``.

        Zero frequencies are replaced by NaN before the conversion so they
        do not produce ``-inf``.
        """
        F_temp = np.array(F).astype(float)
        F_temp[F_temp == 0] = np.nan
        F_cents = 1200 * np.log2(F_temp / F_ref)
        return F_cents
rvc_logic/rvc/lib/predictors/FCPE.py ADDED
@@ -0,0 +1,965 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import os
3
+ from functools import partial
4
+
5
+ import numpy as np
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import torch.utils.data
10
+ from einops import rearrange, repeat
11
+ from local_attention import LocalAttention
12
+ from torch import nn
13
+ from torch.nn.utils.parametrizations import weight_norm
14
+ from torchaudio.transforms import Resample
15
+
16
+ import librosa
17
+ import soundfile as sf
18
+ from librosa.filters import mel as librosa_mel_fn
19
+
20
+ os.environ["LRU_CACHE_CAPACITY"] = "3"
21
+
22
+
23
def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
    """
    Loads wav file to torch tensor.

    Args:
        full_path: Path of the audio file, read with ``soundfile``.
        target_sr: Optional sample rate to resample to.
        return_empty_on_exception: When true, a read failure or non-finite
            samples yield ``([], sample_rate)`` instead of an error.

    Returns:
        ``(data, sample_rate)``, where ``data`` is a 1-D float tensor built
        from the first channel and scaled by the detected magnitude range.

    Raises:
        Exception: Whatever the read raised, when
            ``return_empty_on_exception`` is false.
    """
    try:
        data, sample_rate = sf.read(full_path, always_2d=True)
    except Exception as error:
        print(f"An error occurred loading {full_path}: {error}")
        if return_empty_on_exception:
            # The read failed, so no sample rate is known yet; report the
            # requested rate or fall back to 48000.
            return [], target_sr or 48000
        raise

    # Keep only the first channel.
    data = data[:, 0] if len(data.shape) > 1 else data
    assert len(data) > 2

    # Normalize data
    max_mag = (
        -np.iinfo(data.dtype).min
        if np.issubdtype(data.dtype, np.integer)
        else max(np.amax(data), -np.amin(data))
    )
    # Pick the divisor from the magnitude range of the samples.
    max_mag = (
        (2**31) + 1 if max_mag > (2**15) else ((2**15) + 1 if max_mag > 1.01 else 1.0)
    )
    data = torch.FloatTensor(data.astype(np.float32)) / max_mag

    # Handle exceptions and resample
    if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:
        return [], sample_rate or target_sr or 48000
    if target_sr is not None and sample_rate != target_sr:
        data = torch.from_numpy(
            librosa.core.resample(
                data.numpy(),
                orig_sr=sample_rate,
                target_sr=target_sr,
            ),
        )
        sample_rate = target_sr

    return data, sample_rate
61
+
62
+
63
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Clip ``x`` from below at ``clip_val``, scale by ``C`` and take the natural log."""
    floored = np.clip(x, a_min=clip_val, a_max=None)
    scaled = floored * C
    return np.log(scaled)
65
+
66
+
67
def dynamic_range_decompression(x, C=1):
    """Exponentiate ``x`` and divide by ``C``."""
    restored = np.exp(x)
    return restored / C
69
+
70
+
71
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Clamp ``x`` from below at ``clip_val``, scale by ``C`` and take the natural log."""
    floored = torch.clamp(x, min=clip_val)
    scaled = floored * C
    return torch.log(scaled)
73
+
74
+
75
def dynamic_range_decompression_torch(x, C=1):
    """Exponentiate ``x`` and divide by ``C``."""
    restored = torch.exp(x)
    return restored / C
77
+
78
+
79
class STFT:
    """
    Log-mel spectrogram extractor with per-device caches.

    Mel filterbanks and Hann windows are cached in dictionaries on the
    instance; the caches are bypassed when ``get_mel`` is called with
    ``train=True``.
    """

    def __init__(
        self,
        sr=22050,
        n_mels=80,
        n_fft=1024,
        win_size=1024,
        hop_length=256,
        fmin=20,
        fmax=11025,
        clip_val=1e-5,
    ):
        self.target_sr = sr
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.win_size = win_size
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.clip_val = clip_val
        # Caches keyed by "<fmax>_<device>" and "<keyshift>_<device>".
        self.mel_basis = {}
        self.hann_window = {}

    def get_mel(self, y, keyshift=0, speed=1, center=False, train=False):
        """
        Compute the log-mel spectrogram of ``y``.

        Args:
            y: Audio tensor; padded along its last dimension before the STFT.
            keyshift: Shift in semitones; scales the FFT and window sizes by
                ``2 ** (keyshift / 12)``.
            speed: Multiplier applied to the hop length.
            center: Forwarded to ``torch.stft``.
            train: When true, fresh caches are used for this call.

        Returns:
            The mel-projected magnitude spectrogram after log compression.
        """
        scale = 2 ** (keyshift / 12)
        shifted_n_fft = int(np.round(self.n_fft * scale))
        shifted_win = int(np.round(self.win_size * scale))
        scaled_hop = int(np.round(self.hop_length * speed))

        # Optimize mel_basis and hann_window caching
        if train:
            basis_cache, window_cache = {}, {}
        else:
            basis_cache, window_cache = self.mel_basis, self.hann_window

        basis_key = str(self.fmax) + "_" + str(y.device)
        if basis_key not in basis_cache:
            filters = librosa_mel_fn(
                sr=self.target_sr,
                n_fft=self.n_fft,
                n_mels=self.n_mels,
                fmin=self.fmin,
                fmax=self.fmax,
            )
            basis_cache[basis_key] = torch.from_numpy(filters).float().to(y.device)

        window_key = str(keyshift) + "_" + str(y.device)
        if window_key not in window_cache:
            window_cache[window_key] = torch.hann_window(shifted_win).to(y.device)

        # Padding and STFT
        overlap = shifted_win - scaled_hop
        pad_left = overlap // 2
        pad_right = max(
            (overlap + 1) // 2,
            shifted_win - y.size(-1) - pad_left,
        )
        # Reflect padding is only used when the right pad is shorter than the signal.
        pad_mode = "reflect" if pad_right < y.size(-1) else "constant"
        padded = torch.nn.functional.pad(
            y.unsqueeze(1),
            (pad_left, pad_right),
            mode=pad_mode,
        )
        y = padded.squeeze(1)

        spec = torch.stft(
            y,
            n_fft=shifted_n_fft,
            hop_length=scaled_hop,
            win_length=shifted_win,
            window=window_cache[window_key],
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
        # Magnitude with a small constant under the square root.
        spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9))

        # Handle keyshift and mel conversion
        if keyshift != 0:
            target_bins = self.n_fft // 2 + 1
            current_bins = spec.size(1)
            if current_bins < target_bins:
                spec = F.pad(spec, (0, 0, 0, target_bins - current_bins))
            else:
                spec = spec[:, :target_bins, :]
            spec = spec * self.win_size / shifted_win
        spec = torch.matmul(basis_cache[basis_key], spec)
        return dynamic_range_compression_torch(spec, clip_val=self.clip_val)

    def __call__(self, audiopath):
        """Load ``audiopath`` at the target sample rate and return its log-mel spectrogram."""
        audio, _ = load_wav_to_torch(audiopath, target_sr=self.target_sr)
        return self.get_mel(audio.unsqueeze(0)).squeeze(0)
178
+
179
+
180
+ stft = STFT()
181
+
182
+
183
def softmax_kernel(
    data,
    *,
    projection_matrix,
    is_query,
    normalize_data=True,
    eps=1e-4,
    device=None,
):
    """
    Map ``data`` to positive features using a projection matrix.

    Args:
        data: Input tensor; its first two dimensions are used as the batch
            and head counts when broadcasting the projection matrix.
        projection_matrix: 2-D projection with one row per feature.
        is_query: Selects the query formula (row-wise maximum subtracted,
            ``eps`` added after the exponential) or the key formula
            (``eps`` added inside the exponential).
        normalize_data: Whether to scale ``data`` by ``dim ** -0.25``.
        eps: Small constant used as described above.
        device: Unused.

    Returns:
        The projected features cast to the type of ``data``.
    """
    batch, heads, *_ = data.shape

    # Normalize data
    scale = (data.shape[-1] ** -0.25) if normalize_data else 1.0

    # Project data
    ratio = projection_matrix.shape[0] ** -0.5
    projection = repeat(projection_matrix, "j d -> b h j d", b=batch, h=heads)
    projection = projection.type_as(data)
    projected = torch.einsum("...id,...jd->...ij", (scale * data), projection)

    # Half of the squared norm of the scaled data, kept as a trailing axis.
    half_sq_norm = torch.sum(data**2, dim=-1)
    half_sq_norm = (half_sq_norm / 2.0) * (scale**2)
    half_sq_norm = half_sq_norm.unsqueeze(dim=-1)

    if is_query:
        row_max = torch.max(projected, dim=-1, keepdim=True).values
        features = ratio * (torch.exp(projected - half_sq_norm - row_max) + eps)
    else:
        features = ratio * (torch.exp(projected - half_sq_norm + eps))

    return features.type_as(data)
223
+
224
+
225
def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None):
    """
    Return the transposed Q factor of a random ``cols x cols`` Gaussian matrix.

    The QR decomposition runs on the CPU and its factors are moved to
    ``device``. With ``qr_uniform_q`` the columns of Q are multiplied by the
    signs of the diagonal of R first.
    """
    gaussian = torch.randn((cols, cols), device=device)
    q, r = torch.linalg.qr(gaussian.cpu(), mode="reduced")
    q = q.to(device)
    r = r.to(device)

    if qr_uniform_q:
        q *= torch.diag(r, 0).sign()
    return q.t()
234
+
235
+
236
def exists(val):
    """Return True unless ``val`` is None."""
    return not (val is None)
238
+
239
+
240
def empty(tensor):
    """Return True when ``tensor`` holds no elements."""
    element_count = tensor.numel()
    return element_count == 0
242
+
243
+
244
def default(val, d):
    """Return ``val`` unless it is None, in which case return ``d``."""
    if val is None:
        return d
    return val
246
+
247
+
248
def cast_tuple(val):
    """Return ``val`` if it is already a tuple, otherwise wrap it in a 1-tuple."""
    if isinstance(val, tuple):
        return val
    return (val,)
250
+
251
+
252
class PCmer(nn.Module):
    """
    Stack of ``num_layers`` encoder layers applied in sequence.

    The constructor arguments are stored on the instance; each
    ``_EncoderLayer`` receives this module and reads its settings from it.
    """

    def __init__(
        self,
        num_layers,
        num_heads,
        dim_model,
        dim_keys,
        dim_values,
        residual_dropout,
        attention_dropout,
    ):
        super().__init__()
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dim_model = dim_model
        self.dim_values = dim_values
        self.dim_keys = dim_keys
        self.residual_dropout = residual_dropout
        self.attention_dropout = attention_dropout

        # Built after the settings above so each layer can read them.
        encoder_layers = []
        for _ in range(num_layers):
            encoder_layers.append(_EncoderLayer(self))
        self._layers = nn.ModuleList(encoder_layers)

    def forward(self, phone, mask=None):
        """Pass ``phone`` through every layer in order, forwarding ``mask`` to each."""
        hidden = phone
        for encoder_layer in self._layers:
            hidden = encoder_layer(hidden, mask)
        return hidden
278
+
279
+
280
class _EncoderLayer(nn.Module):
    """
    One encoder layer: pre-norm self-attention, then a conformer convolution
    module, each added back to its input as a residual.
    """

    def __init__(self, parent: PCmer):
        super().__init__()
        model_width = parent.dim_model
        self.conformer = ConformerConvModule(model_width)
        self.norm = nn.LayerNorm(model_width)
        # NOTE(review): created here but not applied in forward.
        self.dropout = nn.Dropout(parent.residual_dropout)
        self.attn = SelfAttention(
            dim=model_width,
            heads=parent.num_heads,
            causal=False,
        )

    def forward(self, phone, mask=None):
        """Apply the attention and conformer residual branches in turn."""
        attended = self.attn(self.norm(phone), mask=mask)
        phone = phone + attended
        convolved = self.conformer(phone)
        return phone + convolved
296
+
297
+
298
def calc_same_padding(kernel_size):
    """Return ``(left, right)`` padding amounts for a kernel of ``kernel_size``.

    The left amount is ``kernel_size // 2``; the right amount is one less
    than that when ``kernel_size`` is even and equal to it when odd.
    """
    left = kernel_size // 2
    even_correction = (kernel_size + 1) % 2  # 1 for even kernels, 0 for odd
    right = left - even_correction
    return (left, right)
301
+
302
+
303
class Swish(nn.Module):
    """Element-wise ``x * sigmoid(x)`` activation."""

    def forward(self, x):
        gate = x.sigmoid()
        return x * gate
306
+
307
+
308
class Transpose(nn.Module):
    """Module wrapper around ``Tensor.transpose`` for use inside ``nn.Sequential``.

    Args:
        dims: The two dimensions to swap; its length must be exactly two.
    """

    def __init__(self, dims):
        super().__init__()
        dim_count = len(dims)
        assert dim_count == 2, "dims must be a tuple of two dimensions"
        self.dims = dims

    def forward(self, x):
        first, second = self.dims
        return x.transpose(first, second)
316
+
317
+
318
class GLU(nn.Module):
    """Gated linear unit: split the input in two along ``dim`` and gate one half.

    Args:
        dim: Dimension along which the input is chunked into two parts.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        # First chunk is the value, second chunk drives the sigmoid gate.
        halves = x.chunk(2, dim=self.dim)
        value, gate = halves
        return value * gate.sigmoid()
326
+
327
+
328
class DepthWiseConv1d(nn.Module):
    """1-D convolution with ``groups=chan_in`` preceded by explicit padding.

    Args:
        chan_in: Input channel count; also used as the group count.
        chan_out: Output channel count.
        kernel_size: Convolution kernel size.
        padding: Padding spec forwarded unchanged to ``F.pad``.
    """

    def __init__(self, chan_in, chan_out, kernel_size, padding):
        super().__init__()
        self.padding = padding
        self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)

    def forward(self, x):
        padded = F.pad(x, self.padding)
        return self.conv(padded)
337
+
338
+
339
class ConformerConvModule(nn.Module):
    """Conformer-style convolution block assembled as a single ``nn.Sequential``.

    The pipeline is: LayerNorm, transpose dims 1 and 2, pointwise conv to
    ``2 * inner_dim`` channels, GLU over dim 1, depth-wise conv, Swish,
    pointwise conv back to ``dim`` channels, transpose back, dropout.

    Args:
        dim: Feature dimension of the input and output.
        causal: When true, pad ``kernel_size - 1`` on the left only; otherwise
            use ``calc_same_padding``.
        expansion_factor: ``inner_dim`` is ``dim * expansion_factor``.
        kernel_size: Kernel size of the depth-wise convolution.
        dropout: Probability of the final dropout layer.
    """

    def __init__(
        self,
        dim,
        causal=False,
        expansion_factor=2,
        kernel_size=31,
        dropout=0.0,
    ):
        super().__init__()

        inner_dim = dim * expansion_factor
        if causal:
            padding = (kernel_size - 1, 0)
        else:
            padding = calc_same_padding(kernel_size)

        stages = [
            nn.LayerNorm(dim),
            Transpose((1, 2)),
            nn.Conv1d(dim, inner_dim * 2, 1),
            GLU(dim=1),
            DepthWiseConv1d(
                inner_dim,
                inner_dim,
                kernel_size=kernel_size,
                padding=padding,
            ),
            Swish(),
            nn.Conv1d(inner_dim, dim, 1),
            Transpose((1, 2)),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the convolution pipeline to ``x``."""
        return self.net(x)
372
+
373
+
374
def linear_attention(q, k, v):
    """Non-causal linear attention computed with einsum contractions.

    Args:
        q: Query features, indexed ``(..., n, d)`` by the einsum specs below.
        k: Key features, indexed ``(..., n, d)``.
        v: Values, indexed ``(..., n, e)``, or ``None``.

    Returns:
        With ``v`` given: keys and values are contracted over the sequence
        axis, applied to the queries, and scaled by the reciprocal of
        ``q . sum_n(k)`` (plus ``1e-8``). With ``v is None``: the pairwise
        products between ``q`` and ``k`` contracted over ``d``.
    """
    if v is None:
        return torch.einsum("...ed,...nd->...ne", k, q)

    key_totals = k.sum(dim=-2).type_as(q)
    normaliser = torch.einsum("...nd,...d->...n", q, key_totals) + 1e-8
    inverse_norm = 1.0 / normaliser
    kv_context = torch.einsum("...nd,...ne->...de", k, v)
    return torch.einsum("...de,...nd,...n->...ne", kv_context, q, inverse_norm)
383
+
384
+
385
def gaussian_orthogonal_random_matrix(
    nb_rows,
    nb_columns,
    scaling=0,
    qr_uniform_q=False,
    device=None,
):
    """Build an ``nb_rows x nb_columns`` matrix from stacked orthogonal blocks.

    Square ``nb_columns``-sized blocks from ``orthogonal_matrix_chunk`` are
    concatenated row-wise (the last block is truncated if ``nb_rows`` is not a
    multiple of ``nb_columns``), then every row is scaled.

    Args:
        nb_rows: Number of rows of the result.
        nb_columns: Number of columns of the result.
        scaling: ``0`` scales each row by the norm of a fresh Gaussian row;
            ``1`` scales every row by ``sqrt(nb_columns)``.
        qr_uniform_q: Forwarded to ``orthogonal_matrix_chunk``.
        device: Device for all sampled tensors.

    Raises:
        ValueError: If ``scaling`` is neither ``0`` nor ``1``.
    """

    def draw_block():
        return orthogonal_matrix_chunk(
            nb_columns,
            qr_uniform_q=qr_uniform_q,
            device=device,
        )

    full_block_count = int(nb_rows / nb_columns)
    blocks = [draw_block() for _ in range(full_block_count)]

    leftover_rows = nb_rows - full_block_count * nb_columns
    if leftover_rows > 0:
        # Only the first `leftover_rows` rows of one more block are needed.
        blocks.append(draw_block()[:leftover_rows])

    stacked = torch.cat(blocks)

    if scaling == 0:
        row_scale = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1)
    elif scaling == 1:
        row_scale = math.sqrt(float(nb_columns)) * torch.ones(
            (nb_rows,),
            device=device,
        )
    else:
        raise ValueError(f"Invalid scaling {scaling}")

    return torch.diag(row_scale) @ stacked
425
+
426
+
427
class FastAttention(nn.Module):
    """Linear attention over random-feature (or softmax-normalised) queries and keys.

    A random projection matrix is created at construction time and stored as
    the ``projection_matrix`` buffer; ``redraw_projection_matrix`` resamples it.

    Args:
        dim_heads: Per-head feature dimension.
        nb_features: Number of random features; defaults to
            ``int(dim_heads * log(dim_heads))``.
        ortho_scaling: ``scaling`` argument for the projection matrix builder.
        causal: Selects the causal attention function in ``forward``.
        generalized_attention: Stored on the module.
        kernel_fn: Stored on the module.
        qr_uniform_q: Forwarded to the projection matrix builder.
        no_projection: When true, skip the random-feature kernel and apply
            softmax (or ``exp`` for causal keys) directly.
    """

    def __init__(
        self,
        dim_heads,
        nb_features=None,
        ortho_scaling=0,
        causal=False,
        generalized_attention=False,
        kernel_fn=nn.ReLU(),
        qr_uniform_q=False,
        no_projection=False,
    ):
        super().__init__()
        fallback_features = int(dim_heads * math.log(dim_heads))
        nb_features = default(nb_features, fallback_features)

        self.dim_heads = dim_heads
        self.nb_features = nb_features
        self.ortho_scaling = ortho_scaling

        # Bound factory so the matrix can be resampled later with identical settings.
        self.create_projection = partial(
            gaussian_orthogonal_random_matrix,
            nb_rows=self.nb_features,
            nb_columns=dim_heads,
            scaling=ortho_scaling,
            qr_uniform_q=qr_uniform_q,
        )
        self.register_buffer("projection_matrix", self.create_projection())

        self.generalized_attention = generalized_attention
        self.kernel_fn = kernel_fn
        self.no_projection = no_projection
        self.causal = causal

    @torch.no_grad()
    def redraw_projection_matrix(self):
        """Resample the projection and copy it into the existing buffer in place."""
        fresh_projection = self.create_projection()
        self.projection_matrix.copy_(fresh_projection)
        del fresh_projection

    def forward(self, q, k, v):
        """Map ``q``/``k`` to features and run the selected attention function.

        ``v`` may be ``None``; it is passed through to the attention function
        unchanged.
        """
        if self.no_projection:
            q = q.softmax(dim=-1)
            k = torch.exp(k) if self.causal else k.softmax(dim=-2)
        else:
            feature_map = partial(
                softmax_kernel,
                projection_matrix=self.projection_matrix,
                device=q.device,
            )
            q = feature_map(q, is_query=True)
            k = feature_map(k, is_query=False)

        # NOTE(review): `causal_linear_fn` is not defined anywhere in this
        # class, so the causal path fails on attribute lookup here.
        if self.causal:
            attn_fn = self.causal_linear_fn
        else:
            attn_fn = linear_attention

        return attn_fn(q, k, v)
489
+
490
+
491
class SelfAttention(nn.Module):
    """Multi-head attention mixing ``FastAttention`` global heads with optional local heads.

    The first ``heads - local_heads`` heads go through ``FastAttention``; the
    remaining ``local_heads`` heads (if any) go through ``LocalAttention``.

    Args:
        dim: Model dimension; must be divisible by ``heads``.
        causal: Forwarded to both attention implementations.
        heads: Total number of heads.
        dim_head: Per-head dimension (``dim // heads`` if ``None``).
        local_heads: Number of heads routed to local attention.
        local_window_size: Window size for local attention.
        nb_features: Forwarded to ``FastAttention``.
        feature_redraw_interval: Accepted but not used by this class.
        generalized_attention: Forwarded to ``FastAttention``.
        kernel_fn: Forwarded to ``FastAttention``.
        qr_uniform_q: Forwarded to ``FastAttention``.
        dropout: Dropout probability for local attention and the output.
        no_projection: Forwarded to ``FastAttention``.
    """

    def __init__(
        self,
        dim,
        causal=False,
        heads=8,
        dim_head=64,
        local_heads=0,
        local_window_size=256,
        nb_features=None,
        feature_redraw_interval=1000,
        generalized_attention=False,
        kernel_fn=nn.ReLU(),
        qr_uniform_q=False,
        dropout=0.0,
        no_projection=False,
    ):
        super().__init__()
        assert dim % heads == 0, "dimension must be divisible by number of heads"
        dim_head = default(dim_head, dim // heads)
        inner_dim = dim_head * heads
        self.fast_attention = FastAttention(
            dim_head,
            nb_features,
            causal=causal,
            generalized_attention=generalized_attention,
            kernel_fn=kernel_fn,
            qr_uniform_q=qr_uniform_q,
            no_projection=no_projection,
        )

        self.heads = heads
        self.global_heads = heads - local_heads
        if local_heads > 0:
            self.local_attn = LocalAttention(
                window_size=local_window_size,
                causal=causal,
                autopad=True,
                dropout=dropout,
                look_forward=int(not causal),
                rel_pos_emb_config=(dim_head, local_heads),
            )
        else:
            self.local_attn = None

        self.to_q = nn.Linear(dim, inner_dim)
        self.to_k = nn.Linear(dim, inner_dim)
        self.to_v = nn.Linear(dim, inner_dim)
        self.to_out = nn.Linear(inner_dim, dim)
        self.dropout = nn.Dropout(dropout)

    @torch.no_grad()
    def redraw_projection_matrix(self):
        """Resample the random projection of the wrapped ``FastAttention``."""
        self.fast_attention.redraw_projection_matrix()

    def forward(
        self,
        x,
        context=None,
        mask=None,
        context_mask=None,
        name=None,
        inference=False,
        **kwargs,
    ):
        """Attend over ``x`` (or ``context``) and project back to ``dim``.

        ``x`` must have exactly three dimensions. ``name``, ``inference`` and
        extra keyword arguments are accepted but ignored.
        """
        # Unpacking doubles as a check that x is three-dimensional.
        _batch, _length, _width = x.shape
        h = self.heads
        gh = self.global_heads

        cross_attend = exists(context)
        context = default(context, x)
        if not cross_attend:
            context_mask = default(context_mask, mask)

        q = rearrange(self.to_q(x), "b n (h d) -> b h n d", h=h)
        k = rearrange(self.to_k(context), "b n (h d) -> b h n d", h=h)
        v = rearrange(self.to_v(context), "b n (h d) -> b h n d", h=h)

        # Leading `gh` heads are global, the rest are local.
        q, lq = q[:, :gh], q[:, gh:]
        k, lk = k[:, :gh], k[:, gh:]
        v, lv = v[:, :gh], v[:, gh:]

        attn_outs = []
        if not empty(q):
            if exists(context_mask):
                global_mask = context_mask[:, None, :, None]
                v.masked_fill_(~global_mask, 0.0)
            # TODO: Implement cross-attention
            # NOTE(review): with a context given, `out` is never assigned
            # before the append below.
            if not cross_attend:
                out = self.fast_attention(q, k, v)
            attn_outs.append(out)

        if not empty(lq):
            assert (
                not cross_attend
            ), "local attention is not compatible with cross attention"
            out = self.local_attn(lq, lk, lv, input_mask=mask)
            attn_outs.append(out)

        merged = torch.cat(attn_outs, dim=1)
        merged = rearrange(merged, "b h n d -> b n (h d)")
        return self.dropout(self.to_out(merged))
589
+
590
+
591
def l2_regularization(model, l2_alpha):
    """Return ``l2_alpha`` times the summed half squared weights of ``nn.Conv2d`` layers.

    Only modules whose type is exactly ``nn.Conv2d`` contribute; subclasses
    and other layer types are skipped.
    """
    conv_penalties = [
        (module.weight**2).sum() / 2.0
        for module in model.modules()
        if type(module) is nn.Conv2d
    ]
    return l2_alpha * sum(conv_penalties)
597
+
598
+
599
class FCPE(nn.Module):
    """Pitch estimator: conv stem, ``PCmer`` encoder, sigmoid head over cent bins.

    ``forward`` returns either a training loss (``infer=False``) or a decoded
    F0 track (``infer=True``).

    Args:
        input_channel: Channel count of the input features.
        out_dims: Number of cent bins predicted per frame.
        n_layers: Number of ``PCmer`` layers.
        n_chans: Hidden width of the stem and the encoder.
        use_siren: Must not be ``True``.
        use_full: Must not be ``True``.
        loss_mse_scale: Multiplier on the binary cross-entropy loss.
        loss_l2_regularization: Whether to add ``l2_regularization`` to the loss.
        loss_l2_regularization_scale: ``l2_alpha`` for that term.
        loss_grad1_mse: Stored on the module.
        loss_grad1_mse_scale: Stored on the module.
        f0_max: Upper F0 bound used for the cent table.
        f0_min: Lower F0 bound used for the cent table.
        confidence: When true, the decoders return ``(cents, confidence)``.
        threshold: Confidence at or below which a frame is masked out.
        use_input_conv: Whether the conv stem is applied to the input.

    Any of the arguments from ``loss_mse_scale`` onwards may be ``None``, in
    which case the signature default is used.

    Raises:
        ValueError: If ``use_siren`` or ``use_full`` is ``True``.
    """

    def __init__(
        self,
        input_channel=128,
        out_dims=360,
        n_layers=12,
        n_chans=512,
        use_siren=False,
        use_full=False,
        loss_mse_scale=10,
        loss_l2_regularization=False,
        loss_l2_regularization_scale=1,
        loss_grad1_mse=False,
        loss_grad1_mse_scale=1,
        f0_max=1975.5,
        f0_min=32.70,
        confidence=False,
        threshold=0.05,
        use_input_conv=True,
    ):
        super().__init__()
        if use_siren is True:
            raise ValueError("Siren is not supported yet.")
        if use_full is True:
            raise ValueError("Full model is not supported yet.")

        self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10
        self.loss_l2_regularization = (
            loss_l2_regularization if (loss_l2_regularization is not None) else False
        )
        self.loss_l2_regularization_scale = (
            loss_l2_regularization_scale
            if (loss_l2_regularization_scale is not None)
            else 1
        )
        self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False
        self.loss_grad1_mse_scale = (
            loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1
        )
        self.f0_max = f0_max if (f0_max is not None) else 1975.5
        self.f0_min = f0_min if (f0_min is not None) else 32.70
        self.confidence = confidence if (confidence is not None) else False
        self.threshold = threshold if (threshold is not None) else 0.05
        self.use_input_conv = use_input_conv if (use_input_conv is not None) else True

        # Build the table from the None-defaulted attributes rather than the
        # raw arguments, so f0_min=None / f0_max=None fall back to the
        # defaults here as well instead of failing in torch.Tensor([None]).
        self.cent_table_b = torch.Tensor(
            np.linspace(
                self.f0_to_cent(torch.Tensor([self.f0_min]))[0],
                self.f0_to_cent(torch.Tensor([self.f0_max]))[0],
                out_dims,
            ),
        )
        self.register_buffer("cent_table", self.cent_table_b)

        # conv in stack
        _leaky = nn.LeakyReLU()
        self.stack = nn.Sequential(
            nn.Conv1d(input_channel, n_chans, 3, 1, 1),
            nn.GroupNorm(4, n_chans),
            _leaky,
            nn.Conv1d(n_chans, n_chans, 3, 1, 1),
        )

        # transformer
        self.decoder = PCmer(
            num_layers=n_layers,
            num_heads=8,
            dim_model=n_chans,
            dim_keys=n_chans,
            dim_values=n_chans,
            residual_dropout=0.1,
            attention_dropout=0.1,
        )
        self.norm = nn.LayerNorm(n_chans)

        # out
        self.n_out = out_dims
        self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out))

    def forward(
        self,
        mel,
        infer=True,
        gt_f0=None,
        return_hz_f0=False,
        cdecoder="local_argmax",
    ):
        """Predict per-frame cent-bin activations and return a loss or F0.

        Args:
            mel: Input features. With the conv stem enabled, dims 1 and 2 are
                swapped before the ``Conv1d`` stem, so the last dimension must
                hold ``input_channel`` channels.
            infer: ``True`` decodes F0; ``False`` returns the training loss.
            gt_f0: Ground-truth F0, required when ``infer`` is ``False``.
            return_hz_f0: If true, return F0 from ``cent_to_f0`` directly;
                otherwise return ``log(1 + f0 / 700)``.
            cdecoder: ``"argmax"`` or ``"local_argmax"``. Any other value
                leaves ``self.cdecoder`` as set by an earlier call.
        """
        if cdecoder == "argmax":
            self.cdecoder = self.cents_decoder
        elif cdecoder == "local_argmax":
            self.cdecoder = self.cents_local_decoder

        x = (
            self.stack(mel.transpose(1, 2)).transpose(1, 2)
            if self.use_input_conv
            else mel
        )
        x = self.decoder(x)
        x = self.norm(x)
        x = self.dense_out(x)
        x = torch.sigmoid(x)

        if not infer:
            # Training: cross-entropy against a Gaussian-blurred cent target.
            gt_cent_f0 = self.f0_to_cent(gt_f0)
            gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0)
            loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, gt_cent_f0)
            if self.loss_l2_regularization:
                loss_all = loss_all + l2_regularization(
                    model=self,
                    l2_alpha=self.loss_l2_regularization_scale,
                )
            x = loss_all
        else:
            x = self.cdecoder(x)
            x = self.cent_to_f0(x)
            x = (1 + x / 700).log() if not return_hz_f0 else x

        return x

    def cents_decoder(self, y, mask=True):
        """Decode cents as the activation-weighted mean over all bins.

        With ``mask`` true, frames whose peak activation is at or below
        ``self.threshold`` are multiplied by ``-inf``. Returns
        ``(cents, confidence)`` when ``self.confidence`` is set.
        """
        B, N, _ = y.size()
        ci = self.cent_table[None, None, :].expand(B, N, -1)
        rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(
            y,
            dim=-1,
            keepdim=True,
        )
        # Computed unconditionally: the return below needs it when
        # self.confidence is set even if masking is disabled.
        confident = torch.max(y, dim=-1, keepdim=True)[0]
        if mask:
            confident_mask = torch.ones_like(confident)
            confident_mask[confident <= self.threshold] = float("-INF")
            rtn = rtn * confident_mask
        return (rtn, confident) if self.confidence else rtn

    def cents_local_decoder(self, y, mask=True):
        """Decode cents as a weighted mean over the 9 bins centred on the peak.

        Window indices are clamped to ``[0, n_out - 1]``. Masking and the
        return value follow ``cents_decoder``.
        """
        B, N, _ = y.size()
        ci = self.cent_table[None, None, :].expand(B, N, -1)
        confident, max_index = torch.max(y, dim=-1, keepdim=True)
        local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4)
        local_argmax_index = torch.clamp(local_argmax_index, 0, self.n_out - 1)
        ci_l = torch.gather(ci, -1, local_argmax_index)
        y_l = torch.gather(y, -1, local_argmax_index)
        rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum(
            y_l,
            dim=-1,
            keepdim=True,
        )
        if mask:
            confident_mask = torch.ones_like(confident)
            confident_mask[confident <= self.threshold] = float("-INF")
            rtn = rtn * confident_mask
        return (rtn, confident) if self.confidence else rtn

    def cent_to_f0(self, cent):
        """Convert cents (relative to 10 Hz) to frequency."""
        return 10.0 * 2 ** (cent / 1200.0)

    def f0_to_cent(self, f0):
        """Convert frequency to cents relative to 10 Hz."""
        return 1200.0 * torch.log2(f0 / 10.0)

    def gaussian_blurred_cent(self, cents):
        """Turn target cents into a Gaussian bump over the cent table.

        Targets outside ``(0.1, f0_to_cent(f0_max))`` yield an all-zero row.
        """
        mask = (cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0)))
        B, N, _ = cents.size()
        ci = self.cent_table[None, None, :].expand(B, N, -1)
        return torch.exp(-torch.square(ci - cents) / 1250) * mask.float()
763
+
764
+
765
class FCPEInfer:
    """Load an ``FCPE`` checkpoint and expose it as an audio-to-F0 callable.

    Args:
        model_path: Path of the checkpoint; it must hold ``"config"`` and
            ``"model"`` entries.
        device: Target device; picks CUDA when available if ``None``.
        dtype: Dtype the model and mel features are cast to.
    """

    def __init__(self, model_path, device=None, dtype=torch.float32):
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        # SECURITY NOTE: weights_only=False allows arbitrary pickled objects
        # to be deserialised; only load checkpoints from a trusted source.
        ckpt = torch.load(
            model_path,
            map_location=torch.device(self.device),
            weights_only=False,
        )
        self.args = DotDict(ckpt["config"])
        self.dtype = dtype

        model_cfg = self.args.model
        loss_cfg = self.args.loss
        network = FCPE(
            input_channel=model_cfg.input_channel,
            out_dims=model_cfg.out_dims,
            n_layers=model_cfg.n_layers,
            n_chans=model_cfg.n_chans,
            use_siren=model_cfg.use_siren,
            use_full=model_cfg.use_full,
            loss_mse_scale=loss_cfg.loss_mse_scale,
            loss_l2_regularization=loss_cfg.loss_l2_regularization,
            loss_l2_regularization_scale=loss_cfg.loss_l2_regularization_scale,
            loss_grad1_mse=loss_cfg.loss_grad1_mse,
            loss_grad1_mse_scale=loss_cfg.loss_grad1_mse_scale,
            f0_max=model_cfg.f0_max,
            f0_min=model_cfg.f0_min,
            confidence=model_cfg.confidence,
        )
        network.to(self.device).to(self.dtype)
        network.load_state_dict(ckpt["model"])
        network.eval()
        self.model = network
        self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device)

    @torch.no_grad()
    def __call__(self, audio, sr, threshold=0.05):
        """Return the model's F0 output (Hz) for ``audio`` sampled at ``sr``.

        ``threshold`` overwrites the model's confidence threshold before
        inference. A leading batch axis is added to ``audio``.
        """
        self.model.threshold = threshold
        batched_audio = audio[None, :]
        mel = self.wav2mel(audio=batched_audio, sample_rate=sr).to(self.dtype)
        return self.model(mel=mel, infer=True, return_hz_f0=True)
806
+
807
+
808
class Wav2Mel:
    """Convert waveforms to mel features, resampling to the configured rate first.

    Args:
        args: Config object; the ``mel`` section supplies ``sampling_rate``,
            ``num_mels``, ``n_fft``, ``win_size``, ``hop_size``, ``fmin``
            and ``fmax``.
        device: Target device; picks CUDA when available if ``None``.
        dtype: Dtype audio and resamplers are cast to.
    """

    def __init__(self, args, device=None, dtype=torch.float32):
        mel_cfg = args.mel
        self.sample_rate = mel_cfg.sampling_rate
        self.hop_size = mel_cfg.hop_size
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.dtype = dtype
        self.stft = STFT(
            mel_cfg.sampling_rate,
            mel_cfg.num_mels,
            mel_cfg.n_fft,
            mel_cfg.win_size,
            mel_cfg.hop_size,
            mel_cfg.fmin,
            mel_cfg.fmax,
        )
        # Resamplers cached per source sample rate (keyed by its string form).
        self.resample_kernel = {}

    def extract_nvstft(self, audio, keyshift=0, train=False):
        """Return ``self.stft.get_mel(...)`` with dims 1 and 2 swapped."""
        return self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2)

    def extract_mel(self, audio, sample_rate, keyshift=0, train=False):
        """Resample if needed, extract mel features, and fix the frame count.

        The target frame count is ``audio.shape[1] // hop_size + 1``, computed
        from the audio as passed in (before resampling). The result gets one
        repeated last frame if it is too short and is cropped if too long.
        """
        audio = audio.to(self.dtype).to(self.device)
        if sample_rate == self.sample_rate:
            audio_res = audio
        else:
            cache_key = str(sample_rate)
            resampler = self.resample_kernel.get(cache_key)
            if cache_key not in self.resample_kernel:
                resampler = Resample(
                    sample_rate,
                    self.sample_rate,
                    lowpass_filter_width=128,
                )
            # Re-cast on every call so the cached kernel follows dtype/device.
            resampler = resampler.to(self.dtype).to(self.device)
            self.resample_kernel[cache_key] = resampler
            audio_res = resampler(audio)

        mel = self.extract_nvstft(
            audio_res,
            keyshift=keyshift,
            train=train,
        )  # B, n_frames, bins
        n_frames = int(audio.shape[1] // self.hop_size) + 1
        if n_frames > int(mel.shape[1]):
            mel = torch.cat((mel, mel[:, -1:, :]), 1)
        if n_frames < int(mel.shape[1]):
            mel = mel[:, :n_frames, :]
        return mel

    def __call__(self, audio, sample_rate, keyshift=0, train=False):
        """Shorthand for ``extract_mel``."""
        return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train)
862
+
863
+
864
class DotDict(dict):
    """``dict`` whose keys can be read, set and deleted as attributes.

    Reading a missing key yields ``None`` instead of raising. A value that is
    a plain ``dict`` is returned wrapped in a new ``DotDict`` on each read.
    """

    def __getattr__(self, name):
        value = dict.get(self, name)
        if type(value) is dict:
            return DotDict(value)
        return value

    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__
871
+
872
+
873
class F0Predictor:
    """Base class naming the two F0 predictor entry points; both do nothing here."""

    def compute_f0(self, wav, p_len):
        """Placeholder for subclasses; returns ``None``."""
        return None

    def compute_f0_uv(self, wav, p_len):
        """Placeholder for subclasses; returns ``None``."""
        return None
879
+
880
+
881
class FCPEF0Predictor(F0Predictor):
    """``F0Predictor`` backed by an ``FCPEInfer`` model.

    Args:
        model_path: Checkpoint path passed to ``FCPEInfer``.
        hop_length: Hop size in samples, used for frame counts and timing.
        f0_min: Stored on the instance.
        f0_max: Stored on the instance.
        dtype: Dtype for the model and the input waveform.
        device: Target device; picks CUDA when available if ``None``.
        sample_rate: Sample rate of the waveforms passed to ``compute_f0*``.
        threshold: Confidence threshold passed to the model.
    """

    def __init__(
        self,
        model_path,
        hop_length=512,
        f0_min=50,
        f0_max=1100,
        dtype=torch.float32,
        device=None,
        sample_rate=44100,
        threshold=0.05,
    ):
        self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype)
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.threshold = threshold
        self.sample_rate = sample_rate
        self.dtype = dtype
        self.name = "fcpe"

    def repeat_expand(
        self,
        content: torch.Tensor | np.ndarray,
        target_len: int,
        mode: str = "nearest",
    ):
        """Resize the last axis of ``content`` to ``target_len`` by interpolation.

        Accepts 1-D, 2-D or 3-D tensors or NumPy arrays and returns the same
        container type and rank as the input.
        """
        ndim = content.ndim
        # interpolate() needs a 3-D input, so add leading axes as required.
        if ndim == 1:
            content = content[None, None]
        elif ndim == 2:
            content = content[None]
        assert content.ndim == 3

        is_np = isinstance(content, np.ndarray)
        if is_np:
            content = torch.from_numpy(content)
        results = torch.nn.functional.interpolate(content, size=target_len, mode=mode)
        if is_np:
            results = results.numpy()

        if ndim == 1:
            return results[0, 0]
        if ndim == 2:
            return results[0]
        return results

    def post_process(self, x, sample_rate, f0, pad_to):
        """Fill unvoiced gaps in ``f0`` and return ``(f0, voiced_flags)`` as arrays.

        ``f0`` is resized to ``pad_to`` frames; its non-zero frames are then
        linearly interpolated across the frame timeline, holding the first and
        last non-zero value at the edges. The voiced flags mark frames whose
        resized F0 was above zero.
        """
        if isinstance(f0, np.ndarray):
            f0 = torch.from_numpy(f0).float().to(x.device)
        if pad_to is not None:
            f0 = self.repeat_expand(f0, pad_to)

        # Starts all-zero, so only the voiced frames need to be set.
        vuv_vector = torch.zeros_like(f0)
        vuv_vector[f0 > 0.0] = 1.0

        nzindex = torch.nonzero(f0).squeeze()
        f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy()
        frame_period = self.hop_length / sample_rate
        time_org = frame_period * nzindex.cpu().numpy()
        time_frame = np.arange(pad_to) * self.hop_length / sample_rate

        vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0]
        voiced_flags = vuv_vector.cpu().numpy()

        voiced_count = f0.shape[0]
        if voiced_count <= 0:
            return np.zeros(pad_to), voiced_flags
        if voiced_count == 1:
            return np.ones(pad_to) * f0[0], voiced_flags

        f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
        return f0, voiced_flags

    def _predict_raw(self, wav, p_len):
        """Return ``(waveform tensor, resolved frame count, raw model F0)``."""
        x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
        if p_len is None:
            p_len = x.shape[0] // self.hop_length
        f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0]
        return x, p_len, f0

    def compute_f0(self, wav, p_len=None):
        """Return an F0 array of ``p_len`` frames (``len(wav) // hop_length`` if ``None``).

        An all-zero model output gives an all-zero array.
        """
        x, p_len, f0 = self._predict_raw(wav, p_len)
        if torch.all(f0 == 0):
            return np.zeros(p_len)
        return self.post_process(x, self.sample_rate, f0, p_len)[0]

    def compute_f0_uv(self, wav, p_len=None):
        """Like ``compute_f0`` but return ``(f0, voiced_flags)``.

        An all-zero model output gives two all-zero arrays.
        """
        x, p_len, f0 = self._predict_raw(wav, p_len)
        if torch.all(f0 == 0):
            return np.zeros(p_len), np.zeros(p_len)
        return self.post_process(x, self.sample_rate, f0, p_len)
rvc_logic/rvc/lib/predictors/RMVPE.py ADDED
@@ -0,0 +1,604 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from torch import nn
6
+
7
+ from librosa.filters import mel
8
+
9
+ N_MELS = 128
10
+ N_CLASS = 360
11
+
12
+
13
class ConvBlockRes(nn.Module):
    """
    Two 3x3 conv / batch-norm / ReLU stages with a residual connection.

    When the channel counts differ, the skip path goes through a 1x1
    convolution; otherwise the input is added unchanged.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        momentum (float): Momentum for batch normalization.

    """

    def __init__(self, in_channels, out_channels, momentum=0.01):
        super().__init__()

        def conv_bn_relu(channels_in):
            # One bias-free 3x3 convolution followed by BN and ReLU.
            return [
                nn.Conv2d(
                    in_channels=channels_in,
                    out_channels=out_channels,
                    kernel_size=(3, 3),
                    stride=(1, 1),
                    padding=(1, 1),
                    bias=False,
                ),
                nn.BatchNorm2d(out_channels, momentum=momentum),
                nn.ReLU(),
            ]

        self.conv = nn.Sequential(
            *conv_bn_relu(in_channels),
            *conv_bn_relu(out_channels),
        )
        self.is_shortcut = in_channels != out_channels
        if self.is_shortcut:
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))

    def forward(self, x):
        transformed = self.conv(x)
        skip = self.shortcut(x) if self.is_shortcut else x
        return transformed + skip
58
+
59
+
60
class ResEncoderBlock(nn.Module):
    """
    A chain of ``ConvBlockRes`` modules with optional average pooling.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (tuple): Average pooling kernel, or None to skip pooling.
        n_blocks (int): Number of convolutional blocks applied in forward.
        momentum (float): Momentum for batch normalization.

    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        n_blocks=1,
        momentum=0.01,
    ):
        super().__init__()
        self.n_blocks = n_blocks
        # The first block changes the channel count; the rest keep it.
        block_inputs = [in_channels] + [out_channels] * (n_blocks - 1)
        self.conv = nn.ModuleList(
            ConvBlockRes(channels, out_channels, momentum) for channels in block_inputs
        )
        self.kernel_size = kernel_size
        if self.kernel_size is not None:
            self.pool = nn.AvgPool2d(kernel_size=kernel_size)

    def forward(self, x):
        """Return ``(features, pooled)`` when pooling is set, else ``features``."""
        for block_index in range(self.n_blocks):
            x = self.conv[block_index](x)
        if self.kernel_size is None:
            return x
        return x, self.pool(x)
97
+
98
+
99
class Encoder(nn.Module):
    """
    Downsampling path of the DeepUnet.

    Channels double and the tracked size halves after every encoder block.
    ``latent_channels`` records ``[out_channels, in_size]`` per block.

    Args:
        in_channels (int): Number of input channels.
        in_size (int): Size of the input tensor.
        n_encoders (int): Number of encoder blocks.
        kernel_size (tuple): Size of the average pooling kernel.
        n_blocks (int): Number of convolutional blocks in each encoder block.
        out_channels (int): Number of output channels for the first encoder block.
        momentum (float): Momentum for batch normalization.

    """

    def __init__(
        self,
        in_channels,
        in_size,
        n_encoders,
        kernel_size,
        n_blocks,
        out_channels=16,
        momentum=0.01,
    ):
        super().__init__()
        self.n_encoders = n_encoders
        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
        self.layers = nn.ModuleList()
        self.latent_channels = []

        channels_in, channels_out, size = in_channels, out_channels, in_size
        for _ in range(self.n_encoders):
            stage = ResEncoderBlock(
                channels_in,
                channels_out,
                kernel_size,
                n_blocks,
                momentum=momentum,
            )
            self.layers.append(stage)
            self.latent_channels.append([channels_out, size])
            channels_in = channels_out
            channels_out *= 2
            size //= 2

        # Both values describe what a further block would receive/produce.
        self.out_size = size
        self.out_channel = channels_out

    def forward(self, x: torch.Tensor):
        """Return the pooled output and the list of pre-pooling skip tensors."""
        skips: list[torch.Tensor] = []
        x = self.bn(x)
        for stage in self.layers:
            skip, x = stage(x)
            skips.append(skip)
        return x, skips
153
+
154
+
155
class Intermediate(nn.Module):
    """
    Bottleneck of the DeepUnet: pooling-free ``ResEncoderBlock`` layers in series.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        n_inters (int): Number of layers applied in forward.
        n_blocks (int): Number of convolutional blocks in each layer.
        momentum (float): Momentum for batch normalization.

    """

    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
        super().__init__()
        self.n_inters = n_inters
        # Only the first layer changes the channel count.
        layer_inputs = [in_channels] + [out_channels] * (self.n_inters - 1)
        self.layers = nn.ModuleList(
            ResEncoderBlock(channels, out_channels, None, n_blocks, momentum)
            for channels in layer_inputs
        )

    def forward(self, x):
        for layer_index in range(self.n_inters):
            x = self.layers[layer_index](x)
        return x
184
+
185
+
186
class ResDecoderBlock(nn.Module):
    """
    Upsample with a transposed convolution, concatenate a skip tensor, then refine.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        stride (tuple): Stride for transposed convolution.
        n_blocks (int): Number of convolutional blocks applied after the concat.
        momentum (float): Momentum for batch normalization.

    """

    def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
        super().__init__()
        if stride == (1, 2):
            out_padding = (0, 1)
        else:
            out_padding = (1, 1)
        self.n_blocks = n_blocks

        upsample = nn.ConvTranspose2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(3, 3),
            stride=stride,
            padding=(1, 1),
            output_padding=out_padding,
            bias=False,
        )
        self.conv1 = nn.Sequential(
            upsample,
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        )

        # The first refinement block sees upsampled + skip channels.
        block_inputs = [out_channels * 2] + [out_channels] * (n_blocks - 1)
        self.conv2 = nn.ModuleList(
            ConvBlockRes(channels, out_channels, momentum) for channels in block_inputs
        )

    def forward(self, x, concat_tensor):
        upsampled = self.conv1(x)
        x = torch.cat((upsampled, concat_tensor), dim=1)
        for block_index in range(self.n_blocks):
            x = self.conv2[block_index](x)
        return x
227
+
228
+
229
class Decoder(nn.Module):
    """
    Upsampling path of the DeepUnet; channels halve at every block.

    Args:
        in_channels (int): Number of input channels.
        n_decoders (int): Number of decoder blocks.
        stride (tuple): Stride for transposed convolution.
        n_blocks (int): Number of convolutional blocks in each decoder block.
        momentum (float): Momentum for batch normalization.

    """

    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
        super().__init__()
        self.layers = nn.ModuleList()
        self.n_decoders = n_decoders
        channels = in_channels
        for _ in range(self.n_decoders):
            halved = channels // 2
            self.layers.append(
                ResDecoderBlock(channels, halved, stride, n_blocks, momentum),
            )
            channels = halved

    def forward(self, x, concat_tensors):
        """Apply the blocks in order, taking skip tensors from last to first."""
        for depth, layer in enumerate(self.layers):
            x = layer(x, concat_tensors[-1 - depth])
        return x
257
+
258
+
259
class DeepUnet(nn.Module):
    """
    Encoder, intermediate and decoder stages wired into a U-Net.

    The encoder is always built with an input size of 128.

    Args:
        kernel_size (tuple): Average pooling kernel; also the decoder stride.
        n_blocks (int): Number of convolutional blocks in each encoder/decoder block.
        en_de_layers (int): Number of encoder/decoder layers.
        inter_layers (int): Number of layers in the intermediate stage.
        in_channels (int): Number of input channels.
        en_out_channels (int): Number of output channels for the first encoder block.

    """

    def __init__(
        self,
        kernel_size,
        n_blocks,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super().__init__()
        self.encoder = Encoder(
            in_channels,
            128,
            en_de_layers,
            kernel_size,
            n_blocks,
            en_out_channels,
        )
        # `out_channel` is already doubled past the last encoder block, so the
        # tensor the encoder actually emits has half that many channels.
        widest = self.encoder.out_channel
        self.intermediate = Intermediate(
            widest // 2,
            widest,
            inter_layers,
            n_blocks,
        )
        self.decoder = Decoder(
            widest,
            en_de_layers,
            kernel_size,
            n_blocks,
        )

    def forward(self, x):
        encoded, concat_tensors = self.encoder(x)
        bottleneck = self.intermediate(encoded)
        return self.decoder(bottleneck, concat_tensors)
309
+
310
+
311
class E2E(nn.Module):
    """
    End-to-end model: DeepUnet, a 3-channel 3x3 conv, then a per-frame classifier.

    The classifier ends in ``N_CLASS`` sigmoid outputs. It is preceded by a
    BiGRU when ``n_gru`` is truthy and is a single linear layer otherwise.

    Args:
        n_blocks (int): Number of convolutional blocks in each encoder/decoder block.
        n_gru (int): Number of GRU layers.
        kernel_size (tuple): Size of the average pooling kernel.
        en_de_layers (int): Number of encoder/decoder layers.
        inter_layers (int): Number of layers in the intermediate stage.
        in_channels (int): Number of input channels.
        en_out_channels (int): Number of output channels for the first encoder block.

    """

    def __init__(
        self,
        n_blocks,
        n_gru,
        kernel_size,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super().__init__()
        self.unet = DeepUnet(
            kernel_size,
            n_blocks,
            en_de_layers,
            inter_layers,
            in_channels,
            en_out_channels,
        )
        self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))

        if n_gru:
            head = [
                BiGRU(3 * 128, 256, n_gru),
                nn.Linear(512, N_CLASS),
            ]
        else:
            head = [nn.Linear(3 * N_MELS, N_CLASS)]
        self.fc = nn.Sequential(*head, nn.Dropout(0.25), nn.Sigmoid())

    def forward(self, mel):
        """Return the classifier output for ``mel``.

        The last two axes of ``mel`` are swapped and a channel axis is
        inserted at position 1 before the U-Net.
        """
        unet_input = mel.transpose(-1, -2).unsqueeze(1)
        features = self.cnn(self.unet(unet_input))
        # Swap dims 1 and 2, then merge the last two dims for the head.
        features = features.transpose(1, 2).flatten(-2)
        return self.fc(features)
365
+
366
+
367
class MelSpectrogram(torch.nn.Module):
    """
    Compute log Mel-spectrogram features from audio.

    Args:
        n_mel_channels (int): Number of Mel-frequency bands.
        sample_rate (int): Sampling rate of the audio.
        win_length (int): Length of the window function in samples.
        hop_length (int): Hop size between frames in samples.
        n_fft (int, optional): Length of the FFT window. Defaults to None, which uses win_length.
        mel_fmin (int, optional): Minimum frequency for the Mel filter bank. Defaults to 0.
        mel_fmax (int, optional): Maximum frequency for the Mel filter bank. Defaults to None.
        clamp (float, optional): Minimum value for clamping the Mel-spectrogram. Defaults to 1e-5.

    """

    def __init__(
        self,
        n_mel_channels,
        sample_rate,
        win_length,
        hop_length,
        n_fft=None,
        mel_fmin=0,
        mel_fmax=None,
        clamp=1e-5,
    ):
        super().__init__()
        if n_fft is None:
            n_fft = win_length
        # Hann windows cached per "<keyshift>_<device>" key, filled lazily in forward().
        self.hann_window = {}
        filter_bank = mel(
            sr=sample_rate,
            n_fft=n_fft,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
            htk=True,
        )
        self.register_buffer("mel_basis", torch.from_numpy(filter_bank).float())
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sample_rate = sample_rate
        self.n_mel_channels = n_mel_channels
        self.clamp = clamp

    def forward(self, audio, keyshift=0, speed=1, center=True):
        """
        Return the log Mel-spectrogram of ``audio``.

        Args:
            audio (torch.Tensor): Input passed to ``torch.stft``.
            keyshift (float, optional): Shift in semitones; scales the FFT and
                window lengths by ``2 ** (keyshift / 12)``. Defaults to 0.
            speed (float, optional): Multiplier applied to the hop length. Defaults to 1.
            center (bool, optional): Forwarded to ``torch.stft``. Defaults to True.

        """
        ratio = 2 ** (keyshift / 12)
        scaled_n_fft = int(np.round(self.n_fft * ratio))
        scaled_win_length = int(np.round(self.win_length * ratio))
        scaled_hop_length = int(np.round(self.hop_length * speed))

        cache_key = f"{keyshift}_{audio.device}"
        window = self.hann_window.get(cache_key)
        if window is None:
            window = torch.hann_window(scaled_win_length).to(audio.device)
            self.hann_window[cache_key] = window

        spectrum = torch.stft(
            audio,
            n_fft=scaled_n_fft,
            hop_length=scaled_hop_length,
            win_length=scaled_win_length,
            window=window,
            center=center,
            return_complex=True,
        )
        magnitude = torch.sqrt(spectrum.real.pow(2) + spectrum.imag.pow(2))

        if keyshift != 0:
            # Bring the frequency axis back to the bin count the filter bank
            # expects, then rescale for the changed window length.
            expected_bins = self.n_fft // 2 + 1
            shortfall = expected_bins - magnitude.size(1)
            if shortfall > 0:
                magnitude = F.pad(magnitude, (0, 0, 0, shortfall))
            magnitude = (
                magnitude[:, :expected_bins, :] * self.win_length / scaled_win_length
            )

        mel_energy = torch.matmul(self.mel_basis, magnitude)
        return torch.log(torch.clamp(mel_energy, min=self.clamp))
444
+
445
+
446
class RMVPE0Predictor:
    """
    A predictor for fundamental frequency (F0) based on the RMVPE0 model.

    Args:
        model_path (str): Path to the RMVPE0 model file.
        device (str, optional): Device to use for computation. Defaults to None,
            which uses CUDA if available and the CPU otherwise.

    """

    def __init__(self, model_path, device=None):
        if device is None:
            # Implement the documented default instead of silently staying on
            # whatever device the tensors were created on.
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.resample_kernel = {}
        self.device = device

        model = E2E(4, 1, (2, 2))
        # NOTE(security): weights_only=False lets torch.load unpickle arbitrary
        # objects. Only load checkpoints that come from a trusted source.
        ckpt = torch.load(model_path, map_location="cpu", weights_only=False)
        model.load_state_dict(ckpt)
        model.eval()
        self.model = model.to(device)

        self.mel_extractor = MelSpectrogram(
            N_MELS,
            16000,
            1024,
            160,
            None,
            30,
            8000,
        ).to(device)

        # One cents value per output class, zero-padded by 4 on each side so a
        # 9-wide window around any class index stays in bounds.
        cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191
        self.cents_mapping = np.pad(cents_mapping, (4, 4))

    def mel2hidden(self, mel, chunk_size=32000):
        """
        Converts Mel-spectrogram features to hidden representation.

        Args:
            mel (torch.Tensor): Mel-spectrogram features; the last axis is frames.
            chunk_size (int, optional): Maximum number of frames sent through the
                model at once. Must be divisible by 32. Defaults to 32000.

        Returns:
            torch.Tensor: Model output cropped to the original number of frames.

        """
        with torch.no_grad():
            n_frames = mel.shape[-1]
            # Pad the frame axis up to the next multiple of 32
            # (presumably required by the U-Net's downsampling; confirm).
            mel = F.pad(
                mel,
                (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames),
                mode="reflect",
            )

            output_chunks = []
            pad_frames = mel.shape[-1]
            for start in range(0, pad_frames, chunk_size):
                end = min(start + chunk_size, pad_frames)
                mel_chunk = mel[..., start:end]
                assert (
                    mel_chunk.shape[-1] % 32 == 0
                ), "chunk_size must be divisible by 32"
                output_chunks.append(self.model(mel_chunk))

            hidden = torch.cat(output_chunks, dim=1)
            return hidden[:, :n_frames]

    def decode(self, hidden, thred=0.03):
        """
        Decodes hidden representation to F0.

        Args:
            hidden (np.ndarray): Hidden representation.
            thred (float, optional): Threshold for salience. Defaults to 0.03.

        Returns:
            np.ndarray: F0 per frame, with 0 for frames at or below the threshold.

        """
        cents_pred = self.to_local_average_cents(hidden, thred=thred)
        f0 = 10 * (2 ** (cents_pred / 1200))
        # 0 cents maps to exactly 10, which marks a frame rejected by the threshold.
        f0[f0 == 10] = 0
        return f0

    def infer_from_audio(self, audio, thred=0.03):
        """
        Infers F0 from audio.

        Args:
            audio (np.ndarray): Audio signal.
            thred (float, optional): Threshold for salience. Defaults to 0.03.

        Returns:
            np.ndarray: F0 per frame.

        """
        audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
        mel = self.mel_extractor(audio, center=True)
        del audio
        with torch.no_grad():
            torch.cuda.empty_cache()
            hidden = self.mel2hidden(mel)
        hidden = hidden.squeeze(0).cpu().numpy()
        return self.decode(hidden, thred=thred)

    def to_local_average_cents(self, salience, thred=0.05):
        """
        Converts salience to local average cents.

        For every frame, takes the 9 classes centred on the most salient class
        and returns their salience-weighted mean in cents.

        Args:
            salience (np.ndarray): Salience values, one row per frame.
            thred (float, optional): Threshold for salience. Defaults to 0.05.

        Returns:
            np.ndarray: Cents per frame; 0 where the peak salience is <= ``thred``.
            An input with zero frames yields an empty array.

        """
        peak = np.argmax(salience, axis=1)
        padded = np.pad(salience, ((0, 0), (4, 4)))
        # In padded coordinates the window [peak - 4, peak + 5) around the peak
        # becomes [peak, peak + 9).
        window = peak[:, None] + np.arange(9)
        local_salience = np.take_along_axis(padded, window, axis=1)
        local_cents = self.cents_mapping[window]

        # Frames whose window sums to zero produce NaN here and are zeroed
        # below; silence the warning only, the values are unchanged.
        with np.errstate(divide="ignore", invalid="ignore"):
            averaged = np.sum(local_salience * local_cents, 1) / np.sum(
                local_salience, 1
            )
        averaged[np.max(padded, axis=1) <= thred] = 0
        return averaged
580
+
581
+
582
class BiGRU(nn.Module):
    """
    Bidirectional, batch-first GRU that returns only the output sequence.

    Args:
        input_features (int): Number of input features.
        hidden_features (int): Number of hidden features.
        num_layers (int): Number of GRU layers.

    """

    def __init__(self, input_features, hidden_features, num_layers):
        super().__init__()
        gru_options = {
            "num_layers": num_layers,
            "batch_first": True,
            "bidirectional": True,
        }
        self.gru = nn.GRU(input_features, hidden_features, **gru_options)

    def forward(self, x):
        """Return the GRU output sequence and discard the final hidden state."""
        sequence, _ = self.gru(x)
        return sequence
rvc_logic/rvc/lib/predictors/f0.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from torchfcpe import spawn_infer_model_from_pt
4
+
5
+ import torch
6
+ import torchcrepe
7
+
8
+ from rvc_logic.common import RVC_MODELS_DIR
9
+ from rvc_logic.rvc.lib.predictors.RMVPE import RMVPE0Predictor
10
+
11
+
12
class RMVPE:
    """
    F0 extractor backed by ``RMVPE0Predictor``.

    Args:
        device: Device handed to the predictor.
        model_name (str): File name of the checkpoint inside the ``predictors``
            folder of ``RVC_MODELS_DIR``.
        sample_rate (int): Stored on the instance; not used by this class itself.
        hop_size (int): Stored on the instance; not used by this class itself.

    """

    def __init__(self, device, model_name="rmvpe.pt", sample_rate=16000, hop_size=160):
        self.device = device
        self.sample_rate = sample_rate
        self.hop_size = hop_size
        checkpoint_path = os.path.join(RVC_MODELS_DIR, "predictors", model_name)
        self.model = RMVPE0Predictor(checkpoint_path, device=self.device)

    def get_f0(self, x, filter_radius=0.03):
        """Return the F0 of ``x``; ``filter_radius`` is the predictor's salience threshold."""
        return self.model.infer_from_audio(x, thred=filter_radius)
25
+
26
+
27
class CREPE:
    """
    F0 extractor backed by ``torchcrepe``.

    Args:
        device: Device the audio is moved to and inference runs on.
        sample_rate (int): Sample rate passed to ``torchcrepe.predict``.
        hop_size (int): Hop length passed to ``torchcrepe.predict``.

    """

    def __init__(self, device, sample_rate=16000, hop_size=160):
        self.device = device
        self.sample_rate = sample_rate
        self.hop_size = hop_size

    def get_f0(self, x, f0_min=50, f0_max=1100, p_len=None, model="full"):
        """
        Estimate F0 for a one-dimensional signal.

        Args:
            x (np.ndarray | torch.Tensor): Audio signal.
            f0_min (int): Lower pitch bound passed to ``torchcrepe.predict``.
            f0_max (int): Upper pitch bound passed to ``torchcrepe.predict``.
            p_len (int, optional): Unused; accepted so existing callers keep working.
            model (str): torchcrepe model name.

        Returns:
            np.ndarray: F0 per frame, 0 where the filtered periodicity is below 0.1.

        """
        # The previous body computed a default for ``p_len`` and never read it;
        # that dead computation has been removed.
        if not torch.is_tensor(x):
            x = torch.from_numpy(x)

        batch_size = 512

        f0, periodicity = torchcrepe.predict(
            x.float().to(self.device).unsqueeze(dim=0),
            self.sample_rate,
            self.hop_size,
            f0_min,
            f0_max,
            model=model,
            batch_size=batch_size,
            device=self.device,
            return_periodicity=True,
        )
        periodicity = torchcrepe.filter.median(periodicity, 3)
        f0 = torchcrepe.filter.mean(f0, 3)
        # Treat frames with low periodicity as having no pitch.
        f0[periodicity < 0.1] = 0
        return f0[0].cpu().numpy()
59
+
60
+
61
class FCPE:
    """
    F0 extractor backed by a ``torchfcpe`` inference model.

    Args:
        device: Device the model is created on and the audio is moved to.
        sample_rate (int): Sample rate passed to the model's ``infer``.
        hop_size (int): Stored on the instance; not used by this class itself.

    """

    def __init__(self, device, sample_rate=16000, hop_size=160):
        self.device = device
        self.sample_rate = sample_rate
        self.hop_size = hop_size
        self.model = spawn_infer_model_from_pt(
            os.path.join(RVC_MODELS_DIR, "predictors", "fcpe.pt"),
            self.device,
            bundled_model=True,
        )

    def get_f0(self, x, p_len=None, filter_radius=0.006):
        """
        Estimate F0 for a one-dimensional signal.

        Args:
            x (np.ndarray | torch.Tensor): Audio signal.
            p_len (int, optional): Unused; accepted so existing callers keep working.
            filter_radius (float): Passed to the model as its ``threshold``.

        Returns:
            np.ndarray: The squeezed model output.

        """
        # The previous body computed a default for ``p_len`` and never read it;
        # that dead computation has been removed, as has the stray ``n f0``
        # fragment that followed the class and broke parsing of the module.
        if not torch.is_tensor(x):
            x = torch.from_numpy(x)

        prediction = self.model.infer(
            x.float().to(self.device).unsqueeze(0),
            sr=self.sample_rate,
            decoder_mode="local_argmax",
            threshold=filter_radius,
        )
        return prediction.squeeze().cpu().numpy()
rvc_logic/rvc/lib/tools/analyzer.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import numpy as np
3
+
4
+ import librosa
5
+ import librosa.display
6
+
7
+
8
def calculate_features(y, sr):
    """
    Compute the magnitude STFT, the duration and three spectral descriptors.

    Returns:
        tuple: ``(stft, duration, centroid, bandwidth, rolloff)``; each
        descriptor is the first row of the corresponding librosa feature.
    """
    magnitude = np.abs(librosa.stft(y))
    length = librosa.get_duration(y=y, sr=sr)
    extractors = (
        librosa.feature.spectral_centroid,
        librosa.feature.spectral_bandwidth,
        librosa.feature.spectral_rolloff,
    )
    centroid, bandwidth, rolloff = (
        extractor(S=magnitude, sr=sr)[0] for extractor in extractors
    )
    return magnitude, length, centroid, bandwidth, rolloff
15
+
16
+
17
def plot_title(title):
    """Set ``title`` as the bold, 16-point super-title of the current figure."""
    title_style = {"fontsize": 16, "fontweight": "bold"}
    plt.suptitle(title, **title_style)
19
+
20
+
21
def plot_spectrogram(y, sr, stft, duration, cmap="inferno"):
    """
    Draw the dB-scaled spectrogram in the first of three stacked subplots.

    Args:
        y: Audio signal (unused here; kept for a uniform plotting signature).
        sr (int): Sample rate of the audio in Hz.
        stft (np.ndarray): Magnitude STFT whose rows are frequency bins.
        duration (float): Length of the audio in seconds.
        cmap (str): Matplotlib colormap name.
    """
    # The STFT rows span 0 Hz up to the Nyquist frequency (sr / 2), so the top
    # of the kHz axis is sr / 2000. Using sr / 1000 labelled every frequency
    # at twice its real value.
    nyquist_khz = sr / 2000
    plt.subplot(3, 1, 1)
    plt.imshow(
        librosa.amplitude_to_db(stft, ref=np.max),
        origin="lower",
        extent=[0, duration, 0, nyquist_khz],
        aspect="auto",
        cmap=cmap,
    )
    plt.colorbar(format="%+2.0f dB")
    plt.xlabel("Time (s)")
    plt.ylabel("Frequency (kHz)")
    plt.title("Spectrogram")
34
+
35
+
36
def plot_waveform(y, sr, duration):
    """
    Draw the waveform of ``y`` in the second of three stacked subplots.

    ``duration`` is accepted for a uniform plotting signature and is not used.
    """
    plt.subplot(3, 1, 2)
    librosa.display.waveshow(y, sr=sr)
    for apply_text, text in (
        (plt.xlabel, "Time (s)"),
        (plt.ylabel, "Amplitude"),
        (plt.title, "Waveform"),
    ):
        apply_text(text)
42
+
43
+
44
def plot_features(times, cent, bw, rolloff, duration):
    """
    Plot spectral centroid, bandwidth and rolloff in the third stacked subplot.

    Args:
        times: Time stamps (seconds) for the feature frames.
        cent, bw, rolloff: Feature curves in Hz (the unit librosa returns for
            these descriptors); they are converted to kHz for plotting.
        duration: Unused; kept for a uniform plotting signature.
    """
    plt.subplot(3, 1, 3)
    # The legend advertises kHz, so convert from Hz instead of plotting raw
    # values under a label with the wrong unit.
    curves = (
        (cent, "Spectral Centroid (kHz)", "b"),
        (bw, "Spectral Bandwidth (kHz)", "g"),
        (rolloff, "Spectral Rolloff (kHz)", "r"),
    )
    for values_hz, label, color in curves:
        plt.plot(times, np.asarray(values_hz) / 1000, label=label, color=color)
    plt.xlabel("Time (s)")
    plt.ylabel("Frequency (kHz)")
    plt.title("Spectral Features")
    plt.legend()
52
+
53
+
54
def analyze_audio(audio_file, save_plot_path="logs/audio_analysis.png"):
    """
    Load an audio file, plot its spectrogram, waveform and spectral features,
    and build a short textual summary.

    Args:
        audio_file (str): Path of the audio file to analyse.
        save_plot_path (str): Where to save the figure; a falsy value skips saving.

    Returns:
        tuple: ``(audio_info, save_plot_path)``.
    """
    import os  # local import: this module has no file-level ``import os``

    y, sr = librosa.load(audio_file)
    stft, duration, cent, bw, rolloff = calculate_features(y, sr)

    plt.figure(figsize=(12, 10))
    try:
        # basename handles the platform's separators, unlike split("/").
        plot_title("Audio Analysis" + " - " + os.path.basename(audio_file))
        plot_spectrogram(y, sr, stft, duration)
        plot_waveform(y, sr, duration)
        # Pass the real sample rate so the time axis does not rely on
        # librosa's default matching the loaded audio.
        plot_features(librosa.times_like(cent, sr=sr), cent, bw, rolloff, duration)

        plt.tight_layout()

        if save_plot_path:
            plt.savefig(save_plot_path, bbox_inches="tight", dpi=300)
    finally:
        # Close on every path (nothing saved, plotting failed) so repeated
        # calls do not accumulate open figures.
        plt.close()

    if duration < 60:
        duration_text = str(round(duration, 2)) + " seconds"
    else:
        duration_text = str(round(duration / 60, 2)) + " minutes"
    channels_text = "Mono (1)" if y.ndim == 1 else "Stereo (2)"
    # NOTE(review): the value printed as "Bits per Sample" is the file's sample
    # rate (librosa.get_samplerate), not a bit depth. The label is left
    # unchanged because it is user-facing output; confirm and correct it.
    audio_info = (
        f"Sample Rate: {sr}\n"
        f"Duration: {duration_text}\n"
        f"Number of Samples: {len(y)}\n"
        f"Bits per Sample: {librosa.get_samplerate(audio_file)}\n"
        f"Channels: {channels_text}"
    )

    return audio_info, save_plot_path
rvc_logic/rvc/lib/tools/gdown.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import IO
2
+
3
+ import json
4
+ import os
5
+ import pathlib
6
+ import re
7
+ import shutil
8
+ import sys
9
+ import tempfile
10
+ import time
11
+ import warnings
12
+ from urllib.parse import unquote, urlparse
13
+
14
+ import requests
15
+
16
+ from tqdm import tqdm
17
+
18
+ CHUNK_SIZE = 512 * 1024
19
+ HOME = os.path.expanduser("~")
20
+
21
+
22
def indent(text: str, prefix: str):
    """Prefix every line of ``text`` that contains non-whitespace characters."""
    pieces = []
    for line in text.splitlines(True):
        if line.strip():
            pieces.append(prefix + line)
        else:
            # Blank or whitespace-only lines are passed through untouched.
            pieces.append(line)
    return "".join(pieces)
27
+
28
+
29
class FileURLRetrievalError(Exception):
    """Raised when a downloadable file URL cannot be determined."""
31
+
32
+
33
+ def _extract_download_url_from_confirmation(contents: str, url_origin: str):
34
+ """Extract the download URL from a Google Drive confirmation page."""
35
+ patterns = [
36
+ r'href="(\/uc\?export=download[^"]+)',
37
+ r'href="/open\?id=([^"]+)"',
38
+ r'"downloadUrl":"([^"]+)',
39
+ ]
40
+ for pattern in patterns:
41
+ match = re.search(pattern, contents)
42
+ if match:
43
+ url = match.group(1)
44
+ if pattern == r'href="/open\?id=([^"]+)"':
45
+ uuid_match = re.search(
46
+ r'<input\s+type="hidden"\s+name="uuid"\s+value="([^"]+)"',
47
+ contents,
48
+ )
49
+ if uuid_match:
50
+ uuid = uuid_match.group(1)
51
+ return (
52
+ "https://drive.usercontent.google.com/download?id="
53
+ + url
54
+ + "&confirm=t&uuid="
55
+ + uuid
56
+ )
57
+ raise FileURLRetrievalError(
58
+ f"Could not find UUID for download from {url_origin}",
59
+ )
60
+ if pattern == r'"downloadUrl":"([^"]+)':
61
+ return url.replace("\\u003d", "=").replace("\\u0026", "&")
62
+ return "https://docs.google.com" + url.replace("&", "&")
63
+
64
+ error_match = re.search(r'<p class="uc-error-subcaption">(.*)</p>', contents)
65
+ if error_match:
66
+ error = error_match.group(1)
67
+ raise FileURLRetrievalError(error)
68
+
69
+ raise FileURLRetrievalError(
70
+ "Cannot retrieve the public link of the file. "
71
+ "You may need to change the permission to "
72
+ "'Anyone with the link', or have had many accesses.",
73
+ )
74
+
75
+
76
def _create_session(
    proxy: str | None = None,
    use_cookies: bool = True,
    return_cookies_file: bool = False,
):
    """
    Build a ``requests`` session with a browser User-Agent.

    Args:
        proxy: Proxy URL applied to both HTTP and HTTPS when given.
        use_cookies: Load cookies from the cache file if it exists.
        return_cookies_file: Also return the cookie cache path.

    Returns:
        The session, or ``(session, cookies_file)`` when ``return_cookies_file``
        is true.
    """
    session = requests.session()
    session.headers.update(
        {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"},
    )

    if proxy:
        session.proxies = {"http": proxy, "https": proxy}

    cookies_file = os.path.join(HOME, ".cache/gdown/cookies.json")
    cookie_path = pathlib.Path(cookies_file)
    if use_cookies and cookie_path.exists():
        try:
            with cookie_path.open() as handle:
                stored = json.load(handle)
            # The cache holds a list of (name, value) pairs.
            for name, value in stored:
                session.cookies[name] = value
        except json.JSONDecodeError:
            warnings.warn("Corrupted Cookies file")

    if return_cookies_file:
        return session, cookies_file
    return session
101
+
102
+
103
def _guess_gdrive_file_id(url: str):
    """Return the Google Drive file ID embedded in ``url``, or None if absent."""
    found = re.search(r"[?&]id=([\w-]+)", url) or re.search(r"/d/([\w-]+)", url)
    return found.group(1) if found else None


def download(
    output: str | None = None,
    quiet: bool = False,
    proxy: str | None = None,
    speed: float | None = None,
    use_cookies: bool = True,
    verify: bool | str = True,
    id: str | None = None,
    fuzzy: bool = True,
    resume: bool = False,
    format: str | None = None,
    url: str | None = None,
):
    """
    Download a file from a URL, supporting Google Drive links.

    Args:
        output: Output filepath. Default is basename of URL. A value ending in
            the path separator is treated as a directory.
        quiet: Suppress terminal output.
        proxy: HTTP/HTTPS proxy.
        speed: Download speed limit (bytes per second).
        use_cookies: Flag to use cookies.
        verify: Verify TLS certificates.
        id: Google Drive's file ID.
        fuzzy: Fuzzy Google Drive ID extraction.
        resume: Resume download from a tmp file.
        format: Format for Google Docs/Sheets/Slides.
        url: URL to download from.

    Returns:
        Output filename, or None on error.

    Raises:
        ValueError: If neither or both of ``url`` and ``id`` are given.
        FileURLRetrievalError: If the real download URL cannot be determined.
        requests.HTTPError: If the server answers with an error status.

    """
    if not (id is None) ^ (url is None):
        raise ValueError("Either url or id has to be specified")

    if id is not None:
        url = f"https://drive.google.com/uc?id={id}"
    # With only a URL, ``id`` is None. Recover the ID from the URL so the
    # fallback and export URLs below do not contain the literal text "None".
    file_id = id if id is not None else _guess_gdrive_file_id(url)

    url_origin = url
    sess, cookies_file = _create_session(
        proxy=proxy,
        use_cookies=use_cookies,
        return_cookies_file=True,
    )

    try:
        while True:
            res = sess.get(url, stream=True, verify=verify)

            # Must run before raise_for_status(), which would otherwise raise
            # on the 500 and make this fallback unreachable.
            if url == url_origin and res.status_code == 500 and file_id is not None:
                url = f"https://drive.google.com/open?id={file_id}"
                continue
            res.raise_for_status()

            # (url path segment, default export format) when the response is a
            # Google Docs/Sheets/Slides page rather than a plain file.
            export = None
            if res.headers.get("Content-Type", "").startswith("text/html"):
                title_match = re.search(r"<title>(.+)</title>", res.text)
                if title_match:
                    title = title_match.group(1)
                    for suffix, kind, default_format in (
                        (" - Google Docs", "document", "docx"),
                        (" - Google Sheets", "spreadsheets", "xlsx"),
                        (" - Google Slides", "presentation", "pptx"),
                    ):
                        if title.endswith(suffix):
                            export = (kind, default_format)
                            break
            if (
                export is None
                and "Content-Disposition" in res.headers
                and res.headers["Content-Disposition"].endswith("pptx")
                and format not in (None, "pptx")
            ):
                export = ("presentation", "pptx")

            if export is not None:
                if file_id is None:
                    raise FileURLRetrievalError(
                        f"Could not determine the file ID for {url_origin}",
                    )
                kind, default_format = export
                chosen_format = default_format if format is None else format
                url = (
                    f"https://docs.google.com/{kind}/d/{file_id}"
                    f"/export?format={chosen_format}"
                )
                continue

            if use_cookies:
                pathlib.Path(os.path.dirname(cookies_file)).mkdir(
                    exist_ok=True, parents=True
                )
                cookies = [
                    (k, v)
                    for k, v in sess.cookies.items()
                    if not k.startswith("download_warning_")
                ]
                with pathlib.Path(cookies_file).open("w") as f:
                    json.dump(cookies, f, indent=2)

            if "Content-Disposition" in res.headers:
                break

            parsed_url = urlparse(url)
            is_gdrive = parsed_url.hostname in ("drive.google.com", "docs.google.com")
            is_download_link = parsed_url.path.endswith("/uc")

            if not (is_gdrive and is_download_link and fuzzy):
                break

            # Propagates FileURLRetrievalError unchanged; re-wrapping it only
            # hid the original traceback.
            url = _extract_download_url_from_confirmation(res.text, url_origin)

        content_disposition = res.headers.get("Content-Disposition", "")
        filename_match = re.search(
            r"filename\*=UTF-8''(.*)",
            content_disposition,
        ) or re.search(r'filename=["\']?(.*?)["\']?$', content_disposition)
        filename_from_url = (
            unquote(filename_match.group(1))
            if filename_match
            else os.path.basename(url)
        )
        download_path = output or filename_from_url

        if download_path.endswith(os.path.sep):
            pathlib.Path(download_path).mkdir(exist_ok=True, parents=True)
            download_path = os.path.join(download_path, filename_from_url)

        temp_dir = os.path.dirname(download_path) or "."
        prefix = os.path.basename(download_path)

        existing_tmp_files = [
            os.path.join(temp_dir, file)
            for file in os.listdir(temp_dir)
            if file.startswith(prefix)
        ]
        if resume and existing_tmp_files:
            if len(existing_tmp_files) > 1:
                print(
                    "There are multiple temporary files to resume:",
                    file=sys.stderr,
                )
                for file in existing_tmp_files:
                    print(f"\t{file}", file=sys.stderr)
                print(
                    "Please remove them except one to resume downloading.",
                    file=sys.stderr,
                )
                return None
            temp_file_path = existing_tmp_files[0]
        else:
            resume = False
            # mkstemp creates the file atomically; mktemp only returns a name
            # that another process could claim first.
            fd, temp_file_path = tempfile.mkstemp(
                suffix=tempfile.template,
                prefix=prefix,
                dir=temp_dir,
            )
            os.close(fd)

        try:
            file_obj: IO = pathlib.Path(temp_file_path).open("ab")
        except Exception as e:
            print(
                f"Could not open the temporary file {temp_file_path}: {e}",
                file=sys.stderr,
            )
            return None

        pbar = None
        try:
            if file_obj.tell() != 0:
                # Resuming: request only the bytes that are still missing.
                headers = {"Range": f"bytes={file_obj.tell()}-"}
                res = sess.get(url, headers=headers, stream=True, verify=verify)
                res.raise_for_status()

            total = int(res.headers.get("Content-Length", 0))
            if not quiet:
                if total > 0:
                    pbar = tqdm(
                        total=total,
                        unit="B",
                        unit_scale=True,
                        desc=filename_from_url,
                    )
                else:
                    pbar = tqdm(unit="B", unit_scale=True, desc=filename_from_url)

            t_start = time.time()
            # Counted separately from the progress bar so the speed limit
            # also works when ``quiet`` disables the bar.
            received = 0
            for chunk in res.iter_content(chunk_size=CHUNK_SIZE):
                file_obj.write(chunk)
                received += len(chunk)
                if pbar is not None:
                    pbar.update(len(chunk))
                if speed is not None:
                    elapsed_time_expected = 1.0 * received / speed
                    elapsed_time = time.time() - t_start
                    if elapsed_time < elapsed_time_expected:
                        time.sleep(elapsed_time_expected - elapsed_time)
        finally:
            if pbar is not None:
                pbar.close()
            file_obj.close()

        # Reached only after a complete download; a failed one leaves the
        # temporary file in place so it can be resumed.
        shutil.move(temp_file_path, download_path)
    finally:
        sess.close()

    return download_path
rvc_logic/rvc/lib/tools/launch_tensorboard.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import time
3
+
4
+ from tensorboard import program
5
+
6
+ log_path = "logs"
7
+
8
+
9
def launch_tensorboard_pipeline():
    """
    Start TensorBoard on ``log_path``, print its URL and block indefinitely.

    The printed link carries a ``pinnedCards`` query that pins four scalar
    cards. The function never returns; it sleeps in an endless loop.
    """
    for logger_name in ("root", "tensorboard"):
        logging.getLogger(logger_name).setLevel(logging.WARNING)

    board = program.TensorBoard()
    board.configure(argv=[None, "--logdir", log_path])
    url = board.launch()

    pinned_cards = "?pinnedCards=%5B%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fd%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fkl%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fmel%22%7D%5D"
    print(f"Access the tensorboard using the following link:\n{url}{pinned_cards}")

    while True:
        time.sleep(600)
rvc_logic/rvc/lib/tools/model_download.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pathlib
3
+ import re
4
+ import shutil
5
+ import sys
6
+ import zipfile
7
+ from urllib.parse import unquote
8
+
9
+ from bs4 import BeautifulSoup
10
+
11
+ import requests
12
+
13
+ from tqdm import tqdm
14
+
15
+ now_dir = pathlib.Path.cwd()
16
+ sys.path.append(str(now_dir))
17
+
18
+ from rvc_logic.rvc.lib.tools import gdown
19
+ from rvc_logic.rvc.lib.utils import format_title
20
+
21
+ file_path = os.path.join(now_dir, "logs")
22
+ zips_path = os.path.join(file_path, "zips")
23
+ pathlib.Path(zips_path).mkdir(exist_ok=True, parents=True)
24
+
25
+
26
def search_pth_index(folder):
    """
    List the ``.pth`` and ``.index`` files that sit directly inside ``folder``.

    Returns:
        tuple: ``(pth_paths, index_paths)``, each a list of paths joined with
        ``folder``. Directories are ignored even if their name matches.
    """

    def files_ending_with(suffix):
        found = []
        for entry in os.listdir(folder):
            candidate = os.path.join(folder, entry)
            if pathlib.Path(candidate).is_file() and entry.endswith(suffix):
                found.append(candidate)
        return found

    return files_ending_with(".pth"), files_ending_with(".index")
39
+
40
+
41
def download_from_url(url):
    """
    Download ``url`` into ``zips_path``, choosing the downloader by URL shape.

    The working directory is switched to ``zips_path`` for the duration of the
    call and restored to ``now_dir`` afterwards.

    Returns:
        ``"downloaded"`` on success, ``None`` if anything failed (the error is
        printed).
    """
    os.chdir(zips_path)

    try:
        if "drive.google.com" in url:
            file_id = extract_google_drive_id(url)
            # Previously a Drive URL without a recognisable ID downloaded
            # nothing and was still reported as "downloaded".
            if not file_id:
                raise ValueError("Could not extract a Google Drive file ID from the URL")
            saved_as = gdown.download(
                url=f"https://drive.google.com/uc?id={file_id}",
                quiet=False,
                fuzzy=True,
            )
            # gdown.download returns None on its own error paths.
            if saved_as is None:
                raise ValueError("Google Drive download did not produce a file")
        elif "/blob/" in url or "/resolve/" in url:
            download_blob_or_resolve(url)
        elif "/tree/main" in url:
            download_from_huggingface(url)
        else:
            download_file(url)

        rename_downloaded_files()
        return "downloaded"
    except Exception as error:
        print(f"An error occurred downloading the file: {error}")
        return None
    finally:
        os.chdir(now_dir)
67
+
68
+
69
def extract_google_drive_id(url):
    """
    Pull the file ID out of a Google Drive URL.

    Handles the ``.../file/d/<id>/...`` form first, then the ``...id=<id>&...``
    form. Returns ``None`` when neither marker is present.
    """
    # (marker that precedes the ID, character that ends it)
    url_forms = (("file/d/", "/"), ("id=", "&"))
    for marker, terminator in url_forms:
        if marker in url:
            return url.split(marker)[1].split(terminator)[0]
    return None
75
+
76
+
77
def download_blob_or_resolve(url):
    """
    Download a file addressed by a ``/blob/`` or ``/resolve/`` URL.

    A ``/blob/`` URL is first rewritten to its ``/resolve/`` form.

    Raises:
        ValueError: If the server does not answer with status 200.
    """
    resolved_url = url.replace("/blob/", "/resolve/") if "/blob/" in url else url
    # The fetch, status check and save are exactly what download_file does.
    download_file(resolved_url)
87
+
88
+
89
def save_response_content(response):
    """
    Stream the body of ``response`` into a file inside ``zips_path``.

    The file name comes from a quoted ``filename="..."`` in the
    Content-Disposition header. Without one, the last path segment of the
    response URL is used, and ``"downloaded_file"`` is the final fallback.
    Path separators in the name are replaced with underscores.
    """
    from urllib.parse import urlparse  # local: the module only imports unquote

    content_disposition = unquote(response.headers.get("Content-Disposition", ""))
    # A header without a quoted filename (e.g. only ``filename*=``) used to
    # raise AttributeError on the failed match.
    name_match = re.search(r'filename="([^"]+)"', content_disposition)
    if name_match:
        file_name = name_match.group(1)
    else:
        # Keep the URL's own file name (and its extension, which the later
        # extraction step looks for) instead of a fixed extension-less name.
        url_path = urlparse(getattr(response, "url", "") or "").path
        file_name = unquote(os.path.basename(url_path)) or "downloaded_file"
    file_name = file_name.replace(os.path.sep, "_").replace("/", "_")

    total_size = int(response.headers.get("Content-Length", 0))
    chunk_size = 1024

    with (
        pathlib.Path(os.path.join(zips_path, file_name)).open("wb") as file,
        tqdm(
            total=total_size,
            unit="B",
            unit_scale=True,
            desc=file_name,
        ) as progress_bar,
    ):
        for data in response.iter_content(chunk_size):
            file.write(data)
            progress_bar.update(len(data))
114
+
115
+
116
def download_from_huggingface(url):
    """
    Download the first ``.zip`` linked from a Hugging Face page.

    Args:
        url: URL of the page to scan for links.

    Raises:
        ValueError: If the page contains no link ending in ``.zip``.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    zip_href = next(
        (
            link["href"]
            for link in soup.find_all("a", href=True)
            if link["href"].endswith(".zip")
        ),
        None,
    )
    if not zip_href:
        raise ValueError("No zip file found in Huggingface URL")

    # Rewrite only the "/blob/" path segment. Replacing every "blob" substring
    # also corrupted user, repository or file names that contain that word.
    url = zip_href.replace("/blob/", "/resolve/")
    if "huggingface.co" not in url:
        url = "https://huggingface.co" + url
    download_file(url)
134
+
135
+
136
def download_file(url):
    """
    Fetch ``url`` as a stream and save the body with ``save_response_content``.

    Raises:
        ValueError: If the server does not answer with status 200.
    """
    response = requests.get(url, stream=True)
    if response.status_code != 200:
        raise ValueError(
            "Download failed with status code: " + str(response.status_code),
        )
    save_response_content(response)
144
+
145
+
146
def rename_downloaded_files():
    """
    Move every file found under ``zips_path`` to the top level of ``zips_path``.

    Path separators in the file stem are replaced with underscores.
    """
    for current_path, _, file_names in os.walk(zips_path):
        for file in file_names:
            stem, extension = os.path.splitext(file)
            source = os.path.join(current_path, file)
            # Anchor the target to zips_path. The bare relative name used
            # before resolved against the current working directory, which is
            # only zips_path while download_from_url is running.
            target = os.path.join(
                zips_path, stem.replace(os.path.sep, "_") + extension
            )
            if os.path.abspath(source) != os.path.abspath(target):
                pathlib.Path(source).rename(target)
154
+
155
+
156
def extract(zipfile_path, unzips_path):
    """
    Unpack ``zipfile_path`` into ``unzips_path`` and delete the archive.

    Returns:
        bool: ``True`` on success; ``False`` if anything failed (the error is
        printed).
    """
    try:
        with zipfile.ZipFile(zipfile_path, "r") as archive:
            archive.extractall(unzips_path)
        pathlib.Path(zipfile_path).unlink()
    except Exception as error:
        print(f"An error occurred extracting the zip file: {error}")
        return False
    else:
        return True
165
+
166
+
167
def unzip_file(zip_path, zip_file_name):
    """
    Unpack ``<zip_path>/<zip_file_name>.zip`` into ``<file_path>/<zip_file_name>``
    and delete the archive afterwards.
    """
    archive_path = os.path.join(zip_path, zip_file_name + ".zip")
    destination = os.path.join(file_path, zip_file_name)
    archive = zipfile.ZipFile(archive_path, "r")
    with archive:
        archive.extractall(destination)
    pathlib.Path(archive_path).unlink()
173
+
174
+
175
def model_download_pipeline(url: str):
    """
    Download a model archive from ``url`` and unpack it.

    Returns:
        The result of ``handle_extraction_process()`` when the download
        succeeded, otherwise the string ``"Error"``.
    """
    try:
        if download_from_url(url) != "downloaded":
            return "Error"
        return handle_extraction_process()
    except Exception as error:
        print(f"An unexpected error occurred: {error}")
        return "Error"
184
+
185
+
186
def handle_extraction_process():
    """
    Extract every ``.zip`` found in ``zips_path`` into ``logs/<model name>``.

    Returns:
        ``(pth_paths, index_paths)`` for the last extracted model, or the
        string ``"Error"`` if an archive could not be extracted or none was found.
    """
    extract_folder_path = ""
    for filename in os.listdir(zips_path):
        if not filename.endswith(".zip"):
            continue
        zipfile_path = os.path.join(zips_path, filename)
        model_name = format_title(os.path.basename(zipfile_path).split(".zip")[0])
        extract_folder_path = os.path.join("logs", os.path.normpath(model_name))

        # Check the result before tidying up. Cleaning a folder whose
        # extraction failed either raises (folder missing) or renames files
        # left over from an earlier run.
        if not extract(zipfile_path, extract_folder_path):
            print(f"Error downloading {model_name}")
            return "Error"

        clean_extracted_files(extract_folder_path, model_name)
        print(f"Model {model_name} downloaded!")

    if not extract_folder_path:
        print("Zip file was not found.")
        return "Error"
    return search_pth_index(extract_folder_path)
205
+
206
+
207
def clean_extracted_files(extract_folder_path, model_name):
    """
    Normalise the contents of a freshly extracted model folder.

    Steps, in order:
      1. Delete a ``__MACOSX`` folder if present.
      2. If exactly one sub-folder remains, move its contents up one level and
         remove it.
      3. Rename the ``.pth`` and ``.index`` entries to ``<model_name>.pth`` and
         ``<model_name>.index`` unless a file with that name already exists.
    """
    # A stray ``ath)`` fragment used to follow this function and made the
    # module unparseable; it has been removed.
    macosx_path = os.path.join(extract_folder_path, "__MACOSX")
    if pathlib.Path(macosx_path).exists():
        shutil.rmtree(macosx_path)

    subfolders = [
        f
        for f in os.listdir(extract_folder_path)
        if pathlib.Path(os.path.join(extract_folder_path, f)).is_dir()
    ]
    if len(subfolders) == 1:
        # The archive wrapped everything in a single folder: flatten it.
        subfolder_path = os.path.join(extract_folder_path, subfolders[0])
        for item in os.listdir(subfolder_path):
            shutil.move(
                os.path.join(subfolder_path, item),
                os.path.join(extract_folder_path, item),
            )
        pathlib.Path(subfolder_path).rmdir()

    for item in os.listdir(extract_folder_path):
        source_path = os.path.join(extract_folder_path, item)
        if ".pth" in item:
            new_file_name = model_name + ".pth"
        elif ".index" in item:
            new_file_name = model_name + ".index"
        else:
            continue

        destination_path = os.path.join(extract_folder_path, new_file_name)
        # Never overwrite: only the first matching entry gets the canonical name.
        if not pathlib.Path(destination_path).exists():
            pathlib.Path(source_path).rename(destination_path)
rvc_logic/rvc/lib/tools/prerequisites_download.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TYPE_CHECKING
2
+
3
+ import lazy_loader as lazy
4
+
5
+ import os
6
+ import pathlib
7
+ from concurrent.futures import ThreadPoolExecutor
8
+
9
+ from rvc_logic.common import (
10
+ EMBEDDER_MODELS_DIR,
11
+ PRETRAINED_MODELS_DIR,
12
+ RVC_MODELS_DIR,
13
+ )
14
+
15
+ if TYPE_CHECKING:
16
+ import requests
17
+
18
+ import tqdm
19
+
20
+ else:
21
+ tqdm = lazy.load("tqdm")
22
+ requests = lazy.load("requests")
23
+
24
+
25
# Root URL under which every remote resource below is resolved.
url_base = "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources"

# Each *_list is a list of (remote_folder, [file names]) pairs. The remote
# folder is appended to url_base and also used as the lookup key into
# folder_mapping_list to find the local destination directory.
pretraineds_hifigan_list = [
    (
        "pretrained_v2/",
        [
            "f0D32k.pth",
            "f0D40k.pth",
            "f0D48k.pth",
            "f0G32k.pth",
            "f0G40k.pth",
            "f0G48k.pth",
        ],
    ),
]
pretraineds_refinegan_list = [("refinegan/", ["f0D32k.pth", "f0G32k.pth"])]
models_list = [("predictors/", ["rmvpe.pt", "fcpe.pt"])]
embedders_list = [
    ("embedders/contentvec/", ["pytorch_model.bin", "config.json"]),
    ("embedders/chinese_hubert_base/", ["pytorch_model.bin", "config.json"]),
    ("embedders/japanese_hubert_base/", ["pytorch_model.bin", "config.json"]),
    ("embedders/korean_hubert_base/", ["pytorch_model.bin", "config.json"]),
    ("embedders/spin/", ["pytorch_model.bin", "config.json"]),
    ("embedders/spin-v2/", ["pytorch_model.bin", "config.json"]),
]
# The empty remote folder has no entry in folder_mapping_list, so lookups
# with a "" default resolve these files relative to the working directory.
executables_list = [
    ("", ["ffmpeg.exe", "ffprobe.exe"]),
]

# Maps a remote folder key to the local directory its files are stored in.
# NOTE(review): "formant/" is not referenced by any list defined above.
folder_mapping_list = {
    "pretrained_v2/": str(PRETRAINED_MODELS_DIR / "hifi-gan/"),
    "refinegan/": str(PRETRAINED_MODELS_DIR / "refinegan/"),
    "embedders/contentvec/": str(EMBEDDER_MODELS_DIR / "contentvec/"),
    "embedders/chinese_hubert_base/": str(
        EMBEDDER_MODELS_DIR / "chinese_hubert_base/",
    ),
    "embedders/japanese_hubert_base/": str(
        EMBEDDER_MODELS_DIR / "japanese_hubert_base/",
    ),
    "embedders/korean_hubert_base/": str(
        EMBEDDER_MODELS_DIR / "korean_hubert_base/",
    ),
    "embedders/spin/": str(EMBEDDER_MODELS_DIR / "spin/"),
    "embedders/spin-v2/": str(EMBEDDER_MODELS_DIR / "spin-v2/"),
    "predictors/": str(RVC_MODELS_DIR / "predictors/"),
    "formant/": str(RVC_MODELS_DIR / "formant/"),
}
72
+
73
+
74
def get_file_size_if_missing(file_list):
    """
    Sum the remote sizes of the files in ``file_list`` that are absent locally.

    ``file_list`` holds ``(remote_folder, [file names])`` pairs. For every
    file whose local destination does not exist, a HEAD request is issued
    and its ``content-length`` header (0 when absent) is added to the total.
    """
    missing_urls = [
        f"{url_base}/{remote_folder}{name}"
        for remote_folder, names in file_list
        for name in names
        if not pathlib.Path(
            os.path.join(folder_mapping_list.get(remote_folder, ""), name),
        ).exists()
    ]
    return sum(
        int(requests.head(missing_url).headers.get("content-length", 0))
        for missing_url in missing_urls
    )
88
+
89
+
90
def download_file(url, destination_path, global_bar):
    """
    Download ``url`` to ``destination_path``, advancing ``global_bar`` by the
    number of bytes written for every received chunk.

    The payload is streamed into a sibling ``<name>.part`` file and moved
    into place only after the transfer completed, so an interrupted or
    failed download never leaves a truncated file that a later
    "already exists" check would mistake for a finished one.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (previously the error
        page body would have been saved as the file).
    requests.RequestException
        On connection problems or when the server stalls past the timeout.

    """
    destination = pathlib.Path(destination_path)
    dir_name = os.path.dirname(destination_path)
    if dir_name:
        pathlib.Path(dir_name).mkdir(exist_ok=True, parents=True)

    partial = destination.with_name(destination.name + ".part")
    block_size = 1024
    try:
        # The context manager releases the connection even on errors; the
        # timeout keeps a dead connection from blocking a worker forever.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()
            with partial.open("wb") as file:
                for data in response.iter_content(block_size):
                    file.write(data)
                    global_bar.update(len(data))
        partial.replace(destination)
    finally:
        # Only present here when the transfer did not complete.
        if partial.exists():
            partial.unlink()
104
+
105
+
106
def download_mapping_files(file_mapping_list, global_bar):
    """
    Fetch, concurrently, every file of ``file_mapping_list`` that is missing
    locally, reporting progress through ``global_bar``.

    One ``download_file`` task is submitted per missing file to a thread
    pool; the call returns once all tasks finished and re-raises the first
    task exception encountered.
    """

    def _missing_files():
        # Yield (url, local destination) for each file not yet on disk.
        for remote_folder, names in file_mapping_list:
            local_folder = folder_mapping_list.get(remote_folder, "")
            for name in names:
                target = os.path.join(local_folder, name)
                if not pathlib.Path(target).exists():
                    yield f"{url_base}/{remote_folder}{name}", target

    with ThreadPoolExecutor() as pool:
        jobs = [
            pool.submit(download_file, url, target, global_bar)
            for url, target in _missing_files()
        ]
        for job in jobs:
            job.result()
129
+
130
+
131
def split_pretraineds(pretrained_list):
    """
    Partition ``(folder, files)`` pairs by whether a file name starts with "f0".

    Returns a tuple ``(f0_list, non_f0_list)``; each keeps the folder order
    of the input and omits folders that contribute no file to that side.
    """
    with_f0, without_f0 = [], []
    for folder, files in pretrained_list:
        for wants_f0, bucket in ((True, with_f0), (False, without_f0)):
            selected = [name for name in files if name.startswith("f0") is wants_f0]
            if selected:
                bucket.append((folder, selected))
    return with_f0, without_f0
142
+
143
+
144
+ pretraineds_hifigan_list, _ = split_pretraineds(pretraineds_hifigan_list)
145
+
146
+
147
def calculate_total_size(
    pretraineds_hifigan,
    models,
    exe,
):
    """
    Return the number of bytes still to be downloaded for the selected categories.

    ``models`` adds the predictor and embedder files, ``exe`` adds the
    executables on Windows only; the given ``pretraineds_hifigan`` list and
    the refinegan pretraineds are always included.
    """
    selected_groups = []
    if models:
        selected_groups += [models_list, embedders_list]
    if exe and os.name == "nt":
        selected_groups.append(executables_list)
    selected_groups += [pretraineds_hifigan, pretraineds_refinegan_list]
    return sum(get_file_size_if_missing(group) for group in selected_groups)
164
+
165
+
166
def prequisites_download_pipeline(
    pretraineds_hifigan: bool = True,
    models: bool = True,
    exe: bool = True,
) -> None:
    """
    Manage the download pipeline for different categories of files.

    Parameters
    ----------
    pretraineds_hifigan : bool
        Download the HiFi-GAN pretrained weights.
    models : bool
        Download the predictor models and the embedders.
    exe : bool
        Download the executables; only acted upon when ``os.name == "nt"``.

    Nothing is downloaded when the computed outstanding size is zero.

    """
    total_size = calculate_total_size(
        pretraineds_hifigan_list if pretraineds_hifigan else [],
        models,
        exe,
    )

    if total_size > 0:
        # A single progress bar is shared by all download groups.
        with tqdm.tqdm(
            total=total_size,
            unit="iB",
            unit_scale=True,
            desc="Downloading all files",
        ) as global_bar:
            if models:
                download_mapping_files(models_list, global_bar)
                download_mapping_files(embedders_list, global_bar)
            if exe:
                if os.name == "nt":
                    download_mapping_files(executables_list, global_bar)
                else:
                    print("No executables needed")
            if pretraineds_hifigan:
                download_mapping_files(pretraineds_hifigan_list, global_bar)
                # NOTE(review): the nesting of the next call was reconstructed
                # from a view without indentation; confirm whether refinegan
                # is meant to be gated by `pretraineds_hifigan` (the size
                # calculation counts refinegan unconditionally).
                download_mapping_files(pretraineds_refinegan_list, global_bar)
198
+ )
rvc_logic/rvc/lib/tools/pretrained_selector.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pathlib
3
+
4
+ from rvc_logic.common import PRETRAINED_MODELS_DIR
5
+
6
+
7
def pretrained_selector(vocoder: str, sample_rate: int) -> tuple[str, str]:
    """
    Locate the pretrained generator/discriminator pair for a vocoder.

    The files are looked up as ``f0G<NN>k.pth`` and ``f0D<NN>k.pth`` inside
    ``PRETRAINED_MODELS_DIR/<vocoder lower-cased>``, where ``<NN>`` is the
    first two digits of ``sample_rate``.

    Returns ``(generator_path, discriminator_path)`` when both files exist,
    otherwise ``("", "")``.
    """
    vocoder_dir = os.path.join(PRETRAINED_MODELS_DIR, f"{vocoder.lower()}")
    rate_tag = str(sample_rate)[:2]
    generator_path, discriminator_path = (
        os.path.join(vocoder_dir, f"f0{role}{rate_tag}k.pth") for role in ("G", "D")
    )

    both_present = (
        pathlib.Path(generator_path).exists()
        and pathlib.Path(discriminator_path).exists()
    )
    if not both_present:
        return "", ""
    return generator_path, discriminator_path
16
+ "
rvc_logic/rvc/lib/tools/split_audio.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ import librosa
4
+
5
+
6
def process_audio(audio, sr=16000, silence_thresh=-60, min_silence_len=250):
    """
    Split an audio signal into its non-silent segments.

    Silence detection is delegated to ``librosa.effects.split`` with an
    analysis frame of ``min_silence_len`` milliseconds and a hop of half
    that frame.

    Parameters
    ----------
    - audio (np.ndarray): The audio signal to split.
    - sr (int): The sample rate of the input audio (default is 16000).
    - silence_thresh (int): Silence threshold in dB (default -60); its
      negation is passed to librosa as ``top_db``.
    - min_silence_len (int): Frame duration in milliseconds (default 250).

    Returns
    -------
    - list of np.ndarray: The slices of ``audio`` for each returned interval.
    - np.ndarray: The intervals returned by ``librosa.effects.split``.

    """
    window = int(min_silence_len / 1000 * sr)
    split_options = {
        "top_db": -silence_thresh,
        "frame_length": window,
        "hop_length": window // 2,
    }
    intervals = librosa.effects.split(audio, **split_options)

    audio_segments = []
    for start, end in intervals:
        audio_segments.append(audio[start:end])

    return audio_segments, intervals
34
+
35
+
36
def merge_audio(audio_segments_org, audio_segments_new, intervals, sr_orig, sr_new):
    """
    Merges audio segments back into a single audio signal, filling gaps with silence.
    Assumes audio segments are already at sr_new.

    Parameters
    ----------
    - audio_segments_org (list of np.ndarray): The non-silent audio segments (at sr_orig).
    - audio_segments_new (list of np.ndarray): The non-silent audio segments (at sr_new).
    - intervals (np.ndarray): The intervals used for splitting the original audio.
    - sr_orig (int): The sample rate of the original audio
    - sr_new (int): The sample rate of the model

    Returns
    -------
    - np.ndarray: The merged audio signal with silent gaps restored. When
      there are no segments at all (e.g. fully silent input) an empty
      float32 array is returned instead of raising ``IndexError``.

    """
    if len(audio_segments_new) == 0:
        # No segment to take the dtype from; float32 is an arbitrary default.
        return np.array([], dtype=np.float32)

    dtype = audio_segments_new[0].dtype
    sr_ratio = sr_new / sr_orig
    last_index = len(intervals) - 1

    # Collect all pieces and concatenate once at the end: concatenating
    # inside the loop re-copies the growing result on every iteration.
    pieces = [np.array([], dtype=dtype)]

    for i, (start, end) in enumerate(intervals):
        start_new = int(start * sr_ratio)
        end_new = int(end * sr_ratio)

        # Difference, in seconds, between the converted and original segment.
        original_duration = len(audio_segments_org[i]) / sr_orig
        new_duration = len(audio_segments_new[i]) / sr_new
        duration_diff = new_duration - original_duration
        silence_compensation = np.zeros(
            int(abs(duration_diff) * sr_new),
            dtype=dtype,
        )

        # Leading silence before the very first segment.
        if i == 0 and start_new > 0:
            pieces.append(np.zeros(start_new, dtype=dtype))

        if duration_diff > 0:
            pieces.append(silence_compensation)

        pieces.append(audio_segments_new[i])

        if duration_diff < 0:
            pieces.append(silence_compensation)

        # Restore the silent gap up to the start of the next segment.
        if i < last_index:
            next_start_new = int(intervals[i + 1][0] * sr_ratio)
            silence_duration = next_start_new - end_new
            if silence_duration > 0:
                pieces.append(np.zeros(silence_duration, dtype=dtype))

    return np.concatenate(pieces)
rvc_logic/rvc/lib/tools/tts.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import pathlib
3
+ import sys
4
+
5
+ import edge_tts
6
+
7
+
8
async def main():
    """
    Synthesize speech with edge-tts from command line arguments.

    Positional arguments (``sys.argv[1:6]``): text file path, literal text,
    voice name, integer speaking-rate offset in percent, output file path.
    When the text file exists its content replaces the literal text; it is
    read as UTF-8 first and with the default encoding as a fallback.
    """
    # Parse command line arguments
    argv = sys.argv
    tts_file = str(argv[1])
    text = str(argv[2])
    voice = str(argv[3])
    rate = int(argv[4])
    output_file = str(argv[5])

    # edge-tts expects an explicitly signed percentage, e.g. "+0%" or "-5%".
    rates = f"{rate:+d}%"

    source = pathlib.Path(tts_file)
    if tts_file and source.exists():
        try:
            text = source.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            text = source.read_text()

    communicator = edge_tts.Communicate(text, voice, rate=rates)
    await communicator.save(output_file)
26
+ # print(f"TTS with {voice} completed. Output TTS file: '{output_file}'")
27
+
28
+
29
+ if __name__ == "__main__":
30
+ asyncio.run(main())
rvc_logic/rvc/lib/tools/tts_voices.json ADDED
The diff for this file is too large to render. See raw diff
 
rvc_logic/rvc/lib/utils.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import pathlib
4
+ import re
5
+ import sys
6
+ import unicodedata
7
+ import warnings
8
+
9
+ import soxr
10
+
11
+ import wget
12
+
13
+ import numpy as np
14
+
15
+ from torch import nn
16
+ from transformers import HubertModel
17
+
18
+ import librosa
19
+ import soundfile as sf
20
+
21
+ from rvc_logic.common import RVC_MODELS_DIR
22
+
23
+ # Remove this to see warnings about transformers models
24
+ warnings.filterwarnings("ignore")
25
+
26
+ logging.getLogger("fairseq").setLevel(logging.ERROR)
27
+ logging.getLogger("faiss.loader").setLevel(logging.ERROR)
28
+ logging.getLogger("transformers").setLevel(logging.ERROR)
29
+ logging.getLogger("torch").setLevel(logging.ERROR)
30
+
31
+ now_dir = pathlib.Path.cwd()
32
+ sys.path.append(str(now_dir))
33
+
34
+ base_path = os.path.join(str(RVC_MODELS_DIR), "formant", "stftpitchshift")
35
+ stft = base_path + ".exe" if sys.platform == "win32" else base_path
36
+
37
+
38
class HubertModelWithFinalProj(HubertModel):
    """``HubertModel`` extended with a ``final_proj`` linear layer."""

    def __init__(self, config):
        super().__init__(config)
        # Projection from the encoder width to the classifier projection size.
        in_features = config.hidden_size
        out_features = config.classifier_proj_size
        self.final_proj = nn.Linear(in_features, out_features)
42
+
43
+
44
def load_audio_16k(file):
    """
    Load an audio file at 16 kHz and return it as a flat array.

    Used by the f0 and feature extraction steps, which read preprocessed
    16 kHz files; ``librosa.load`` is still asked for ``sr=16000``, so a
    file at another rate would be resampled by librosa.

    Raises
    ------
    RuntimeError
        If loading fails; the original exception is attached as the cause.

    """
    try:
        audio, _ = librosa.load(file, sr=16000)
    except Exception as error:
        raise RuntimeError(f"An error occurred loading the audio: {error}") from error

    return audio.flatten()
52
+
53
+
54
def load_audio(file, sample_rate):
    """
    Read an audio file, down-mix it to mono and resample it to ``sample_rate``.

    Surrounding spaces, double quotes and newlines are stripped from the
    path before it is opened.

    Returns
    -------
    A flat array with the audio at ``sample_rate``.

    Raises
    ------
    RuntimeError
        If reading or resampling fails; the original exception is attached
        as the cause.

    """
    try:
        file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        audio, sr = sf.read(file)
        if len(audio.shape) > 1:
            # soundfile returns (frames, channels); librosa expects channels first.
            audio = librosa.to_mono(audio.T)
        if sr != sample_rate:
            audio = librosa.resample(
                audio,
                orig_sr=sr,
                target_sr=sample_rate,
                res_type="soxr_vhq",
            )
    except Exception as error:
        raise RuntimeError(f"An error occurred loading the audio: {error}") from error

    return audio.flatten()
71
+
72
+
73
def load_audio_infer(
    file,
    sample_rate,
    **kwargs,
):
    """
    Load audio for inference: mono, resampled, optionally formant-shifted.

    Parameters
    ----------
    file : str
        Path to the audio file; surrounding spaces, double quotes and
        newlines are stripped first.
    sample_rate : int
        Target sample rate.
    **kwargs
        ``formant_shifting`` (default False) enables formant shifting with
        ``formant_qfrency`` (default 0.8, scaled by 1e-3 before use as the
        quefrency) and ``formant_timbre`` (default 0.8, used as distortion).

    Returns
    -------
    A flat ``np.ndarray`` with the processed audio.

    Raises
    ------
    RuntimeError
        If the file is missing or any processing step fails; the original
        exception is attached as the cause.

    """
    formant_shifting = kwargs.get("formant_shifting", False)
    try:
        file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        if not pathlib.Path(file).is_file():
            raise FileNotFoundError(f"File not found: {file}")
        audio, sr = sf.read(file)
        if len(audio.shape) > 1:
            # soundfile returns (frames, channels); librosa expects channels first.
            audio = librosa.to_mono(audio.T)
        if sr != sample_rate:
            audio = librosa.resample(
                audio,
                orig_sr=sr,
                target_sr=sample_rate,
                res_type="soxr_vhq",
            )
        if formant_shifting:
            formant_qfrency = kwargs.get("formant_qfrency", 0.8)
            formant_timbre = kwargs.get("formant_timbre", 0.8)

            # Imported lazily so the dependency is only needed when used.
            from stftpitchshift import StftPitchShift

            pitchshifter = StftPitchShift(1024, 32, sample_rate)
            # factors=1 keeps the pitch; only quefrency/distortion are applied.
            audio = pitchshifter.shiftpitch(
                audio,
                factors=1,
                quefrency=formant_qfrency * 1e-3,
                distortion=formant_timbre,
            )
    except Exception as error:
        raise RuntimeError(f"An error occurred loading the audio: {error}") from error
    return np.array(audio).flatten()
109
+
110
+
111
def format_title(title):
    """
    Turn an arbitrary title into a file-system friendly name.

    The title is NFC-normalised, box-drawing characters (U+2500-U+257F) are
    dropped, every character that is not a word character, whitespace,
    ``.`` or ``-`` is removed, and each whitespace run becomes one ``_``.
    """
    substitutions = (
        (r"[\u2500-\u257F]+", "", 0),
        (r"[^\w\s.-]", "", re.UNICODE),
        (r"\s+", "_", 0),
    )
    cleaned = unicodedata.normalize("NFC", title)
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned
117
+
118
+
119
def load_embedding(embedder_model, custom_embedder=None):
    """
    Load an embedder as a ``HubertModelWithFinalProj``.

    Parameters
    ----------
    embedder_model : str
        One of ``"contentvec"``, ``"spin"``, ``"spin-v2"``,
        ``"chinese-hubert-base"``, ``"japanese-hubert-base"``,
        ``"korean-hubert-base"`` or ``"custom"``.
    custom_embedder : str, optional
        Path to a model directory, used when ``embedder_model`` is
        ``"custom"``. If it is missing or does not exist, contentvec is
        used instead.

    Bundled embedders live under ``RVC_MODELS_DIR/embedders/<folder>``;
    their ``pytorch_model.bin`` and ``config.json`` are downloaded when
    absent. This also applies to the contentvec fallback, so the fallback
    works on a fresh installation.

    Raises
    ------
    KeyError
        If ``embedder_model`` is not one of the names listed above.

    """
    embedder_root = os.path.join(str(RVC_MODELS_DIR), "embedders")
    remote_root = (
        "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders"
    )
    # Embedder name -> folder name, identical locally and on the remote.
    folder_names = {
        "contentvec": "contentvec",
        "spin": "spin",
        "spin-v2": "spin-v2",
        "chinese-hubert-base": "chinese_hubert_base",
        "japanese-hubert-base": "japanese_hubert_base",
        "korean-hubert-base": "korean_hubert_base",
    }

    bundled_name = embedder_model
    if embedder_model == "custom":
        # Guard against None before touching the file system.
        if custom_embedder and pathlib.Path(custom_embedder).exists():
            return HubertModelWithFinalProj.from_pretrained(custom_embedder)
        print(f"Custom embedder not found: {custom_embedder}, using contentvec")
        bundled_name = "contentvec"

    folder = folder_names[bundled_name]
    model_path = os.path.join(embedder_root, folder)
    pathlib.Path(model_path).mkdir(exist_ok=True, parents=True)

    for file_name in ("pytorch_model.bin", "config.json"):
        local_file = os.path.join(model_path, file_name)
        if not pathlib.Path(local_file).exists():
            url = f"{remote_root}/{folder}/{file_name}"
            print(f"Downloading {url} to {model_path}...")
            wget.download(url, out=local_file)

    models = HubertModelWithFinalProj.from_pretrained(model_path)
    return models
194
+ s
rvc_logic/rvc/lib/zluda.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
# Everything below only takes effect when the active CUDA device reports a
# name ending in "[ZLUDA]"; on any other device this module changes nothing.
if torch.cuda.is_available() and torch.cuda.get_device_name().endswith("[ZLUDA]"):

    class STFT:
        """Short-time Fourier transform computed as a matrix product with a cached Fourier basis."""

        def __init__(self):
            self.device = "cuda"
            self.fourier_bases = {}  # Cache for Fourier bases

        def _get_fourier_basis(self, n_fft):
            """Return the stacked real/imaginary Fourier basis for ``n_fft``, building and caching it on first use."""
            # Check if the basis for this n_fft is already cached
            if n_fft in self.fourier_bases:
                return self.fourier_bases[n_fft]
            # The FFT of the identity matrix is evaluated on the CPU and
            # only the result is moved to self.device.
            fourier_basis = torch.fft.fft(torch.eye(n_fft, device="cpu")).to(
                self.device,
            )
            # stack separated real and imaginary components and convert to torch tensor
            cutoff = n_fft // 2 + 1
            fourier_basis = torch.cat(
                [fourier_basis.real[:cutoff], fourier_basis.imag[:cutoff]],
                dim=0,
            )
            # cache the tensor and return
            self.fourier_bases[n_fft] = fourier_basis
            return fourier_basis

        def transform(self, input, n_fft, hop_length, window):
            """
            Compute the transform of ``input`` and return it as a complex tensor.

            The input is reflect-padded by ``n_fft // 2`` on both sides of
            its last dimension, unfolded along dimension 1 into frames of
            ``n_fft`` samples every ``hop_length`` samples, and multiplied
            by the windowed basis.
            """
            # fetch cached Fourier basis
            fourier_basis = self._get_fourier_basis(n_fft)
            # apply hann window to Fourier basis
            fourier_basis = fourier_basis * window
            # pad input to center with reflect
            pad_amount = n_fft // 2
            input = torch.nn.functional.pad(
                input,
                (pad_amount, pad_amount),
                mode="reflect",
            )
            # separate input into n_fft-sized frames
            input_frames = input.unfold(1, n_fft, hop_length).permute(0, 2, 1)
            # apply fft to each frame
            fourier_transform = torch.matmul(fourier_basis, input_frames)
            # The first `cutoff` rows hold the real parts, the rest the
            # imaginary parts (see _get_fourier_basis).
            cutoff = n_fft // 2 + 1
            return torch.complex(
                fourier_transform[:, :cutoff, :],
                fourier_transform[:, cutoff:, :],
            )

    stft = STFT()
    # Keep a handle on the original implementation for the fallback path.
    _torch_stft = torch.stft

    def z_stft(input: torch.Tensor, window: torch.Tensor, *args, **kwargs):
        """
        Replacement for ``torch.stft``.

        Calls made without ``win_length`` and ``center`` and with
        ``return_complex=True`` go through ``STFT.transform``; every other
        call runs the original ``torch.stft`` on CPU copies of ``input``
        and ``window`` and moves the result back to ``input.device``.
        """
        # only optimizing a specific call from rvc.train.mel_processing.MultiScaleMelSpectrogramLoss
        if (
            kwargs.get("win_length") == None
            and kwargs.get("center") == None
            and kwargs.get("return_complex") == True
        ):
            # use GPU accelerated calculation
            return stft.transform(
                input,
                kwargs.get("n_fft"),
                kwargs.get("hop_length"),
                window,
            )
        # simply do the operation on CPU
        return _torch_stft(
            input=input.cpu(),
            window=window.cpu(),
            *args,
            **kwargs,
        ).to(input.device)

    def z_jit(f, *_, **__):
        """Replacement for ``torch.jit.script``: attach an empty graph to ``f`` and return ``f`` itself."""
        f.graph = torch._C.Graph()
        return f

    # hijacks
    torch.stft = z_stft
    torch.jit.script = z_jit
    # disabling unsupported cudnn
    torch.backends.cudnn.enabled = False
    # Of the scaled-dot-product-attention backends only the math one stays enabled.
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_math_sdp(True)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
rvc_logic/rvc/train/anyprecision_optimizer.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # AnyPrecisionAdamW: a flexible precision AdamW optimizer
8
+ # with optional Kahan summation for high precision weight updates.
9
+ # Allows direct control over momentum, variance and auxiliary compensation
10
+ # buffer dtypes.
11
+ # Optional Kahan summation is used to offset precision reduction for
12
+ # the weight updates. This allows full training in BFloat16 (equal or
13
+ # better than FP32 results in many cases) due to high precision weight updates.
14
+
15
+ import torch
16
+ from torch.optim.optimizer import Optimizer
17
+
18
+
19
class AnyPrecisionAdamW(Optimizer):
    """AdamW with configurable state dtypes and optional Kahan-compensated weight updates."""

    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0.0,
        use_kahan_summation=True,  # NOTE default upstream is True
        momentum_dtype=torch.bfloat16,  # NOTE default upstream is torch.float32,
        variance_dtype=torch.bfloat16,
        compensation_buffer_dtype=torch.bfloat16,
    ):
        """
        Args:
            params (iterable): iterable of parameters to optimize or dicts defining
                parameter groups
            lr (float, optional): learning rate (default: 1e-3)
            betas (Tuple[float, float], optional): coefficients used for computing
                running averages of gradient and its square (default: (0.9, 0.999))
            eps (float, optional): term added to the denominator to improve
                numerical stability (default: 1e-8)
            weight_decay (float, optional): weight decay coefficient (default: 0.0)

            # Any Precision specific
            use_kahan_summation = creates auxiliary buffer to ensure high precision
            model param updates (default: True)
            momentum_dtype = dtype for momentum  (default: torch.bfloat16)
            variance_dtype = dtype for uncentered variance (default: torch.bfloat16)
            compensation_buffer_dtype  = dtype for Kahan summation
                                         buffer (default: torch.bfloat16). Only used if
                                         ``use_kahan_summation=True``.

            # Usage
            This optimizer implements optimizer states, and Kahan summation
            for high precision updates, all in user controlled dtypes.
            Defaults are variance in BF16, Momentum in BF16.
            This can be run in FSDP mixed precision, amp, or full precision,
            depending on what training pipeline you wish to work with.

            Setting to use_kahan_summation = False, and changing momentum and
            variance dtypes to FP32, reverts this to a standard AdamW optimizer.

        """
        defaults = dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            use_kahan_summation=use_kahan_summation,
            momentum_dtype=momentum_dtype,
            variance_dtype=variance_dtype,
            compensation_buffer_dtype=compensation_buffer_dtype,
        )

        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """
        Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure
            was given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.

        """
        loss = None
        if closure is not None:
            # Gradients are disabled for the update itself, so re-enable
            # them while the closure recomputes the loss.
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:

            beta1, beta2 = group["betas"]
            lr = group["lr"]
            weight_decay = group["weight_decay"]
            eps = group["eps"]
            use_kahan_summation = group["use_kahan_summation"]

            momentum_dtype = group["momentum_dtype"]
            variance_dtype = group["variance_dtype"]
            compensation_buffer_dtype = group["compensation_buffer_dtype"]

            for p in group["params"]:
                if p.grad is None:
                    continue

                if p.grad.is_sparse:
                    raise RuntimeError(
                        "AnyPrecisionAdamW does not support sparse gradients"
                    )

                state = self.state[p]

                # State initialization
                if len(state) == 0:

                    state["step"] = torch.tensor(0.0)

                    # momentum - EMA of gradient values
                    state["exp_avg"] = torch.zeros_like(
                        p,
                        dtype=momentum_dtype,
                    )

                    # variance uncentered - EMA of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(
                        p,
                        dtype=variance_dtype,
                    )

                    # optional Kahan summation - accumulated error tracker
                    if use_kahan_summation:
                        state["compensation"] = torch.zeros_like(
                            p,
                            dtype=compensation_buffer_dtype,
                        )

                # main processing -------------------------

                # update the steps for each param group update
                state["step"] += 1
                step = state["step"]

                exp_avg = state["exp_avg"]
                exp_avg_sq = state["exp_avg_sq"]

                grad = p.grad

                # weight decay, AdamW style
                if weight_decay:
                    p.data.mul_(1 - lr * weight_decay)

                # update momentum
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                # update uncentered variance
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                # adjust using bias1
                bias_correction1 = 1 - beta1**step

                step_size = lr / bias_correction1

                # adjust using bias2
                denom_correction = (1 - beta2**step) ** 0.5  # avoids math import

                centered_variance = (exp_avg_sq.sqrt() / denom_correction).add_(
                    eps, alpha=1
                )

                # lr update to compensation
                if use_kahan_summation:
                    compensation = state["compensation"]

                    compensation.addcdiv_(exp_avg, centered_variance, value=-step_size)

                    # update weights with compensation (Kahan summation)
                    # save error back to compensation for next iteration
                    temp_buffer = p.detach().clone()
                    p.data.add_(compensation)
                    compensation.add_(temp_buffer.sub_(p.data))

                else:
                    # usual AdamW updates
                    p.data.addcdiv_(exp_avg, centered_variance, value=-step_size)

        return loss
rvc_logic/rvc/train/data_utils.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pathlib
2
+
3
+ import numpy as np
4
+
5
+ import torch
6
+ import torch.utils.data
7
+
8
+ from rvc_logic.rvc.train.mel_processing import spectrogram_torch
9
+ from rvc_logic.rvc.train.utils import load_filepaths_and_text, load_wav_to_torch
10
+
11
+
12
class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
    """
    Dataset that loads text and audio pairs.

    Each item is a tuple ``(spec, wav, phone, pitch, pitchf, dv)`` built from
    one five-field entry ``[audiopath, text, pitch, pitchf, dv]`` returned by
    ``load_filepaths_and_text``.

    Args:
        hparams: Hyperparameters. Must provide ``training_files``,
            ``max_wav_value``, ``sample_rate``, ``filter_length``,
            ``hop_length`` and ``win_length``; ``min_text_len`` and
            ``max_text_len`` are optional (defaults 1 and 5000).

    """

    # Upper bound on the number of label frames kept by ``get_labels``.
    MAX_LABEL_FRAMES = 900

    def __init__(self, hparams):
        self.audiopaths_and_text = load_filepaths_and_text(hparams.training_files)
        # Stored for callers; this class does not rescale audio with it.
        self.max_wav_value = hparams.max_wav_value
        self.sample_rate = hparams.sample_rate
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.win_length = hparams.win_length
        self.min_text_len = getattr(hparams, "min_text_len", 1)
        self.max_text_len = getattr(hparams, "max_text_len", 5000)
        self._filter()

    def _filter(self):
        """
        Filters audio paths and text pairs based on text length.

        Keeps entries whose second field has a length within
        ``[min_text_len, max_text_len]`` and stores, per kept entry, a length
        estimate derived from the audio file size in ``self.lengths``.
        """
        kept_entries = []
        lengths = []
        for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text:
            if self.min_text_len <= len(text) <= self.max_text_len:
                kept_entries.append([audiopath, text, pitch, pitchf, dv])
                # Length estimate taken from the byte size of the audio file.
                lengths.append(
                    pathlib.Path(audiopath).stat().st_size // (3 * self.hop_length)
                )
        self.audiopaths_and_text = kept_entries
        self.lengths = lengths

    def get_sid(self, sid):
        """
        Converts speaker ID to a LongTensor.

        Args:
            sid (str): Speaker ID.

        Returns:
            A one-element LongTensor; falls back to ``[0]`` (after printing a
            message) when ``sid`` cannot be parsed as an integer.

        """
        try:
            sid = torch.LongTensor([int(sid)])
        except ValueError as error:
            print(f"Error converting speaker ID '{sid}' to integer. Exception: {error}")
            sid = torch.LongTensor([0])
        return sid

    def get_audio_text_pair(self, audiopath_and_text):
        """
        Loads and processes audio and text data for a single pair.

        Args:
            audiopath_and_text (list): List containing audio path, text, pitch, pitchf, and speaker ID.

        Returns:
            Tuple ``(spec, wav, phone, pitch, pitchf, dv)``. When the phone and
            spectrogram frame counts differ, everything is trimmed to the
            shorter of the two (the waveform to ``frames * hop_length``).

        """
        file, phone, pitch, pitchf, dv = audiopath_and_text[:5]

        phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf)
        spec, wav = self.get_audio(file)
        dv = self.get_sid(dv)

        len_phone = phone.size()[0]
        len_spec = spec.size()[-1]
        if len_phone != len_spec:
            len_min = min(len_phone, len_spec)
            len_wav = len_min * self.hop_length

            spec = spec[:, :len_min]
            wav = wav[:, :len_wav]

            phone = phone[:len_min, :]
            pitch = pitch[:len_min]
            pitchf = pitchf[:len_min]

        return (spec, wav, phone, pitch, pitchf, dv)

    def get_labels(self, phone, pitch, pitchf):
        """
        Loads and processes phoneme, pitch, and pitchf labels.

        Args:
            phone (str): Path to phoneme label file.
            pitch (str): Path to pitch label file.
            pitchf (str): Path to pitchf label file.

        Returns:
            Tuple ``(phone, pitch, pitchf)`` as Float, Long and Float tensors,
            each truncated to at most ``MAX_LABEL_FRAMES`` frames.

        """
        # Each phone frame is duplicated along axis 0 before truncation.
        phone = np.repeat(np.load(phone), 2, axis=0)
        pitch = np.load(pitch)
        pitchf = np.load(pitchf)

        n_num = min(phone.shape[0], self.MAX_LABEL_FRAMES)
        phone = torch.FloatTensor(phone[:n_num, :])
        pitch = torch.LongTensor(pitch[:n_num])
        pitchf = torch.FloatTensor(pitchf[:n_num])
        return phone, pitch, pitchf

    @staticmethod
    def _spec_path(filename):
        """Return the spectrogram cache path that sits next to ``filename``."""
        # Swap only the final suffix. A plain str.replace(".wav", ...) would
        # also rewrite ".wav" inside directory names, and for a file without
        # a ".wav" suffix it would return the audio path itself, so the cache
        # write would overwrite the audio.
        return str(pathlib.Path(filename).with_suffix(".spec.pt"))

    def _compute_and_cache_spec(self, audio_norm, spec_filename):
        """Compute the spectrogram of ``audio_norm`` and save it to ``spec_filename``."""
        spec = spectrogram_torch(
            audio_norm,
            self.filter_length,
            self.hop_length,
            self.win_length,
            center=False,
        )
        spec = torch.squeeze(spec, 0)
        torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
        return spec

    def get_audio(self, filename):
        """
        Loads and processes audio data.

        Args:
            filename (str): Path to audio file.

        Returns:
            Tuple ``(spec, audio_norm)`` where ``audio_norm`` is the loaded
            audio with a leading dimension added.

        Raises:
            ValueError: If the file's sample rate differs from ``self.sample_rate``.

        """
        audio, sample_rate = load_wav_to_torch(filename)
        if sample_rate != self.sample_rate:
            raise ValueError(
                f"{sample_rate} SR doesn't match target {self.sample_rate} SR",
            )
        audio_norm = audio.unsqueeze(0)

        spec_filename = self._spec_path(filename)
        if pathlib.Path(spec_filename).exists():
            try:
                # NOTE(review): weights_only=False unpickles arbitrary objects;
                # only safe while the cache directory is trusted.
                return torch.load(spec_filename, weights_only=False), audio_norm
            except Exception as error:
                # Best effort: an unreadable cache is rebuilt below.
                print(f"An error occurred getting spec from {spec_filename}: {error}")
        return self._compute_and_cache_spec(audio_norm, spec_filename), audio_norm

    def __getitem__(self, index):
        """
        Returns a single audio-text pair.

        Args:
            index (int): Index of the data sample.

        """
        return self.get_audio_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        """
        Returns the length of the dataset.
        """
        return len(self.audiopaths_and_text)
176
+
177
+
178
class TextAudioCollateMultiNSFsid:
    """
    Collates text and audio data for training.

    Samples are ordered by decreasing spectrogram length and zero-padded to
    the longest sample in the batch.

    Args:
        return_ids (bool, optional): Whether to return sample IDs. Defaults to False.

    """

    def __init__(self, return_ids=False):
        # Stored only; __call__ does not consult it.
        self.return_ids = return_ids

    def __call__(self, batch):
        """
        Collates a batch of data samples.

        Args:
            batch (list): List of data samples, each a tuple
                ``(spec, wav, phone, pitch, pitchf, sid)``.

        Returns:
            Tuple ``(phone_padded, phone_lengths, pitch_padded, pitchf_padded,
            spec_padded, spec_lengths, wave_padded, wave_lengths, sid)``.

        """
        n_items = len(batch)
        spec_sizes = [sample[0].size(1) for sample in batch]
        _, ids_sorted_decreasing = torch.sort(
            torch.LongTensor(spec_sizes),
            dim=0,
            descending=True,
        )

        max_spec_len = max(spec_sizes)
        max_wave_len = max(sample[1].size(1) for sample in batch)
        max_phone_len = max(sample[2].size(0) for sample in batch)

        # Zero-initialised output buffers; rows are filled in sorted order.
        spec_padded = torch.zeros(
            n_items, batch[0][0].size(0), max_spec_len, dtype=torch.float32
        )
        wave_padded = torch.zeros(n_items, 1, max_wave_len, dtype=torch.float32)
        phone_padded = torch.zeros(
            n_items, max_phone_len, batch[0][2].shape[1], dtype=torch.float32
        )
        pitch_padded = torch.zeros(n_items, max_phone_len, dtype=torch.int64)
        pitchf_padded = torch.zeros(n_items, max_phone_len, dtype=torch.float32)

        spec_lengths = torch.zeros(n_items, dtype=torch.int64)
        wave_lengths = torch.zeros(n_items, dtype=torch.int64)
        phone_lengths = torch.zeros(n_items, dtype=torch.int64)
        sid = torch.zeros(n_items, dtype=torch.int64)

        for slot, source_index in enumerate(ids_sorted_decreasing.tolist()):
            spec, wave, phone, pitch, pitchf, speaker = batch[source_index][:6]

            n_spec = spec.size(1)
            spec_padded[slot, :, :n_spec] = spec
            spec_lengths[slot] = n_spec

            n_wave = wave.size(1)
            wave_padded[slot, :, :n_wave] = wave
            wave_lengths[slot] = n_wave

            n_phone = phone.size(0)
            phone_padded[slot, :n_phone, :] = phone
            phone_lengths[slot] = n_phone

            pitch_padded[slot, : pitch.size(0)] = pitch
            pitchf_padded[slot, : pitchf.size(0)] = pitchf

            sid[slot] = speaker

        return (
            phone_padded,
            phone_lengths,
            pitch_padded,
            pitchf_padded,
            spec_padded,
            spec_lengths,
            wave_padded,
            wave_lengths,
            sid,
        )
260
+
261
+
262
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
    """
    Distributed sampler that groups data into buckets based on length.

    Every batch is drawn from a single bucket. Samples whose length falls
    outside ``(boundaries[0], boundaries[-1]]`` are assigned to no bucket and
    are never yielded.

    Args:
        dataset (torch.utils.data.Dataset): Dataset to sample from. Must expose a ``lengths`` attribute.
        batch_size (int): Batch size.
        boundaries (list): List of length boundaries for buckets.
        num_replicas (int, optional): Number of processes participating in distributed training. Defaults to None.
        rank (int, optional): Rank of the current process. Defaults to None.
        shuffle (bool, optional): Whether to shuffle the data. Defaults to True.

    """

    def __init__(
        self,
        dataset,
        batch_size,
        boundaries,
        num_replicas=None,
        rank=None,
        shuffle=True,
    ):
        super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
        self.lengths = dataset.lengths
        self.batch_size = batch_size
        # Work on a private copy: _create_buckets() removes the boundaries of
        # empty buckets, and that must not alter the caller's list.
        self.boundaries = list(boundaries)

        self.buckets, self.num_samples_per_bucket = self._create_buckets()
        self.total_size = sum(self.num_samples_per_bucket)
        self.num_samples = self.total_size // self.num_replicas

    def _create_buckets(self):
        """
        Creates buckets of data samples based on length.

        Returns:
            Tuple ``(buckets, num_samples_per_bucket)``: the sample indices per
            non-empty bucket, and each bucket's size rounded up to a multiple
            of ``num_replicas * batch_size``.
        """
        buckets = [[] for _ in range(len(self.boundaries) - 1)]
        for sample_index, length in enumerate(self.lengths):
            idx_bucket = self._bisect(length)
            if idx_bucket != -1:
                buckets[idx_bucket].append(sample_index)

        # Drop empty buckets back to front so earlier indices stay valid.
        # Removing the upper boundary merges the range into the next bucket.
        for i in range(len(buckets) - 1, -1, -1):
            if not buckets[i]:
                buckets.pop(i)
                self.boundaries.pop(i + 1)

        total_batch_size = self.num_replicas * self.batch_size
        # (-n) % m is the padding needed to reach the next multiple of m.
        num_samples_per_bucket = [
            len(bucket) + (-len(bucket)) % total_batch_size for bucket in buckets
        ]
        return buckets, num_samples_per_bucket

    def __iter__(self):
        """
        Iterates over batches of data samples.

        Returns:
            An iterator over lists of dataset indices, one list per batch.
            The batches are also stored on ``self.batches``.
        """
        # Seeded by the epoch so every replica derives the same permutation.
        g = torch.Generator()
        g.manual_seed(self.epoch)

        if self.shuffle:
            indices = [
                torch.randperm(len(bucket), generator=g).tolist()
                for bucket in self.buckets
            ]
        else:
            indices = [list(range(len(bucket))) for bucket in self.buckets]

        batches = []
        for bucket, ids_bucket, num_samples_bucket in zip(
            self.buckets, indices, self.num_samples_per_bucket
        ):
            len_bucket = len(bucket)

            # Repeat indices until the bucket reaches its padded size.
            rem = num_samples_bucket - len_bucket
            ids_bucket = (
                ids_bucket
                + ids_bucket * (rem // len_bucket)
                + ids_bucket[: (rem % len_bucket)]
            )

            # Keep only the share that belongs to this replica.
            ids_bucket = ids_bucket[self.rank :: self.num_replicas]

            # batching
            for start in range(
                0,
                (len(ids_bucket) // self.batch_size) * self.batch_size,
                self.batch_size,
            ):
                batches.append(
                    [bucket[idx] for idx in ids_bucket[start : start + self.batch_size]]
                )

        if self.shuffle:
            batch_ids = torch.randperm(len(batches), generator=g).tolist()
            batches = [batches[i] for i in batch_ids]
        self.batches = batches

        assert len(self.batches) * self.batch_size == self.num_samples
        return iter(self.batches)

    def _bisect(self, x, lo=0, hi=None):
        """
        Performs binary search to find the bucket index for a given length.

        Args:
            x (int): Length to find the bucket for.
            lo (int, optional): Lower bound of the search range. Defaults to 0.
            hi (int, optional): Upper bound of the search range. Defaults to None.

        Returns:
            Index ``i`` with ``boundaries[i] < x <= boundaries[i + 1]``, or
            ``-1`` when no bucket in the search range contains ``x``.

        """
        if hi is None:
            hi = len(self.boundaries) - 1

        if hi > lo:
            mid = (hi + lo) // 2
            if self.boundaries[mid] < x <= self.boundaries[mid + 1]:
                return mid
            if x <= self.boundaries[mid]:
                return self._bisect(x, lo, mid)
            return self._bisect(x, mid + 1, hi)
        return -1

    def __len__(self):
        """
        Returns the length of the sampler.
        """
        return self.num_samples // self.batch_size
396
+ size