# Flare-TTS-v1.5 / train_vocoder.py
# Provenance: LH-Tech-AI — "Create train_vocoder.py" (commit ea3793d, verified)
# train_vocoder.py
import os
from trainer import Trainer, TrainerArgs
from TTS.utils.audio import AudioProcessor
from TTS.config.shared_configs import BaseAudioConfig
from TTS.vocoder.configs import HifiganConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN
def main():
    """Train a HiFi-GAN v1 vocoder on LJSpeech.

    Builds a GlowTTS-compatible audio/mel configuration, wraps it in a
    ``HifiganConfig``, loads the wav file list, and hands everything to the
    Coqui ``Trainer``. Expects ``LJSpeech-1.1/wavs/`` next to this script;
    checkpoints and logs are written alongside it.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    wav_dir = os.path.join(script_dir, "LJSpeech-1.1/wavs/")

    # Mel/spectrogram settings — must match the acoustic model (GlowTTS)
    # that will feed this vocoder at inference time.
    mel_cfg = BaseAudioConfig(
        sample_rate=22050,
        resample=False,
        do_trim_silence=True,
        trim_db=45,
        fft_size=1024,
        win_length=1024,
        hop_length=256,
        frame_shift_ms=None,
        frame_length_ms=None,
        num_mels=80,
        mel_fmin=0.0,
        mel_fmax=None,
        signal_norm=True,
        symmetric_norm=True,
        max_norm=4.0,
        clip_norm=True,
        ref_level_db=20,
        min_level_db=-100,
        spec_gain=20.0,
        log_func="np.log10",
        preemphasis=0.0,
        stats_path=None,
    )

    # Full training recipe: run identity, data location, schedule, and
    # optimizer/logging knobs for the GAN trainer.
    train_cfg = HifiganConfig(
        run_name="hifigan_ljspeech",
        run_description="HiFi-GAN v1 from scratch, GlowTTS-compatible mels",
        data_path=wav_dir,
        output_path=script_dir,
        eval_split_size=10,
        audio=mel_cfg,
        epochs=2000,
        batch_size=64,
        eval_batch_size=16,
        num_loader_workers=4,
        num_eval_loader_workers=2,
        run_eval=True,
        test_delay_epochs=5,
        mixed_precision=True,
        seq_len=8192,
        pad_short=2000,
        use_noise_augment=True,
        lr_gen=2e-4,
        lr_disc=2e-4,
        print_step=50,
        print_eval=False,
        save_step=5000,
        save_n_checkpoints=5,
        save_checkpoints=True,
        log_model_step=10000,
        plot_step=500,
    )

    # Feature extractor shared with the data loaders and eval plots.
    processor = AudioProcessor(config=train_cfg.audio)

    # Coqui convention: the split helper yields (eval, train) in that order.
    eval_samples, train_samples = load_wav_data(
        train_cfg.data_path,
        train_cfg.eval_split_size,
    )

    gan = GAN(train_cfg)
    trainer = Trainer(
        TrainerArgs(),
        train_cfg,
        script_dir,
        model=gan,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": processor},
    )
    trainer.fit()
# Run training only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()