# train_vocoder.py
"""Train a HiFi-GAN vocoder on the LJSpeech wav files.

The script expects an ``LJSpeech-1.1/wavs/`` directory next to this file and
uses this file's own directory as the training output path.
"""

import os

from trainer import Trainer, TrainerArgs

from TTS.utils.audio import AudioProcessor
from TTS.config.shared_configs import BaseAudioConfig
from TTS.vocoder.configs import HifiganConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN


def main():
    """Build the audio and model configuration, then run GAN training.

    Steps, in order:
      1. Resolve the output directory (this file's directory) and the wav
         directory beneath it.
      2. Describe the audio front-end with ``BaseAudioConfig``.
      3. Describe the training run with ``HifiganConfig``.
      4. Create the ``AudioProcessor``, load the wav file lists, build the
         ``GAN`` model and hand everything to ``Trainer.fit()``.
    """
    output_path = os.path.dirname(os.path.abspath(__file__))
    data_path = os.path.join(output_path, "LJSpeech-1.1/wavs/")

    # Audio front-end settings. Per the run description below these are meant
    # to produce mel spectrograms compatible with a GlowTTS model.
    audio_config = BaseAudioConfig(
        sample_rate=22050,
        resample=False,
        do_trim_silence=True,
        trim_db=45,
        fft_size=1024,
        win_length=1024,
        hop_length=256,
        # Frame sizes are given in samples (win_length / hop_length), so the
        # millisecond-based alternatives are left unset.
        frame_shift_ms=None,
        frame_length_ms=None,
        num_mels=80,
        mel_fmin=0.0,
        mel_fmax=None,
        signal_norm=True,
        symmetric_norm=True,
        max_norm=4.0,
        clip_norm=True,
        ref_level_db=20,
        min_level_db=-100,
        spec_gain=20.0,
        log_func="np.log10",
        preemphasis=0.0,
        stats_path=None,
    )

    config = HifiganConfig(
        run_name="hifigan_ljspeech",
        run_description="HiFi-GAN v1 from scratch, GlowTTS-compatible mels",
        data_path=data_path,
        output_path=output_path,
        eval_split_size=10,
        audio=audio_config,
        epochs=2000,
        batch_size=64,
        eval_batch_size=16,
        num_loader_workers=4,
        num_eval_loader_workers=2,
        run_eval=True,
        test_delay_epochs=5,
        mixed_precision=True,
        seq_len=8192,
        pad_short=2000,
        use_noise_augment=True,
        lr_gen=2e-4,
        lr_disc=2e-4,
        print_step=50,
        print_eval=False,
        save_step=5000,
        save_n_checkpoints=5,
        save_checkpoints=True,
        log_model_step=10000,
        plot_step=500,
    )

    # AudioProcessor takes the audio settings as individual keyword arguments.
    # Passing the config object under a single ``config=`` keyword would be
    # discarded by the constructor's catch-all keyword parameter, leaving the
    # processor on its defaults instead of the settings declared above.
    ap = AudioProcessor(**config.audio.to_dict())

    # load_wav_data returns the evaluation split first, then the training split.
    eval_samples, train_samples = load_wav_data(
        config.data_path,
        config.eval_split_size,
    )

    # NOTE(review): the processor reaches the model through ``training_assets``
    # below; newer TTS releases appear to expect ``GAN(config, ap)`` instead --
    # confirm against the installed TTS version.
    model = GAN(config)

    trainer = Trainer(
        TrainerArgs(),
        config,
        output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
    )
    trainer.fit()


if __name__ == "__main__":
    main()