| |
| import os |
| from trainer import Trainer, TrainerArgs |
| from TTS.utils.audio import AudioProcessor |
| from TTS.config.shared_configs import BaseAudioConfig |
| from TTS.vocoder.configs import HifiganConfig |
| from TTS.vocoder.datasets.preprocess import load_wav_data |
| from TTS.vocoder.models.gan import GAN |
|
|
|
|
def main():
    """Train a HiFi-GAN vocoder on the LJSpeech wavs next to this script.

    Builds the audio and model configuration, splits the wav files into
    train and eval lists, and runs the ``Trainer``. The directory that
    contains this file is used both to locate ``LJSpeech-1.1/wavs/`` and as
    the output path handed to the config and the trainer.
    """
    output_path = os.path.dirname(os.path.abspath(__file__))
    data_path = os.path.join(output_path, "LJSpeech-1.1/wavs/")

    audio_config = BaseAudioConfig(
        # Loading / trimming.
        sample_rate=22050,
        resample=False,
        do_trim_silence=True,
        trim_db=45,
        # STFT framing; sizes are given in samples, so the *_ms variants
        # are left unset.
        fft_size=1024,
        win_length=1024,
        hop_length=256,
        frame_shift_ms=None,
        frame_length_ms=None,
        # Mel filterbank.
        num_mels=80,
        mel_fmin=0.0,
        mel_fmax=None,
        # Spectrogram normalisation.
        signal_norm=True,
        symmetric_norm=True,
        max_norm=4.0,
        clip_norm=True,
        ref_level_db=20,
        min_level_db=-100,
        spec_gain=20.0,
        log_func="np.log10",
        preemphasis=0.0,
        # No precomputed mean/variance statistics file.
        stats_path=None,
    )

    config = HifiganConfig(
        run_name="hifigan_ljspeech",
        run_description="HiFi-GAN v1 from scratch, GlowTTS-compatible mels",
        # Data.
        data_path=data_path,
        output_path=output_path,
        eval_split_size=10,
        # Audio.
        audio=audio_config,
        # Training loop.
        epochs=2000,
        batch_size=64,
        eval_batch_size=16,
        num_loader_workers=4,
        num_eval_loader_workers=2,
        run_eval=True,
        test_delay_epochs=5,
        mixed_precision=True,
        # Training segments.
        seq_len=8192,
        pad_short=2000,
        use_noise_augment=True,
        # Optimiser learning rates (generator / discriminator).
        lr_gen=2e-4,
        lr_disc=2e-4,
        # Logging and checkpointing.
        print_step=50,
        print_eval=False,
        save_step=5000,
        save_n_checkpoints=5,
        save_checkpoints=True,
        log_model_step=10000,
        plot_step=500,
    )

    # AudioProcessor takes its settings as individual keyword arguments (as
    # in the upstream Coqui recipes), so the audio config is unpacked here
    # rather than passed as a single ``config=`` keyword, which would leave
    # the processor without the sample rate / STFT settings defined above.
    ap = AudioProcessor(**config.audio.to_dict())

    # load_wav_data returns the eval split first, then the training split.
    eval_samples, train_samples = load_wav_data(
        config.data_path,
        config.eval_split_size,
    )

    model = GAN(config)

    trainer = Trainer(
        TrainerArgs(),
        config,
        output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
    )

    trainer.fit()
|
|
|
|
# Script entry point: start training only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()