---
# WavTokenizer training configuration (PyTorch Lightning CLI).
# NOTE(review): this file had been mangled — every line was wrapped in
# markdown-table pipes and all indentation was flattened. The nesting below
# is reconstructed from the class_path hierarchy it configures
# (data / model / trainer, with feature_extractor, backbone, and head as
# sub-models of model.init_args). Verify against the consuming code before
# relying on it.

seed_everything: 3407

data:
  class_path: decoder.dataset.VocosDataModule
  init_args:
    train_params:
      filelist_path: ./WavTokenizer/data/train/libritts_train
      sampling_rate: 24000
      num_samples: 72000
      batch_size: 40
      num_workers: 8

    val_params:
      # NOTE(review): "librttts" looks like a typo for "libritts" — confirm
      # which spelling the path on disk actually uses before changing it.
      filelist_path: ./WavTokenizer/data/infer/librttts_val
      sampling_rate: 24000
      num_samples: 72000
      batch_size: 5
      num_workers: 8

model:
  class_path: decoder.experiment.WavTokenizer
  init_args:
    sample_rate: 24000
    initial_learning_rate: 2e-4
    mel_loss_coeff: 45
    mrd_loss_coeff: 1.0
    num_warmup_steps: 0  # apply gradual LR warmup from step 0 (none)
    pretrain_mel_steps: 0  # 0 disables any mel-only pretraining phase

    # Validation-time perceptual metrics.
    evaluate_utmos: true
    evaluate_pesq: true
    # NOTE(review): "periodicty" is misspelled, but this must match the key
    # name the experiment code reads — do not rename it in the config alone.
    evaluate_periodicty: true

    # Resume is disabled; the paths below are placeholders ("xxx.ckpt").
    # NOTE(review): the resume_config/resume_model run name
    # (code16384 / kmeans800) does not match vq_bins/vq_kmeans below
    # (4096 / 200) or the logger save_dir — verify before setting
    # resume: true.
    resume: false
    resume_config: ./WavTokenizer/configs/wavtokenizer_smalldata_frame75_3s_nq1_code16384_dim512_kmeans800_attn.yaml
    resume_model: ./WavTokenizer/result/train/wavtokenizer_smalldata_frame75_3s_nq1_code16384_dim512_kmeans800_attn/xxx.ckpt

    feature_extractor:
      class_path: decoder.feature_extractors.EncodecFeatures
      init_args:
        encodec_model: encodec_24khz
        bandwidths: [6.6, 6.6, 6.6, 6.6]
        train_codebooks: true
        num_quantizers: 1
        # NOTE(review): "dowmsamples" is misspelled but must match the
        # parameter name in the feature-extractor code — keep them in sync.
        dowmsamples: [8, 5, 4, 2]
        vq_bins: 4096
        vq_kmeans: 200

    backbone:
      class_path: decoder.models.VocosBackbone
      init_args:
        input_channels: 512
        dim: 768
        intermediate_dim: 2304
        num_layers: 12
        adanorm_num_embeddings: 4

    head:
      class_path: decoder.heads.ISTFTHead
      init_args:
        dim: 768
        n_fft: 1280
        hop_length: 320
        padding: same

trainer:
  logger:
    class_path: pytorch_lightning.loggers.TensorBoardLogger
    init_args:
      save_dir: ./WavTokenizer/result/train/wavtokenizer_smalldata_frame75_3s_nq1_code4096_dim512_kmeans200_attn/
  callbacks:
    - class_path: pytorch_lightning.callbacks.LearningRateMonitor
    - class_path: pytorch_lightning.callbacks.ModelSummary
      init_args:
        max_depth: 2
    - class_path: pytorch_lightning.callbacks.ModelCheckpoint
      init_args:
        monitor: val_loss
        filename: wavtokenizer_checkpoint_{epoch}_{step}_{val_loss:.4f}
        save_top_k: 10
        save_last: true
    - class_path: decoder.helpers.GradNormCallback

  max_steps: 2000000
  limit_val_batches: 100
  accelerator: gpu
  strategy: ddp
  devices: [0, 1, 2, 3, 4, 5, 6, 7]
  log_every_n_steps: 1000