| feature_extractor: | |
| class_path: model.vocos.feature_extractors.MelSpectrogramFeatures | |
| init_args: | |
| sample_rate: 16000 | |
| n_fft: 512 | |
| n_win: 512 | |
| n_hop: 128 | |
| n_mels: 80 | |
| f_min: 0 | |
| f_max: 8000 | |
| power: 2 | |
| center: true | |
| normalize: false | |
| onesided: true | |
| mel_norm: slaney | |
| mel_scale: slaney | |
| librosa_mel: true | |
| clip_val: 0.00001 | |
| backbone: | |
| class_path: model.vocos.models.VocosBackbone | |
| init_args: | |
| input_channels: 80 | |
| dim: 512 | |
| intermediate_dim: 1536 | |
| num_layers: 8 | |
| layer_scale_init_value: null | |
| adanorm_num_embeddings: null | |
| head: | |
| class_path: model.vocos.heads.ISTFTHead | |
| init_args: | |
| dim: 512 | |
| n_fft: 512 | |
| hop_length: 128 | |
| padding: center | |
| sample_rate: 16000 | |
| initial_learning_rate: 0.0005 | |
| num_warmup_steps: 0 | |
| mel_loss_coeff: 45.0 | |
| mrd_loss_coeff: 0.1 | |
| pretrain_mel_steps: 0 | |
| decay_mel_coeff: false | |
| evaluate_utmos: true | |
| evaluate_pesq: true | |
| evaluate_periodicty: true |