---
# Fine-tuning run configuration.
#
# NOTE(review): this file was reconstructed from a copy in which every line
# was wrapped in table pipes and all indentation had been stripped. The
# parent/child layout below is inferred from the section-header keys (keys
# with no value) and from the fact that `hidden_dim` occurs twice, which
# would be a duplicate key in a flat mapping. Confirm the nesting against
# the code that loads this file before relying on it.

# ---------------------------------------------------------------------------
# Run settings
# ---------------------------------------------------------------------------
log_dir: ./Models/Finetune
save_freq: 1        # presumably counted in epochs -- TODO confirm
log_interval: 10    # presumably counted in training steps -- TODO confirm
device: cuda
epochs: 50
batch_size: 2
max_len: 310        # unit not stated here (frames?) -- TODO confirm
# Note: this checkpoint path lives inside `log_dir`.
pretrained_model: ./Models/Finetune/base_model.pth
load_only_params: false
debug: true

# ---------------------------------------------------------------------------
# Dataset locations (relative paths)
# ---------------------------------------------------------------------------
data_params:
  train_data: ../../Data_Speech/LibriTTS/train.txt
  val_data: ../../Data_Speech/LibriTTS/val.txt
  root_path: ../../Data_Speech/

# ---------------------------------------------------------------------------
# Symbol inventory
# ---------------------------------------------------------------------------
# These strings are runtime data: do not reorder, trim or re-quote them.
symbol:
  pad: "$"
  # Single-quoted so the embedded double quote needs no escaping; the value
  # ends with a space character.
  punctuation: ';:,.!?¡¿—…"«»“” '
  letters: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
  letters_ipa: "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
  # Empty by default; presumably the place to append extra symbols -- confirm.
  extend: ""

# ---------------------------------------------------------------------------
# Audio preprocessing
# ---------------------------------------------------------------------------
preprocess_params:
  sr: 24000
  spect_params:
    n_fft: 2048
    # If these are sample counts at `sr` (TODO confirm), 1200 is 50 ms and
    # 300 is 12.5 ms.
    win_length: 1200
    hop_length: 300

# ---------------------------------------------------------------------------
# Training strategy
# ---------------------------------------------------------------------------
# Each list currently holds a single empty string, i.e. no module is named.
training_strats:
  freeze_modules: ['']
  ignore_modules: ['']

# ---------------------------------------------------------------------------
# Model architecture
# ---------------------------------------------------------------------------
model_params:
  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80
  max_dur: 50
  style_dim: 128
  dropout: 0.2

  # NOTE(review): ASR_params, JDC_params and decoder are placed under
  # model_params on the assumption that the model builder reads them from
  # there. The stripped source cannot prove this; if the loader expects them
  # at top level, dedent these three sections.
  ASR_params:
    input_dim: 80   # equals n_mels above
    hidden_dim: 256
    n_layers: 6
    token_embedding_dim: 512

  JDC_params:
    num_class: 1
    seq_len: 192

  decoder:
    type: hifigan
    # Three kernel sizes, paired with the three dilation lists below.
    resblock_kernel_sizes: [3, 7, 11]
    # Product is 10 * 5 * 3 * 2 = 300, the same as hop_length above;
    # presumably these must stay in sync -- confirm before changing either.
    upsample_rates: [10, 5, 3, 2]
    upsample_initial_channel: 512
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    # Each entry is twice the corresponding upsample rate.
    upsample_kernel_sizes: [20, 10, 6, 4]

# ---------------------------------------------------------------------------
# Loss weights
# ---------------------------------------------------------------------------
# Written with an explicit fractional digit so they read as floats; the
# values are unchanged. Grouping follows the original file.
loss_params:
  lambda_mel: 5.0
  lambda_gen: 1.0

  lambda_mono: 1.0
  lambda_s2s: 1.0

  lambda_F0: 1.0
  lambda_norm: 1.0
  lambda_dur: 1.0
  lambda_ce: 20.0

# ---------------------------------------------------------------------------
# Optimizer
# ---------------------------------------------------------------------------
optimizer_params:
  lr: 0.0001
  # Ten times smaller than `lr`; which parameters it applies to is not
  # visible in this file.
  ft_lr: 0.00001