---
# Run configuration.
#
# NOTE(review): this file was previously collapsed onto a single physical
# line, which does not parse as YAML. The nesting below is reconstructed from
# key order and from repeated key names (max_len, hidden_dim, n_token and
# dropout each occur more than once, so they cannot all be siblings).
# Confirm the hierarchy against the code that loads this file.

# Top-level run settings.
log_dir: "Models/Output"
save_freq: 5
log_interval: 10
device: "cuda"
epochs: 50
batch_size: 8
max_len: 400
pretrained_model: ""
second_stage_load_pretrained: true
load_only_params: true

# Settings for the auxiliary models, grouped per model.
external_models:
  asr:
    input_dim: 80
    hidden_dim: 256
    n_token: 178
  plbert:
    vocab_size: 178
    hidden_size: 768
    num_attention_heads: 12
    intermediate_size: 2048
    # NOTE(review): placed under plbert because it directly follows the
    # plbert keys; it could instead belong to external_models - confirm.
    dropout: 0.1

data_params:
  train_data: "shethjenil/audiodata"
  root_path: ""
  min_length: 50

preprocess_params:
  sr: 24000
  n_fft: 2048
  win_length: 1200
  hop_length: 300

model_params:
  multispeaker: true
  dim_in: 64
  hidden_dim: 128
  max_conv_dim: 512
  n_layer: 2
  n_mels: 80
  n_token: 178
  max_dur: 50
  style_dim: 128
  dropout: 0.2

  # NOTE(review): decoder, slm and diffusion are assumed to be children of
  # model_params (they follow its keys and reuse the name hidden_dim) -
  # confirm against the loader.
  decoder:
    type: "istftnet"
    hidden_dim: 256
    decoder_out_dim: 256
    asr_res_in: 128
    resblock_kernel_sizes: [3, 3]
    upsample_rates: [10, 6]
    upsample_initial_channel: 256
    # NOTE(review): 4 dilation groups here vs. 2 entries in
    # resblock_kernel_sizes - confirm the extra groups are intentional.
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5]]
    upsample_kernel_sizes: [20, 12]
    gen_istft_n_fft: 20
    gen_istft_hop_size: 5
    disable_complex: true

  slm:
    model: "microsoft/wavlm-base-plus"
    sr: 16000
    hidden: 768
    nlayers: 13
    initial_channel: 64

  diffusion:
    embedding_mask_proba: 0.1
    transformer:
      num_layers: 3
      num_heads: 8
      head_features: 64
      multiplier: 2
    dist:
      sigma_data: 0.2
      estimate_sigma_data: true
      mean: -3.0
      std: 1.0

loss_params:
  lambda_mel: 5.0
  lambda_gen: 1.0
  lambda_slm: 1.0
  lambda_mono: 1.0
  lambda_s2s: 1.0
  lambda_f0: 1.0
  lambda_norm: 1.0
  lambda_dur: 1.0
  lambda_ce: 20.0
  lambda_sty: 1.0
  lambda_diff: 1.0
  # NOTE(review): diff_epoch and joint_epoch are kept under loss_params
  # because they follow the lambda_* keys - confirm they are read from here.
  diff_epoch: 10
  joint_epoch: 30

optimizer_params:
  lr: 0.0001
  bert_lr: 0.00001
  ft_lr: 0.0001

slmadv_params:
  min_len: 400
  max_len: 500
  batch_percentage: 0.5
  iter: 10
  thresh: 5.0
  scale: 0.01
  sig: 1.5