# Training / model configuration.
#
# NOTE(review): this file was reconstructed from a flattened dump in which
# every line carried a leading "| " and all indentation was lost. The nesting
# below is inferred from the stanza breaks and from key names that repeat
# across sections (in_channels, hidden_dim, f0_condition, ...), which cannot
# legally share one mapping. In particular, placing timbre_shifter,
# speech_tokenizer, style_encoder, vocoder, length_regulator, DiT and wavenet
# under model_params is an assumption -- confirm against the code that loads
# this file.

# --- Run / logging ----------------------------------------------------------
log_dir: "./runs/run_dit_mel_seed_uvit_whisper_small_wavenet"
save_freq: 1
log_interval: 10
save_interval: 1000
device: "cuda"

# --- Training schedule and batching -----------------------------------------
epochs: 1000
batch_size: 2
batch_length: 100
max_len: 80

# --- Checkpoints ------------------------------------------------------------
# Empty string for pretrained_model presumably means "no checkpoint to
# resume from" -- verify against the loader.
pretrained_model: ""
pretrained_encoder: "./temp_ckpt.pth"
load_only_params: false

# --- Audio preprocessing ----------------------------------------------------
preprocess_params:
  sr: 22050
  spect_params:
    n_fft: 1024
    win_length: 1024
    hop_length: 256
    n_mels: 80
    fmin: 0
    # Deliberately the string "None", not a YAML null; presumably the
    # consumer converts it -- TODO confirm before changing to `null`.
    fmax: "None"

# --- Model ------------------------------------------------------------------
model_params:
  dit_type: "DiT"
  reg_loss_type: "l1"

  timbre_shifter:
    se_db_path: "./modules/openvoice/checkpoints_v2/converter/se_db.pt"
    ckpt_path: './modules/openvoice/checkpoints_v2/converter'

  speech_tokenizer:
    type: 'whisper'
    name: "openai/whisper-small"

  style_encoder:
    dim: 192
    campplus_path: "campplus_cn_common.bin"

  # The vocoder name encodes 22 kHz / 80 bands / 256x, which lines up with
  # sr: 22050, n_mels: 80 and hop_length: 256 above; presumably these must
  # stay in sync -- verify before changing either side.
  vocoder:
    type: "bigvgan"
    name: "nvidia/bigvgan_v2_22khz_80band_256x"

  length_regulator:
    channels: 512
    is_discrete: false
    in_channels: 768
    content_codebook_size: 2048
    sampling_ratios: [1, 1, 1, 1]
    vector_quantize: false
    n_codebooks: 1
    quantizer_dropout: 0.0
    f0_condition: false
    n_f0_bins: 512

  DiT:
    hidden_dim: 512
    num_heads: 8
    depth: 13
    class_dropout_prob: 0.1
    block_size: 8192
    # Same value as preprocess_params.spect_params.n_mels (80).
    in_channels: 80
    style_condition: true
    final_layer_type: 'wavenet'
    target: 'mel'
    # Same value as length_regulator.channels (512).
    content_dim: 512
    content_codebook_size: 1024
    content_type: 'discrete'
    f0_condition: false
    n_f0_bins: 512
    content_codebooks: 1
    is_causal: false
    long_skip_connection: true
    zero_prompt_speech_token: false
    time_as_token: false
    style_as_token: false
    uvit_skip_connection: true
    add_resblock_in_transformer: false

  wavenet:
    hidden_dim: 512
    num_layers: 8
    kernel_size: 5
    dilation_rate: 1
    p_dropout: 0.2
    style_condition: true

# --- Optimisation / loss weights --------------------------------------------
loss_params:
  base_lr: 0.0001
  lambda_mel: 45
  lambda_kl: 1.0