| align_f0: false |
| align_loss_weight: 1.0 |
| asc_loss_weight: 0.02 |
| attention_mechanism: graves |
| augment_sr: false |
| base_model: null |
| bit_depth: 9 |
| causal_convs: false |
| causal_decoder: false |
| clap_dims: 512 |
| compat_dcnar_f0_std_cond: false |
| conv_stack_dilation: !!python/tuple |
| - 1 |
| - 3 |
| - 9 |
| - 27 |
| convbn_bias: false |
| cudnn_deterministic: false |
| dcnar_1d_discrim: false |
| dcnar_aligner_kernel: 5 |
| dcnar_aligner_type: null |
| dcnar_allow_trivial_speaker_table: true |
| dcnar_batch_size: 24 |
| dcnar_conformer: false |
| dcnar_conformer_attn_chunk_size: null |
| dcnar_conformer_attn_dim_head: 64 |
| dcnar_conformer_attn_ff_mult: 4 |
| dcnar_conformer_attn_win_size: null |
| dcnar_conv_weight_grouping: 1 |
| dcnar_df0_loss_weight: 0.5 |
| dcnar_dim_lrg: 512 |
| dcnar_dim_sml: 256 |
| dcnar_dim_style: 32 |
| dcnar_discrim_tanh: false |
| dcnar_dtw_loss_weight: 1 |
| dcnar_dur_loss_weight: 0.1 |
| dcnar_dur_pred_scale: linear |
| dcnar_f0_cond_mel_decoding: false |
| dcnar_f0_cond_mel_decoding_teacher_forcing: true |
| dcnar_f0_loss_weight: 0.5 |
| dcnar_gan_dims: 64 |
| dcnar_global_style: true |
| dcnar_hard_gumbel_tones: false |
| dcnar_hubert_downsample: 1 |
| dcnar_inpaint_vae: false |
| dcnar_inpaint_vae_kld_loss_weight: 0 |
| dcnar_inpaint_vae_latent_dim: 32 |
| dcnar_inpaint_vae_warmup_steps: 5000 |
| dcnar_inpaint_vae_weight_step_size: 0.0002 |
| dcnar_local_f0: false |
| dcnar_local_intensity: false |
| dcnar_local_style: false |
| dcnar_lr: 0.0001 |
| dcnar_mel_adv: false |
| dcnar_mel_loss_weight: 10.0 |
| dcnar_mixed_sr_loss: false |
| dcnar_n_terminal_tones: 0 |
| dcnar_ph_f0_loss_weight: 1.0 |
| dcnar_ph_hubert_loss_weight: 1.0 |
| dcnar_ph_intensity_loss_weight: 1.0 |
| dcnar_pitch_adv: false |
| dcnar_prosody_adv: false |
| dcnar_prosody_stats_cond: false |
| dcnar_pstat_weight_f0_mean: 10 |
| dcnar_pstat_weight_f0_std: 100 |
| dcnar_pstat_weight_intensity_mean: 10 |
| dcnar_pstat_weight_intensity_std: 0 |
| dcnar_pstat_weight_phdur_mean: 1 |
| dcnar_pstat_weight_phdur_std: 1 |
| dcnar_reverb_label: false |
| dcnar_sampler: default |
| dcnar_sr_label: false |
| dcnar_terminal_tone_usl_weight: 0 |
| dcnar_terminal_tone_weight: 0 |
| dcnar_upsampling: gaussian |
| dcnar_use_log_f0_frames: false |
| dcnar_use_toucan_utt_embs: false |
| dcnar_usl_mfcc: false |
| dcnar_usl_mfcc_deltas: false |
| dcnar_usl_mfcc_dim: 12 |
| dcnar_usl_mfcc_var_dec: false |
| dcnar_usl_slim: false |
| dcnar_usl_slim_dim: 16 |
| dcnar_usl_with_f0: false |
| dcnar_utt_dur_loss_weight: 0 |
| dcnar_vc_local_hubert: false |
| dcnar_vc_mode: nn |
| dcnar_vc_text_predict: false |
| dcnar_vuv_loss_weight: 0.5 |
| dcvoc_causal: false |
| dcvoc_causal_lookahead: 3 |
| dcvoc_channel_downsample_mode: interleave |
| dcvoc_convs_per_scale: 8 |
| dcvoc_disc_duplicates: 1 |
| dcvoc_disc_mpwd: true |
| dcvoc_disc_mrsd: false |
| dcvoc_disc_pdd: true |
| dcvoc_disc_phase_aug: false |
| dcvoc_discriminator_bound: 1.01 |
| dcvoc_groups_init: 8 |
| dcvoc_halfres_conv: true |
| dcvoc_hidden_init: 1024 |
| dcvoc_hop: 8 |
| dcvoc_kernel: 7 |
| dcvoc_mel_bneck: 256 |
| dcvoc_smpwd_hidden_max: 1024 |
| dcvoc_smpwd_periods: |
| - 2 |
| - 3 |
| - 5 |
| - 7 |
| - 9 |
| - 11 |
| - 13 |
| dcvoc_upsample_method: linear |
| denoise: false |
| dfd_clip_stft: 1.0e-09 |
| dfd_ramdisk_path: /mnt/ramdisk |
| ema_coeff: 0.99995 |
| emo_embedded_speaker_id: false |
| emotion_adv: false |
| enable_eos_bos_chars: true |
| encoder_type: voice_encoder |
| eval_crosslang: false |
| eval_langs: dataset |
| eval_max_ref_samples: 192 |
| eval_max_repeats: 1 |
| eval_max_runs: 10 |
| eval_max_sentences: 192 |
| eval_mbnet_name: null |
| eval_models_dir: saved_models |
| eval_n_plots: 2 |
| eval_n_wavs: 4 |
| eval_reference: train |
| eval_syn_batch_size: 64 |
| eval_text_source: default |
| eval_ve_name: universal/ve_v2 |
| eval_voc_max_frames: 2000 |
| eval_voc_name: null |
| f0_mode: praat |
| flatten_lstm_params: true |
| fmax: 16000 |
| fmin: 0 |
| frames_per_framegroup: 10 |
| freeze_mel_head: false |
| gmvae_ema_lr: 0.0001 |
| gmvae_latent_dim: 16 |
| gmvae_num_components: 0 |
| gpt_masked_loss: false |
| gpt_prod_max_text: 200 |
| gpt_speaker_ref_type: same_speaker |
| gpt_transformer_type: gpt2-medium |
| hifigan_channels: 256 |
| hooli_enc_dims: 256 |
| hooli_filter_size: 257 |
| hooli_inv_no_uv: false |
| hooli_inv_pitch_diff_reg_weight: 0 |
| hooli_inv_pitch_shift_reg_weight: 0 |
| hooli_nfft: 16 |
| hooli_osc_freq_cutoff: 0.15 |
| hooli_safe_step: true |
| hooli_tv_fir: false |
| hooli_wn_dims: 64 |
| hooligan_discriminators: univnet |
| hooligan_istft: true |
| hop_size: 320 |
| input_pos_emb: handled_internally_by_backbone |
| is_lora: false |
| language_embed_size: 16 |
| legacy_gpt_hidden_size: 1024 |
| lfcc_nfilts: 128 |
| llama_config_name: Llama_520M |
| lora_alpha: 64 |
| lora_dropout: 0.05 |
| lora_r: 32 |
| lossynet_bsize: 25 |
| lossynet_clip_stft: 1.0e-09 |
| lossynet_lr: 0.001 |
| lossynet_n_out_classes: 2 |
| lowest_sr: 8000 |
| max_LR: 0.001 |
| max_conditioning_inputs: 2 |
| max_decoder_frames: 2000 |
| max_f0_freq: 600 |
| max_speech_tokens: 604 |
| max_text_tokens: 402 |
| max_total_tokens: 8196 |
| mel_pad_difference: 1 |
| mel_power: 1.0 |
| mel_type: db |
| min_LR: 1.0e-06 |
| min_f0_freq: 75 |
| mpbert_n_freeze: 0 |
| mpbert_tokenizer: null |
| mpbert_type: transformer |
| mu_law: true |
| n_cqcc_bins: 96 |
| n_cqt_bins: 84 |
| n_fft: 2048 |
| n_gpt_channels: 1024 |
| n_reverbs: 256 |
| n_spk_cond_samples: 2 |
| n_state_per_symbol: 1 |
| n_transformer_heads: 16 |
| n_transformer_layers: 30 |
| normalize_loudness: false |
| normalized_mels: true |
| num_ceps: 29 |
| num_diacritcs: 512 |
| num_freq: 1025 |
| num_heads: 4 |
| num_mels: 256 |
| num_style_tokens: 0 |
| num_tones: 16 |
| onehot_language: false |
| onehot_speaker: false |
| pf_word_boundaries: false |
| phonemizer_backend: espeak |
| preemphasis: 0.97 |
| preemphasize_voc_target: false |
| prenet_type: original |
| project_conditioning: false |
| prosody_embed_size: 0 |
| r_schedule: |
| - - 1 |
| - -1 |
| rvc_emb_channels: 768 |
| rvc_enc_spk_input: false |
| rvc_f0_up: 0 |
| rvc_f0_voc: true |
| rvc_filter_channels: 768 |
| rvc_gin_channels: 256 |
| rvc_hidden_channels: 192 |
| rvc_inter_channels: 192 |
| rvc_kernel_size: 3 |
| rvc_mel_bins: 80 |
| rvc_n_heads: 2 |
| rvc_n_layers: 6 |
| rvc_p_dropout: 0 |
| rvc_resblock: '1' |
| rvc_resblock_dilation_sizes: |
| - - 1 |
| - 3 |
| - 5 |
| - - 1 |
| - 3 |
| - 5 |
| - - 1 |
| - 3 |
| - 5 |
| rvc_resblock_kernel_sizes: |
| - 3 |
| - 7 |
| - 11 |
| rvc_seg_enc_size_frames: 370 |
| rvc_seg_enc_size_samples: 118400 |
| rvc_seg_voc_size_frames: 40 |
| rvc_seg_voc_size_samples: 12800 |
| rvc_speaker_enc: table |
| rvc_speaker_enc_type: V1 |
| rvc_speaker_pitch: null |
| rvc_spec_channels: 513 |
| rvc_spk_embed_dim: 109 |
| rvc_stft_filter_len: 1024 |
| rvc_stft_win_len: 1024 |
| rvc_train_kl_weight: 1.0 |
| rvc_train_mel_weight: 45 |
| rvc_upsample_initial_channel: 512 |
| rvc_upsample_kernel_sizes: |
| - 20 |
| - 16 |
| - 4 |
| - 4 |
| rvc_upsample_rates: |
| - 10 |
| - 8 |
| - 2 |
| - 2 |
| rvc_use_f0: true |
| sample_rate: 32000 |
| scheduler_max_total_steps: 200000 |
| seed: 0 |
| self_conditioning: false |
| separate_stopnet: false |
| singing_dim: 4 |
| speaker_embed_size: 256 |
| speech_cond_prompt_len: 250 |
| speech_token_type: tortoise |
| speech_tokens_dict_size: 6563 |
| speed_scale: 0.1 |
| start_speech_token: 6561 |
| start_text_token: 255 |
| stepwise_sigmoid_noise: 2.0 |
| stft_magnitude_min: 0.0001 |
| stop_speech_token: 6562 |
| stop_text_token: 0 |
| stop_threshold: 0.25 |
| style_embed_size: 256 |
| supports_cfg: false |
| symbol_type: tortoise/data/gpt2_medium.json |
| syn_ar_f0_predict: true |
| syn_batch_frames: 16000 |
| syn_batch_size: 32 |
| syn_mel_scale: 1 |
| syn_predict_f0: true |
| syn_sampler: binnedlength |
| syn_symmetric_mel: false |
| syn_train_max_frames: 700 |
| syn_train_min_duration: 1 |
| taco1_postnet: true |
| taco_decoder_att_rnn_dim: 1024 |
| taco_decoder_prenet_dim: 256 |
| taco_decoder_rnn_dim: 1024 |
| taco_disjoint_conditioning: true |
| taco_encoder_dim: 512 |
| taco_grad_clip: 1 |
| taco_loss_masking: true |
| taco_lr: 0.0001 |
| taco_weight_decay: 1.0e-06 |
| target_loudness: -18 |
| text_loss_weight: 0.1 |
| text_preproc: none |
| text_tokens_dict_size: 50276 |
| ti_vocoder: false |
| toucan_utt_emb_dim: 704 |
| trim_silence: true |
| upsample_factors: !!python/tuple |
| - 5 |
| - 8 |
| - 8 |
| upsample_rate: null |
| upsamplenet_dropout: false |
| upsamplenet_lr: 1.0e-05 |
| use_adv_speaker_classifier: false |
| use_clap_embeds: false |
| use_diacritic: false |
| use_emotion_table: false |
| use_lamb_optimizer: false |
| use_language_table: false |
| use_monotonic_alignment: false |
| use_mpbert: false |
| use_one_cycle_lr: false |
| use_perceiver_resampler: false |
| use_pf: false |
| use_ph_durations: false |
| use_singing_labels: false |
| use_snr_labels: false |
| use_speaker_table: false |
| use_speech_codes_as_input: true |
| use_sv2tts: false |
| use_tb: false |
| use_tone: false |
| use_tpgst: false |
| use_wandb: false |
| vad_algo: webrtc |
| vad_margin: 0.1 |
| validate_sr: true |
| validate_wav_len: true |
| vc_mel2f0: false |
| vc_soft_gt_pitch: false |
| vc_soft_units: true |
| ve_final_relu: false |
| ve_hidden_size: 768 |
| ve_lr: 0.0001 |
| ve_min_samples: 20 |
| ve_partial_frames: 128 |
| ve_spk_batch_size: 128 |
| ve_utt_batch_size: 10 |
| voc_future_horizon: 11 |
| voc_lvc: false |
| voc_lvc_dims: 8 |
| voc_noise_fir: true |
| voc_subscale: 0 |
| voc_train_max_duration: 30 |
| voc_train_min_duration: 1.5 |
| voc_voiced_logits_scale: 0 |
| vocoder_bsize: 16 |
| vocoder_fc_dims: 512 |
| vocoder_hidden_size: 512 |
| vocoder_input_length: 16000 |
| vocoder_input_pad: 0 |
| vocoder_lr: 0.0001 |
| vocoder_mode: MOL |
| wandb_watch_model: false |
| webrtc_mode: 2 |
| weight_init: false |
| win_size: 2048 |
|
|