| accumulate_grad_batches: 1 | |
| base_config: '' | |
| batch_max_tokens: 4000 | |
| batch_size: 5 | |
| cfg_init: 1.0 | |
| cfg_scale: 4.0 | |
| cfg_schedule: linear | |
| check_val_every_n_epoch: 10 | |
| clip_grad_norm: 0.5 | |
| data_dir: '' | |
| datamodule_target: '' | |
| debug: false | |
| deep_speed_strategy_stage: 2 | |
| drop_last: true | |
| endless_ds: false | |
| exp_name: '' | |
| filter_args: | |
| lang: | |
| - zh | |
| - en | |
| max_spk_num: 6 | |
| speech_ratio: 0.6 | |
| gradient_clip_val: 1.0 | |
| indexed_ds: true | |
| infer: false | |
| infer_exp_name: '' | |
| infer_json_path: '' | |
| inference_ckpt: '' | |
| inference_mode: nonstreaming | |
| initialize_from: '' | |
| kimia_data_state_path: datastates/zeqian_ft.datastate | |
| learning_rate: 1e-4 | |
| limit_val_batches: 100 | |
| load_opt: false | |
| log_interval: 10 | |
| logger_type: tensorboard | |
| loss: | |
| mel_loss: l1 | |
| max_epochs: 1000 | |
| max_eval_sentences: -1 | |
| max_eval_tokens: -1 | |
| max_prompt_ratio: 0.5 | |
| max_segment_cnt: 20000 | |
| max_sentences: -1 | |
| max_speech_duration: 20 | |
| max_tokens: 31250 | |
| max_training_steps: 200000 | |
| max_updates: 160000 | |
| mel_mean: -4.479605 | |
| mel_std: 3.4584913 | |
| meta_dir: null | |
| min_prompt_duration: 0.1 | |
| min_speech_duration: -1 | |
| model: | |
| dit: | |
| chunk_params: | |
| hz: 50 | |
| max_chunk: 3.0 | |
| max_chunk_history: 500000 | |
| min_chunk: 0.5 | |
| need_block_shift: true | |
| depth: 10 | |
| ffn_act_layer: gleu_tanh | |
| ffn_conv_kernel_size: 5 | |
| ffn_gated_glu: false | |
| ffn_type: vanilla_mlp | |
| hidden_size: 2048 | |
| input_size: 80 | |
| max_seq_len: 4096 | |
| mlp_ratio: 4.0 | |
| num_heads: 16 | |
| position_embedding_type: skip | |
| prompt_cfg_dropout: 0.2 | |
| rope_params: | |
| max_position_embeddings: 4096 | |
| rope_base: 10000.0 | |
| rope_interpolation_factor: 1.0 | |
| semantic_cfg_dropout: 0.15 | |
| semantic_vocab_size: 8192 | |
| use_chunk_setting: true | |
| use_rope: true | |
| position_id_start_from: 0 | |
| random_position_start: true | |
| restart_position_ids: false | |
| upsample_args: | |
| rate: 1.0 | |
| need_merge_same_speaker: true | |
| no_verlap: true | |
| normalize_mel: true | |
| num_nodes: 4 | |
| num_sanity_val_steps: 0 | |
| num_workers: 3 | |
| ode_steps: 150 | |
| optimizer_adam_beta1: 0.9 | |
| optimizer_adam_beta2: 0.98 | |
| optimizer_class: adamw | |
| pin_memory: true | |
| precision: bf16-mixed | |
| save_topk: 10 | |
| seed: 1234 | |
| shuffle: true | |
| sort_by_len: true | |
| src_sample_rate: 16000 | |
| strategy: ddp | |
| tensorboard_dir: '' | |
| test_num: 100 | |
| tgt_sample_rate: 24000 | |
| timescale: 240000 | |
| use_cfg: false | |
| use_cfg_rescale: false | |
| use_chunk_setting: true | |
| use_distributed_sampler: false | |
| val_check_interval: 2000 | |
| vocoder_ckpt: '' | |
| vocoder_config_path: '' | |
| wandb_name: '' | |
| warmup_updates: 2000 | |
| weight_decay: 0.0001 | |
| work_dir: '' | |