# audio_detokenizer / config.yaml
# mrfakename's picture
# Upload folder using huggingface_hub
# c943555 verified
# Top-level run settings; keys in this file are kept in alphabetical order.
# Notes below are read off the key names and values only -- TODO confirm each
# against the code that loads this file.
accumulate_grad_batches: 1
# Empty-string values ('') here and below look like unset placeholders that
# the user is expected to fill in -- TODO confirm.
base_config: ''
batch_max_tokens: 4000
batch_size: 5
# cfg_*: presumably classifier-free-guidance settings -- TODO confirm.
# NOTE(review): `use_cfg` is set to false further down in this file.
cfg_init: 1.0
cfg_scale: 4.0
cfg_schedule: linear
check_val_every_n_epoch: 10
# NOTE(review): `gradient_clip_val: 1.0` is also set in this file; verify which
# of the two clipping keys the trainer actually reads.
clip_grad_norm: 0.5
data_dir: ''
datamodule_target: ''
debug: false
# NOTE(review): `strategy` is set to `ddp` further down; this stage value may
# be unused in that case -- confirm.
deep_speed_strategy_stage: 2
drop_last: true
endless_ds: false
exp_name: ''
# Dataset filter options.
# NOTE(review): the received copy had no indentation, which made `filter_args`
# null and promoted its children to top-level keys. Nesting is reconstructed
# from the per-level alphabetical key order -- confirm against upstream.
filter_args:
  # Language codes; presumably the languages kept by the filter -- TODO confirm.
  lang:
    - zh
    - en
  max_spk_num: 6
  speech_ratio: 0.6
# Top-level run settings (continued). Notes are read off the key names and
# values only -- TODO confirm against the code that loads this file.
# NOTE(review): `clip_grad_norm: 0.5` is also set in this file; verify which
# of the two clipping keys the trainer actually reads.
gradient_clip_val: 1.0
indexed_ds: true
# Inference-related keys; empty strings look like unset placeholders.
infer: false
infer_exp_name: ''
infer_json_path: ''
inference_ckpt: ''
inference_mode: nonstreaming
initialize_from: ''
kimia_data_state_path: datastates/zeqian_ft.datastate
# Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML
# resolve a bare `1e-4` as the string "1e-4" rather than a float.
learning_rate: 1.0e-4
limit_val_batches: 100
load_opt: false
log_interval: 10
logger_type: tensorboard
# Loss configuration.
# NOTE(review): the received copy had no indentation, which made `loss` null
# and `mel_loss` a top-level key. Nesting is reconstructed from the per-level
# alphabetical key order -- confirm against upstream.
loss:
  mel_loss: l1
# Top-level limits and mel statistics. Notes are read off the key names and
# values only -- TODO confirm against the code that loads this file.
max_epochs: 1000
# -1 values here and below presumably mean "no limit" -- TODO confirm.
max_eval_sentences: -1
max_eval_tokens: -1
max_prompt_ratio: 0.5
max_segment_cnt: 20000
max_sentences: -1
# Duration keys carry no unit in this file; presumably seconds -- TODO confirm.
max_speech_duration: 20
max_tokens: 31250
# NOTE(review): both `max_training_steps` and `max_updates` are set, with
# different values; verify which one bounds training.
max_training_steps: 200000
max_updates: 160000
# Presumably the statistics applied when `normalize_mel` (set further down in
# this file) is true -- TODO confirm.
mel_mean: -4.479605
mel_std: 3.4584913
meta_dir: null
min_prompt_duration: 0.1
min_speech_duration: -1
# Model hyperparameters.
# NOTE(review): the received copy had no indentation, which made every parent
# mapping here null and produced a duplicate `use_chunk_setting` key at the top
# level. Nesting is reconstructed from the per-level alphabetical key order --
# confirm against upstream.
model:
  dit:
    chunk_params:
      hz: 50
      max_chunk: 3.0
      max_chunk_history: 500000
      min_chunk: 0.5
      need_block_shift: true
    depth: 10
    # Spelling kept verbatim because the consumer presumably matches on this
    # string. NOTE(review): possibly intended as `gelu_tanh` -- check the
    # loader before changing it.
    ffn_act_layer: gleu_tanh
    ffn_conv_kernel_size: 5
    ffn_gated_glu: false
    ffn_type: vanilla_mlp
    hidden_size: 2048
    input_size: 80
    max_seq_len: 4096
    mlp_ratio: 4.0
    num_heads: 16
    position_embedding_type: skip
    prompt_cfg_dropout: 0.2
    rope_params:
      max_position_embeddings: 4096
      rope_base: 10000.0
      rope_interpolation_factor: 1.0
    # NOTE(review): key order alone would also allow the next four keys to sit
    # under `rope_params`; they are placed under `dit` based on their names --
    # confirm.
    semantic_cfg_dropout: 0.15
    semantic_vocab_size: 8192
    use_chunk_setting: true
    use_rope: true
  position_id_start_from: 0
  random_position_start: true
  restart_position_ids: false
  upsample_args:
    rate: 1.0
# Top-level run settings (continued). Notes are read off the key names and
# values only -- TODO confirm against the code that loads this file.
need_merge_same_speaker: true
# NOTE(review): key name looks like a misspelling of `no_overlap`; it is kept
# as-is because the consumer presumably looks it up by this exact name.
no_verlap: true
normalize_mel: true
num_nodes: 4
num_sanity_val_steps: 0
num_workers: 3
ode_steps: 150
# Optimizer settings.
optimizer_adam_beta1: 0.9
optimizer_adam_beta2: 0.98
optimizer_class: adamw
pin_memory: true
precision: bf16-mixed
save_topk: 10
seed: 1234
shuffle: true
sort_by_len: true
# Sample rates carry no unit in this file; presumably Hz -- TODO confirm.
src_sample_rate: 16000
strategy: ddp
tensorboard_dir: ''
test_num: 100
tgt_sample_rate: 24000
timescale: 240000
# NOTE(review): `cfg_init` / `cfg_scale` / `cfg_schedule` are set earlier in
# this file even though `use_cfg` is false here.
use_cfg: false
use_cfg_rescale: false
# NOTE(review): a key with the same name also appears in the `model` section of
# this file; verify which one the consumer reads.
use_chunk_setting: true
use_distributed_sampler: false
val_check_interval: 2000
# Empty strings look like unset placeholders to be filled in by the user.
vocoder_ckpt: ''
vocoder_config_path: ''
wandb_name: ''
warmup_updates: 2000
weight_decay: 0.0001
work_dir: ''