| |
| lyric_processor: |
| max_dur: 150 |
| min_dur: 30 |
| prompt_len: 10 |
| pad_to_max: true |
|
|
|
|
| # ================ Audio tokenizer ================ # |
| audio_tokenizer_checkpoint: Flow1dVAE1rvq_./ckpt/model_1rvq/model_2_fixed.safetensors |
| audio_tokenizer_frame_rate: 25 |
| audio_tokenizer_code_depth: 1 |
| sample_rate: 48000 |
|
|
| audio_tokenizer_checkpoint_sep: Flow1dVAESeparate_./ckpt/model_septoken/model_2.safetensors |
| audio_tokenizer_frame_rate_sep: 25 |
| audio_tokenizer_code_depth_sep: 2 |
| sample_rate_sep: 48000 |
|
|
| # ================ VAE ================ # |
| vae_config: ./ckpt/vae/stable_audio_1920_vae.json |
| vae_model: ./ckpt/vae/autoencoder_music_1320k.ckpt |
|
|
| # ================ LM ================ # |
| lm: |
| lm_type: Llama |
| dim: 1536 |
| intermediate_size: 8960 |
| num_heads: 12 |
| num_layers: 28 |
| num_layers_sub: 12 |
| code_depth: 3 |
| code_size: 16384 |
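| max_position_embeddings: 8196  # NOTE(review): 8196 is not a power of two (8192 + 4); confirm it is intended and matches the checkpoint |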
| max_position_embeddings_sub: 10000 |
| rope_theta: 100000.0 |
| rope_theta_sub: 500000.0 |
| dropout: 0.0 |
| use_flash_attn_2: true |
| activation: gelu |
| norm_first: true |
| bias_ff: false |
| bias_attn: false |
| causal: true |
| custom: false |
| memory_efficient: true |
| attention_as_float32: false |
| layer_scale: null |
| positional_embedding: sin |
| xpos: false |
| checkpointing: torch |
| weight_init: gaussian |
| depthwise_init: current |
| zero_bias_init: true |
| norm: layer_norm |
| cross_attention: false |
| qk_layer_norm: false |
| qk_layer_norm_cross: false |
| attention_dropout: null |
| kv_repeat: 1 |
|
|
| codebooks_pattern: |
| modeling: delay |
| delay: |
| delays: [ 0, 250, 250 ] |
| flatten_first: 0 |
| empty_initial: 0 |
|
|
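| # ================ Conditioning (classifier-free guidance, attribute dropout, fuser, conditioners) ================ # |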
| classifier_free_guidance: |
| |
| training_dropout: 0.15 |
| inference_coef: 1.5 |
|
|
| attribute_dropout: |
| |
| args: |
| active_on_eval: false |
| text: |
| description: 0.0 |
| type_info: 0.5 |
| audio: |
| prompt_audio: 0.0 |
|
|
|
|
| use_text_training: true |
| fuser: |
| sum: [] |
| prepend: [ description, prompt_audio, type_info ] |
|
|
| conditioners: |
| prompt_audio: |
| model: qt_embedding |
| qt_embedding: |
| code_size: 16384 |
| code_depth: 3 |
| max_len: ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+2} |
| description: |
| model: QwTokenizer |
| QwTokenizer: |
| token_path: third_party/Qwen2-7B |
| max_len: 300 |
| add_token_list: ${load_yaml:conf/vocab.yaml} |
| type_info: |
| model: QwTextTokenizer |
| QwTextTokenizer: |
| token_path: third_party/Qwen2-7B |
| max_len: 50 |
|
|
| offload: |
| audiolm: |
| offload_module: self |
| cpu_mem_gb: 0 |
| pre_copy_step: 1 |
| clean_cache_after_forward: false |
| dtype: torch.float16 |
| offload_layer_dict: |
| transformer: 4 |
| transformer2: 4 |
| ignore_layer_list: [] |
| clean_cache_wrapper: |
| module: self |
| method_name: _sample_next_token |
| diff_mem_gb_thre: 2 |
| debug: false |
|
|
| wav_tokenizer_diffusion: |
| offload_module: self.model.model |
| pre_copy_step: 1 |
| clean_cache_after_forward: false |
| cpu_mem_gb: -1 |
| dtype: null |
| offload_layer_dict: |
| cfm_wrapper: 5 |
| hubert: 4 |
| ignore_layer_list: [] |
| clean_cache_wrapper: |
| module: self.model.model.cfm_wrapper.estimator |
| method_name: forward |
| diff_mem_gb_thre: 1 |
| debug: false |
|
|