---
# Name of the model variant this configuration describes.
model_name: EzAudio-L-Energy

# Backbone network hyper-parameters.
# NOTE(review): the two-space nesting was rebuilt from a flattened copy of
# this file; confirm that every key below really belongs under `model`.
model:
  # Input masking. NOTE(review): `mae` presumably toggles masked-autoencoder
  # style training, tuned by the three keys after it - confirm in the model.
  mae: true
  mae_prob: 0.25
  mask_ratio: [0.25, 1.0]
  mask_span: 10
  # Input/output geometry. in_chans (257) equals 2 * out_chans + 1.
  # NOTE(review): looks like two 128-channel inputs plus a 1-channel mask -
  # confirm against the model code before changing either value.
  img_size: 500
  patch_size: 1
  in_chans: 257
  out_chans: 128
  input_type: '1d'
  # Transformer size. embed_dim / num_heads = 64 (presumably the per-head
  # width - confirm).
  embed_dim: 1024
  depth: 24
  num_heads: 16
  mlp_ratio: 4.0
  # Attention, normalisation and activation choices.
  # `qk_scale` is a real YAML null, not a string.
  qkv_bias: false
  qk_scale: null
  qk_norm: layernorm
  norm_layer: layernorm
  act_layer: geglu
  context_norm: true
  # NOTE(review): presumably enables gradient/activation checkpointing.
  use_checkpoint: true
  # NOTE(review): presumably selects how the diffusion timestep is fused in;
  # the rank/alpha pair appears to parameterise that mode - confirm.
  time_fusion: 'ada_sola_bias'
  ada_lora_rank: 32
  ada_lora_alpha: 32
  cls_dim: null
  # Context input. NOTE(review): context_dim presumably has to match the
  # embedding width of the model named under `text_encoder` - confirm.
  context_dim: 1024
  context_fusion: 'cross'
  context_max_length: null
  # The two `*pe_method` values are the quoted string 'none', not YAML null.
  context_pe_method: 'none'
  pe_method: 'none'
  rope_mode: 'shared'
  use_conv: true
  skip: true
  skip_norm: true

# Settings for the ControlNet conditioning branch.
# NOTE(review): nesting rebuilt from a flattened copy of this file; confirm
# these keys belong under a top-level `controlnet` mapping.
controlnet:
  # NOTE(review): presumably the channel count of the conditioning signal
  # and the widths of the blocks that embed it - confirm.
  cond_in: 1
  cond_blocks: [64, 128]
  # Masking of the condition; the prob/ratio/span triple parallels
  # `mae_prob` / `mask_ratio` / `mask_span` under `model`.
  cond_mask: true
  cond_mask_prob: 0.25
  cond_mask_ratio: [0.25, 0.50]
  cond_mask_span: 10

# Settings for extracting the control signal (type: energy).
# NOTE(review): nesting rebuilt from a flattened copy of this file; confirm
# these keys belong under a top-level `conditioner` mapping.
conditioner:
  condition_type: energy
  # window_size is 8 * hop_size. With autoencoder.sr = 24000, sr / hop_size
  # is 100, i.e. twice autoencoder.latent_sr.
  # NOTE(review): confirm the units (samples?) and how the conditioner output
  # is aligned to the latent rate.
  hop_size: 240
  window_size: 1920
  padding: 'reflect'
  # NOTE(review): presumably a dB floor used for clipping/normalisation.
  min_db: -60
  norm: true

# Audio autoencoder settings.
# NOTE(review): nesting rebuilt from a flattened copy of this file; confirm
# these keys belong under a top-level `autoencoder` mapping.
autoencoder:
  name: stable_vae
  # `dim` (128) equals `model.out_chans`; presumably the latent channel
  # count - confirm before changing either value.
  dim: 128
  # sr / latent_sr = 480. NOTE(review): presumably audio sample rate in Hz
  # and latent frames per second - confirm.
  sr: 24000
  latent_sr: 50
  q_first: true
  # scale 1.0 and shift 0.0 are the identity values for a scale/shift pair.
  # NOTE(review): confirm how the latents are transformed with them.
  scale: 1.0
  shift: 0.0

# Text encoder settings.
# NOTE(review): nesting rebuilt from a flattened copy of this file; confirm
# these keys belong under a top-level `text_encoder` mapping.
text_encoder:
  # NOTE(review): looks like a Hugging Face Hub model id - confirm loader.
  model: google/flan-t5-large
  # NOTE(review): presumably the maximum prompt length in tokens.
  max_length: 100
  # NOTE(review): presumably the probability of dropping the text condition
  # for classifier-free guidance, not a guidance scale - confirm.
  cfg: 0.1

# Diffusion noise-schedule settings.
# NOTE(review): nesting rebuilt from a flattened copy of this file; confirm
# these keys belong under a top-level `diff` mapping. The key names match
# Hugging Face diffusers scheduler arguments - confirm which scheduler class
# consumes them.
diff:
  num_train_timesteps: 1000
  beta_schedule: 'scaled_linear'
  beta_start: 0.00085
  beta_end: 0.012
  prediction_type: 'v_prediction'
  rescale_betas_zero_snr: true
  timestep_spacing: 'trailing'
  clip_sample: false