---
# Model / training configuration with four sections:
# audio, model, training, inference.
#
# NOTE(review): several values are repeated across sections
# (n_fft, hop_length, dim_t). Which copy the consumer actually reads is
# not visible from this file - keep them in sync until confirmed.

audio:
  chunk_size: 352800  # equals 8 * sample_rate (8 * 44100)
  dim_f: 1024
  dim_t: 801  # equals chunk_size / hop_length + 1; same as inference.dim_t
  hop_length: 441  # same value as model.stft_hop_length
  n_fft: 2048  # same value as model.stft_n_fft
  num_channels: 2
  sample_rate: 44100
  min_mean_abs: 0.000

model:
  dim: 512
  depth: 12
  stereo: true
  num_stems: 1
  time_transformer_depth: 1
  freq_transformer_depth: 1
  linear_transformer_depth: 0
  # 62 entries summing to 1025, which equals dim_freqs_in below.
  # If either is changed, the other presumably has to change with it -
  # TODO confirm against the model code.
  #
  # `!!python/tuple` is a PyYAML-specific tag: safe loaders
  # (yaml.safe_load) reject it, so this file must be read with a loader
  # that supports Python tags. Kept as-is because dropping the tag would
  # turn the value into a list - verify what the consumer expects before
  # changing it.
  freqs_per_bands: !!python/tuple
    # 24 entries of 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    - 2
    # 12 entries of 4
    - 4
    - 4
    - 4
    - 4
    - 4
    - 4
    - 4
    - 4
    - 4
    - 4
    - 4
    - 4
    # 8 entries of 12
    - 12
    - 12
    - 12
    - 12
    - 12
    - 12
    - 12
    - 12
    # 8 entries of 24
    - 24
    - 24
    - 24
    - 24
    - 24
    - 24
    - 24
    - 24
    # 8 entries of 48
    - 48
    - 48
    - 48
    - 48
    - 48
    - 48
    - 48
    - 48
    # final two entries bring the total to 1025
    - 128
    - 129
  dim_head: 64
  heads: 8
  attn_dropout: 0.1
  ff_dropout: 0.1
  flash_attn: true
  dim_freqs_in: 1025  # equals stft_n_fft / 2 + 1
  stft_n_fft: 2048
  stft_hop_length: 441
  stft_win_length: 2048
  stft_normalized: false
  mask_estimator_depth: 2
  multi_stft_resolution_loss_weight: 1.0
  # Same PyYAML-specific tuple tag as freqs_per_bands (see note there).
  multi_stft_resolutions_window_sizes: !!python/tuple
    - 4096
    - 2048
    - 1024
    - 512
    - 256
  multi_stft_hop_size: 147  # equals stft_hop_length / 3
  multi_stft_normalized: false

training:
  batch_size: 2
  gradient_accumulation_steps: 1
  grad_clip: 0
  instruments:
    - vocals
    - other
  lr: 1.0e-05
  patience: 2
  reduce_factor: 0.95
  target_instrument: vocals  # one of the entries listed in `instruments`
  num_epochs: 1000
  num_steps: 1000
  q: 0.95
  coarse_loss_clip: true
  ema_momentum: 0.999
  optimizer: adam
  other_fix: true
  use_amp: true

inference:
  batch_size: 4
  dim_t: 801  # same value as audio.dim_t
  num_overlap: 2