---
# Experiment-level settings: two path-like strings followed by three naming
# strings. How each value is consumed is not visible in this file.
metadata_root: "models/FlowSep/metadata-master/processed/dataset_root.json"
log_directory: "models/FlowSep/model_logs_curationed"
exp_group: "lass"
exp_name: "2channel_flow"
project: "FlowSep"
|
|
# Dataset selection. Every split names "audiocaps"; note that `train` is a
# one-element list while `val` and `test` are plain strings.
data:
  train: ["audiocaps"]
  val: "audiocaps"
  test: "audiocaps"
  mix_train: "train"
  class_label_indices: "audiocaps"
  dataloader_add_ons: []
  mix_audio: true
  # NOTE(review): presumably a probability; confirm against the data loader.
  random_empty: 0.0001
|
|
# Training schedule. Validation cadence is expressed in epochs, while the
# checkpoint interval and the overall budget are expressed in steps.
step:
  validation_every_n_epochs: 1
  save_checkpoint_every_n_steps: 100000
  max_steps: 4000000
  save_top_k: 4
|
|
# Audio front-end settings, grouped into waveform, STFT and mel sub-sections.
preprocessing:
  audio:
    # NOTE(review): presumably Hz and seconds respectively; units are not
    # stated in this file. The same sampling_rate value (16000) also appears
    # in the model section.
    sampling_rate: 16000
    max_wav_value: 32768.0
    duration: 10.24
  stft:
    filter_length: 1024
    hop_length: 160
    win_length: 1024
  mel:
    n_mel_channels: 64
    mel_fmin: 0
    mel_fmax: 8000
|
|
# Augmentation settings; the only entry is `mixup`, set to 0.0.
augmentation:
  mixup: 0.0
|
|
# Model definition. Each `target` is a dotted import path and the sibling
# `params` mapping holds its settings (presumably constructor keyword
# arguments; confirm against the instantiation helper).
# NOTE(review): the nesting below was rebuilt from key order and the repeated
# target/params pattern because the source carried no indentation. In
# particular, confirm that `evaluation_params` belongs under `model.params`
# rather than at the top level.
model:
  target: latent_diffusion.models.ddpm_flow.LatentDiffusion
  params:
    base_learning_rate: 5.0e-05
    sampling_rate: 16000
    batchsize: 8
    linear_start: 0.0015
    linear_end: 0.0195
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    unconditional_prob_cfg: 0.1
    parameterization: eps
    first_stage_key: fbank
    latent_t_size: 256
    latent_f_size: 16
    channels: 8
    extra_channels: true
    extra_channel_key: mixed_mel
    monitor: val/loss_simple_ema
    scale_by_std: true
    clap_trainable: false
    # Key spelling is kept exactly as found; renaming it would change what
    # the consumer looks up.
    retrival_num: 0
    use_clap: false
    euler: true

    # Denoising network.
    # NOTE(review): in_channels/out_channels (16) are twice `channels` (8)
    # above; presumably related to `extra_channels: true`. Confirm against
    # the model code.
    unet_config:
      target: latent_diffusion.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 64
        context_dim:
          - 1024
        in_channels: 16
        out_channels: 16
        model_channels: 128
        attention_resolutions:
          - 8
          - 4
          - 2
        num_res_blocks: 2
        channel_mult:
          - 1
          - 2
          - 3
          - 5
        num_head_channels: 32
        use_spatial_transformer: true
        transformer_depth: 1

    # First-stage autoencoder. `embed_dim` and `ddconfig.z_channels` are both
    # 8, the same value as `channels` above.
    first_stage_config:
      base_learning_rate: 4.5e-05
      target: latent_encoder.autoencoder.AutoencoderKL
      params:
        reload_from_ckpt: "vae.ckpt"
        batchsize: 2
        monitor: val/rec_loss
        image_key: fbank
        subband: 1
        embed_dim: 8
        time_shuffle: 1
        lossconfig:
          target: latent_diffusion.modules.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1.0
            disc_weight: 0.5
            disc_in_channels: 1
        ddconfig:
          double_z: true
          z_channels: 8
          resolution: 256
          mel_bins: 64
          downsample_time: false
          in_channels: 1
          out_ch: 1
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0

    # Conditioning encoders, keyed by name. The single entry reads the
    # `caption` key and uses the `crossattn` conditioning key.
    cond_stage_config:
      crossattn_text:
        cond_stage_key: caption
        conditioning_key: crossattn
        target: latent_diffusion.modules.encoders.modules.FlanT5HiddenState
        params:
          emb_num: 1
          input_caption: true

    # Settings used at evaluation/sampling time (placement under
    # `model.params` is inferred; see the review note at the top of this
    # block).
    evaluation_params:
      unconditional_guidance_scale: 1.0
      ddim_sampling_steps: 10
      n_candidates_per_samples: 1