# Source page: ShandaAI / FlowSep-hive (sound-separation)
# File: FlowSep-hive/config.yaml (3.4 kB)
# Latest commit shown: "Update config.yaml", 7af3360 (verified),
# about 1 month ago; contributor avatar shown on the page: JusperLee

	metadata_root: "models/FlowSep/metadata-master/processed/dataset_root.json"
	log_directory: "models/FlowSep/model_logs_curationed"
	exp_group: "lass"
	exp_name: "2channel_flow"
	project: "FlowSep"

	data:
	train: ["audiocaps"]
	val: "audiocaps"
	test: "audiocaps"
	mix_train: "train"
	class_label_indices: "audiocaps"
	dataloader_add_ons: []
	mix_audio: true
	random_empty: 0.0001

	step:
	validation_every_n_epochs: 1
	save_checkpoint_every_n_steps: 100000
	max_steps: 4000000
	save_top_k: 4

	preprocessing:
	audio:
	sampling_rate: 16000
	max_wav_value: 32768.0
	duration: 10.24
	stft:
	filter_length: 1024
	hop_length: 160
	win_length: 1024
	mel:
	n_mel_channels: 64
	mel_fmin: 0
	mel_fmax: 8000

	augmentation:
	mixup: 0.0

	model:
	target: latent_diffusion.models.ddpm_flow.LatentDiffusion
	params:
	base_learning_rate: 5.0e-05
	sampling_rate: 16000
	batchsize: 8
	linear_start: 0.0015
	linear_end: 0.0195
	num_timesteps_cond: 1
	log_every_t: 200
	timesteps: 1000
	unconditional_prob_cfg: 0.1
	parameterization: eps # [eps, x0, v]
	first_stage_key: fbank
	latent_t_size: 256 # TODO might need to change
	latent_f_size: 16
	channels: 8 # TODO might need to change
	extra_channels: true
	extra_channel_key: mixed_mel
	monitor: val/loss_simple_ema
	scale_by_std: true
	clap_trainable: false
	retrival_num: 0
	use_clap: false
	euler: true
	unet_config:
	target: latent_diffusion.modules.diffusionmodules.openaimodel.UNetModel
	params:
	image_size: 64 # Ignore this parameter
	context_dim:
	- 1024
	in_channels: 16 # The input channel of the UNet model
	out_channels: 16 # TODO might need to change
	model_channels: 128 # TODO might need to change
	attention_resolutions:
	- 8
	- 4
	- 2
	num_res_blocks: 2
	channel_mult:
	- 1
	- 2
	- 3
	- 5
	num_head_channels: 32
	use_spatial_transformer: true
	transformer_depth: 1
	first_stage_config:
	base_learning_rate: 4.5e-05
	target: latent_encoder.autoencoder.AutoencoderKL
	params:
	# reload_from_ckpt: "model_logs/pretrained/vae.ckpt"
	reload_from_ckpt: "vae.ckpt"
	batchsize: 2
	monitor: val/rec_loss
	image_key: fbank
	subband: 1
	embed_dim: 8
	time_shuffle: 1
	lossconfig:
	target: latent_diffusion.modules.losses.LPIPSWithDiscriminator
	params:
	disc_start: 50001
	kl_weight: 1.0
	disc_weight: 0.5
	disc_in_channels: 1
	ddconfig:
	double_z: true
	z_channels: 8
	resolution: 256
	mel_bins: 64
	downsample_time: false
	in_channels: 1
	out_ch: 1
	ch: 128
	ch_mult:
	- 1
	- 2
	- 4
	num_res_blocks: 2
	attn_resolutions: []
	dropout: 0.0
	cond_stage_config:
	crossattn_text:
	cond_stage_key: caption
	conditioning_key: crossattn
	target: latent_diffusion.modules.encoders.modules.FlanT5HiddenState
	params:
	emb_num: 1
	input_caption: true


	evaluation_params:
	unconditional_guidance_scale: 1.0 #
	ddim_sampling_steps: 10
	n_candidates_per_samples: 1