{
  "model_type": "diffusion_cond_inpaint",
  "sample_size": 16777216,
  "sample_rate": 44100,
  "audio_channels": 2,
  "model": {
    "pretransform": {
      "type": "autoencoder",
      "iterate_batch": false,
      "chunked": true,
      "config": {
        "pretransform": {
          "type": "patched",
          "config": {
            "patch_size": 256,
            "channels": 2
          }
        },
        "encoder": {
          "type": "taae_v2",
          "requires_grad": false,
          "config": {
            "in_channels": 512,
            "channels": 256,
            "c_mults": [
              6
            ],
            "strides": [
              16
            ],
            "latent_dim": 256,
            "transformer_depths": [
              12
            ],
            "use_snake": false,
            "use_dilated_conv": false,
            "checkpointing": true,
            "conformer": false,
            "layer_scale": false,
            "differential": true,
            "conv_bias": false,
            "mapping_style": "none",
            "dim_heads": 64,
            "enable_inner_layer_dropout": false,
            "sliding_window": [
              1,
              1
            ],
            "variable_stride": true,
            "use_flash": true,
            "mask_noise": 0.001
          }
        },
        "decoder": {
          "type": "taae_v2",
          "requires_grad": false,
          "config": {
            "out_channels": 512,
            "channels": 256,
            "c_mults": [
              6
            ],
            "strides": [
              16
            ],
            "latent_dim": 256,
            "transformer_depths": [
              12
            ],
            "sinusoidal_blocks": [
              8
            ],
            "use_snake": false,
            "use_dilated_conv": false,
            "checkpointing": false,
            "conformer": false,
            "layer_scale": false,
            "differential": true,
            "conv_bias": false,
            "mapping_style": "none",
            "dim_heads": 64,
            "enable_inner_layer_dropout": false,
            "sliding_window": [
              1,
              1
            ],
            "variable_stride": true,
            "use_flash": true,
            "mask_noise": 0.1
          }
        },
        "bottleneck": {
          "type": "softnorm",
          "config": {
            "dim": 256,
            "noise_augment_dim": 0,
            "noise_regularize": true,
            "auto_scale": true
          }
        },
        "latent_dim": 256,
        "downsampling_ratio": 4096,
        "io_channels": 2
      }
    },
    "conditioning": {
      "configs": [
        {
          "id": "prompt",
          "type": "t5gemma",
          "config": {
            "max_length": 256,
            "padding_mode": "learned",
            "repo_id": "stabilityai/stable-audio-3-medium",
            "subfolder": "t5gemma-b-b-ul2"
          }
        },
        {
          "id": "seconds_total",
          "type": "number",
          "config": {
            "min_val": 0,
            "max_val": 384,
            "fourier_features_type": "expo"
          }
        }
      ],
      "cond_dim": 768
    },
    "diffusion": {
      "cross_attention_cond_ids": [
        "prompt",
        "seconds_total"
      ],
      "global_cond_ids": [
        "seconds_total"
      ],
      "local_add_cond_ids": [
        "inpaint_mask",
        "inpaint_masked_input"
      ],
      "type": "dit",
      "diffusion_objective": "rf_denoiser",
      "mask_padding_attention": true,
      "use_effective_length_for_schedule": true,
      "distribution_shift_options": {
        "min_length": 256,
        "max_length": 4096
      },
      "config": {
        "io_channels": 256,
        "embed_dim": 1536,
        "depth": 24,
        "num_heads": 24,
        "cond_token_dim": 768,
        "global_cond_dim": 768,
        "local_add_cond_dim": 257,
        "global_cond_type": "adaLN",
        "timestep_features_type": "expo",
        "attn_kwargs": {
          "qk_norm": "rms",
          "differential": true
        },
        "norm_type": "rms_norm",
        "norm_kwargs": {
          "force_fp32": true
        },
        "ff_kwargs": {
          "mult": 4.0
        },
        "num_memory_tokens": 64
      }
    },
    "io_channels": 256
  },
  "training": {
    "use_ema": true,
    "log_loss_info": false,
    "pre_encoded": true,
    "ot_coupling": true,
    "silence_extension_scale_seconds": 4.0,
    "timestep_sampler": "trunc_logit_normal",
    "mask_loss_weight": 1.0,
    "cfg_dropout_prob": 0.1,
    "inpainting": {
      "mask_kwargs": {
        "mask_type_probabilities": [
          0.1,
          0.8,
          0.1
        ]
      }
    },
    "arc": {
      "noise_dist": {
        "generator": "trunc_logit_normal",
        "discriminator": "logit_normal"
      },
      "disc_update_interval": 2,
      "use_model_as_discriminator": true,
      "discriminator_base_ckpt": "/path/to/discriminator/ckpt.pt",
      "discriminator": {
        "type": "dilated_conv",
        "dit_hidden_layer": [
          18
        ],
        "weights": {
          "generator": 1.0,
          "discriminator": 1.0
        },
        "reset_every": 250,
        "loss_type": "relativistic",
        "config": {
          "hidden_dim": 1024,
          "dilations": [
            1,
            1,
            1,
            1,
            1
          ]
        },
        "disc_hinge_loss": false,
        "contrastive": true,
        "include_grad_penalties": false
      }
    },
    "optimizer_configs": {
      "diffusion": {
        "optimizer": {
          "type": "MuonAdamW",
          "config": {
            "muon_lr": 1e-05,
            "muon_momentum": 0.95,
            "adam_lr": 1e-06,
            "adam_betas": [
              0.9,
              0.95
            ],
            "adam_weight_decay": 0.01,
            "fused_layer_patterns": [
              "*.to_qkv.*",
              "*.to_kv.*",
              "*.to_q.*",
              "*.ff.*.proj.*"
            ]
          }
        },
        "scheduler": {
          "type": "InverseLR",
          "config": {
            "inv_gamma": 1000000,
            "power": 0.5,
            "warmup": 0.95
          }
        }
      },
      "discriminator": {
        "optimizer": {
          "type": "MuonAdamW",
          "config": {
            "muon_lr": 1e-05,
            "muon_momentum": 0.95,
            "adam_lr": 1e-06,
            "adam_betas": [
              0.9,
              0.95
            ],
            "adam_weight_decay": 0.01,
            "fused_layer_patterns": [
              "*.to_qkv.*",
              "*.to_kv.*",
              "*.to_q.*",
              "*.ff.*.proj.*"
            ]
          }
        },
        "scheduler": {
          "type": "InverseLR",
          "config": {
            "inv_gamma": 1000000,
            "power": 0.5,
            "warmup": 0.9
          }
        }
      }
    },
    "demo": {
      "demo_every": 500,
      "demo_steps": 8,
      "num_demos": 2,
      "demo_cond": [
        {
          "prompt": "Meditative lo-fi ambient piano jazz, soft acoustic drum kit",
          "seconds_total": 190
        },
        {
          "prompt": "A tropical house track with upbeat melodies, a driving bassline, and cheery vibes",
          "seconds_total": 180
        }
      ],
      "demo_cfg_scales": [
        1
      ]
    }
  }
}