{
  "model_type": "diffusion_cond_inpaint",
  "sample_size": 16777216,
  "sample_rate": 44100,
  "audio_channels": 2,
  "model": {
    "pretransform": {
      "type": "autoencoder",
      "iterate_batch": false,
      "chunked": true,
      "config": {
        "pretransform": {
          "type": "patched",
          "config": {
            "patch_size": 256,
            "channels": 2
          }
        },
        "encoder": {
          "type": "taae_v2",
          "requires_grad": false,
          "config": {
            "in_channels": 512,
            "channels": 256,
            "c_mults": [6],
            "strides": [16],
            "latent_dim": 256,
            "transformer_depths": [12],
            "use_snake": false,
            "use_dilated_conv": false,
            "checkpointing": true,
            "conformer": false,
            "layer_scale": false,
            "differential": true,
            "conv_bias": false,
            "mapping_style": "none",
            "dim_heads": 64,
            "enable_inner_layer_dropout": false,
            "sliding_window": [1, 1],
            "variable_stride": true,
            "use_flash": true,
            "mask_noise": 0.001
          }
        },
        "decoder": {
          "type": "taae_v2",
          "requires_grad": false,
          "config": {
            "out_channels": 512,
            "channels": 256,
            "c_mults": [6],
            "strides": [16],
            "latent_dim": 256,
            "transformer_depths": [12],
            "sinusoidal_blocks": [8],
            "use_snake": false,
            "use_dilated_conv": false,
            "checkpointing": false,
            "conformer": false,
            "layer_scale": false,
            "differential": true,
            "conv_bias": false,
            "mapping_style": "none",
            "dim_heads": 64,
            "enable_inner_layer_dropout": false,
            "sliding_window": [1, 1],
            "variable_stride": true,
            "use_flash": true,
            "mask_noise": 0.1
          }
        },
        "bottleneck": {
          "type": "softnorm",
          "config": {
            "dim": 256,
            "noise_augment_dim": 0,
            "noise_regularize": true,
            "auto_scale": true
          }
        },
        "latent_dim": 256,
        "downsampling_ratio": 4096,
        "io_channels": 2
      }
    },
    "conditioning": {
      "configs": [
        {
          "id": "prompt",
          "type": "t5gemma",
          "config": {
            "max_length": 256,
            "padding_mode": "learned",
            "repo_id": "stabilityai/stable-audio-3-medium",
            "subfolder": "t5gemma-b-b-ul2"
          }
        },
        {
          "id": "seconds_total",
          "type": "number",
          "config": {
            "min_val": 0,
            "max_val": 384,
            "fourier_features_type": "expo"
          }
        }
      ],
      "cond_dim": 768
    },
    "diffusion": {
      "cross_attention_cond_ids": ["prompt", "seconds_total"],
      "global_cond_ids": ["seconds_total"],
      "local_add_cond_ids": ["inpaint_mask", "inpaint_masked_input"],
      "type": "dit",
      "diffusion_objective": "rf_denoiser",
      "mask_padding_attention": true,
      "use_effective_length_for_schedule": true,
      "distribution_shift_options": {
        "min_length": 256,
        "max_length": 4096
      },
      "config": {
        "io_channels": 256,
        "embed_dim": 1536,
        "depth": 24,
        "num_heads": 24,
        "cond_token_dim": 768,
        "global_cond_dim": 768,
        "local_add_cond_dim": 257,
        "global_cond_type": "adaLN",
        "timestep_features_type": "expo",
        "attn_kwargs": {
          "qk_norm": "rms",
          "differential": true
        },
        "norm_type": "rms_norm",
        "norm_kwargs": {
          "force_fp32": true
        },
        "ff_kwargs": {
          "mult": 4.0
        },
        "num_memory_tokens": 64
      }
    },
    "io_channels": 256
  },
  "training": {
    "use_ema": true,
    "log_loss_info": false,
    "pre_encoded": true,
    "ot_coupling": true,
    "silence_extension_scale_seconds": 4.0,
    "timestep_sampler": "trunc_logit_normal",
    "mask_loss_weight": 1.0,
    "cfg_dropout_prob": 0.1,
    "inpainting": {
      "mask_kwargs": {
        "mask_type_probabilities": [0.1, 0.8, 0.1]
      }
    },
    "arc": {
      "noise_dist": {
        "generator": "trunc_logit_normal",
        "discriminator": "logit_normal"
      },
      "disc_update_interval": 2,
      "use_model_as_discriminator": true,
      "discriminator_base_ckpt": "/path/to/discriminator/ckpt.pt",
      "discriminator": {
        "type": "dilated_conv",
        "dit_hidden_layer": [18],
        "weights": {
          "generator": 1.0,
          "discriminator": 1.0
        },
        "reset_every": 250,
        "loss_type": "relativistic",
        "config": {
          "hidden_dim": 1024,
          "dilations": [1, 1, 1, 1, 1]
        },
        "disc_hinge_loss": false,
        "contrastive": true,
        "include_grad_penalties": false
      }
    },
    "optimizer_configs": {
      "diffusion": {
        "optimizer": {
          "type": "MuonAdamW",
          "config": {
            "muon_lr": 1e-05,
            "muon_momentum": 0.95,
            "adam_lr": 1e-06,
            "adam_betas": [0.9, 0.95],
            "adam_weight_decay": 0.01,
            "fused_layer_patterns": [
              "*.to_qkv.*",
              "*.to_kv.*",
              "*.to_q.*",
              "*.ff.*.proj.*"
            ]
          }
        },
        "scheduler": {
          "type": "InverseLR",
          "config": {
            "inv_gamma": 1000000,
            "power": 0.5,
            "warmup": 0.95
          }
        }
      },
      "discriminator": {
        "optimizer": {
          "type": "MuonAdamW",
          "config": {
            "muon_lr": 1e-05,
            "muon_momentum": 0.95,
            "adam_lr": 1e-06,
            "adam_betas": [0.9, 0.95],
            "adam_weight_decay": 0.01,
            "fused_layer_patterns": [
              "*.to_qkv.*",
              "*.to_kv.*",
              "*.to_q.*",
              "*.ff.*.proj.*"
            ]
          }
        },
        "scheduler": {
          "type": "InverseLR",
          "config": {
            "inv_gamma": 1000000,
            "power": 0.5,
            "warmup": 0.9
          }
        }
      }
    },
    "demo": {
      "demo_every": 500,
      "demo_steps": 8,
      "num_demos": 2,
      "demo_cond": [
        {
          "prompt": "Meditative lo-fi ambient piano jazz, soft acoustic drum kit",
          "seconds_total": 190
        },
        {
          "prompt": "A tropical house track with upbeat melodies, a driving bassline, and cheery vibes",
          "seconds_total": 180
        }
      ],
      "demo_cfg_scales": [1]
    }
  }
}