{ "model_type": "diffusion_cond_inpaint", "sample_size": 5324800, "sample_rate": 44100, "audio_channels": 2, "model": { "pretransform": { "type": "autoencoder", "iterate_batch": false, "chunked": true, "enable_grad": false, "config": { "pretransform": { "type": "patched", "enable_grad": false, "config": { "patch_size": 256, "channels": 2 } }, "encoder": { "type": "taae_v2", "requires_grad": false, "config": { "in_channels": 512, "channels": 128, "c_mults": [ 6 ], "strides": [ 16 ], "latent_dim": 256, "transformer_depths": [ 6 ], "checkpointing": false, "differential": true, "mapping_style": "none", "dim_heads": 64, "variable_stride": true, "use_flash": true, "dyt": true, "chunk_size": 32, "chunk_midpoint_shift": true, "conv_mapping": false, "mask_noise": 0.0 } }, "decoder": { "type": "taae_v2", "requires_grad": false, "config": { "out_channels": 512, "channels": 128, "c_mults": [ 6 ], "strides": [ 16 ], "latent_dim": 256, "transformer_depths": [ 6 ], "sinusoidal_blocks": [ 0 ], "checkpointing": false, "differential": true, "mapping_style": "none", "dim_heads": 64, "variable_stride": true, "use_flash": true, "dyt": true, "chunk_size": 32, "conv_mapping": true, "chunk_midpoint_shift": true, "freeze_backbone": false, "mask_noise": 0.01 } }, "bottleneck": { "type": "softnorm", "requires_grad": false, "config": { "dim": 256, "noise_augment_dim": 0, "noise_regularize": true, "auto_scale": true, "freeze": true } }, "latent_dim": 256, "downsampling_ratio": 4096, "io_channels": 2 } }, "conditioning": { "configs": [ { "id": "prompt", "type": "t5gemma", "config": { "max_length": 256, "padding_mode": "learned", "repo_id": "stabilityai/stable-audio-3-small-music", "subfolder": "t5gemma-b-b-ul2" } }, { "id": "seconds_total", "type": "number", "config": { "min_val": 0, "max_val": 384, "fourier_features_type": "expo" } } ], "cond_dim": 768 }, "diffusion": { "cross_attention_cond_ids": [ "prompt", "seconds_total" ], "global_cond_ids": [ "seconds_total" ], "local_add_cond_ids": [ "inpaint_mask", "inpaint_masked_input" ], "type": "dit", "diffusion_objective": "rectified_flow", "use_effective_length_for_schedule": true, "mask_padding_attention": true, "distribution_shift_options": { "type": "full", "min_length": 256, "max_length": 4096 }, "config": { "io_channels": 256, "embed_dim": 1024, "depth": 20, "num_heads": 16, "cond_token_dim": 768, "global_cond_dim": 768, "local_add_cond_dim": 257, "global_cond_type": "adaLN", "timestep_features_type": "expo", "timestep_features_logsnr": false, "attn_kwargs": { "qk_norm": "rms", "differential": false }, "norm_type": "rms_norm", "norm_kwargs": { "force_fp32": true }, "ff_kwargs": { "mult": 4.0 }, "num_memory_tokens": 64 } }, "io_channels": 256 }, "training": { "use_ema": true, "log_loss_info": false, "pre_encoded": true, "timestep_sampler": "trunc_logit_normal", "mask_loss_weight": 1.0, "log_every_n_steps": 100, "silence_extension_scale_seconds": 4.0, "ot_coupling": true, "inpainting": { "mask_kwargs": { "mask_type_probabilities": [ 0.2, 0.6, 0.2 ] } }, "optimizer_configs": { "diffusion": { "optimizer": { "type": "MuonAdamW", "config": { "muon_lr": 0.001, "muon_momentum": 0.95, "fused_layer_patterns": [ "*.to_qkv.*", "*.to_kv.*", "*.to_q.*", "*.ff.*.proj.*" ], "adam_lr": 5e-05, "adam_betas": [ 0.9, 0.95 ], "adam_weight_decay": 0.01 } }, "scheduler": { "type": "InverseLR", "config": { "inv_gamma": 1000000, "power": 0.5, "warmup": 0.995 } } } }, "demo": { "demo_every": 500, "demo_steps": 50, "demo_cfg_scales": [ 2, 4, 7 ], "demo_cond": [ { "prompt": "A beautiful piano arpeggio grows into a grand cinematic climax", "seconds_total": 119 }, { "prompt": "Elegant and sophisticated Latin jazz piece with a Cuban base and a whispered melodic female voice", "seconds_total": 100 }, { "prompt": "Amen break 174 BPM", "seconds_total": 9 }, { "prompt": "lofi house loop", "seconds_total": 35 } ], "inpaint_demos": { "num_random_spans": 2, "num_causal": 1 }, "log_snr_sampling": true } } }