stable-audio-3-medium-base / model_config.json
mattricesound's picture
Fix chunked/iterate_batch inference defaults
d709234 verified
{
"model_type": "diffusion_cond_inpaint",
"sample_size": 16777216,
"sample_rate": 44100,
"audio_channels": 2,
"model": {
"pretransform": {
"type": "autoencoder",
"iterate_batch": false,
"chunked": true,
"config": {
"pretransform": {
"type": "patched",
"config": {
"patch_size": 256,
"channels": 2
}
},
"encoder": {
"type": "taae_v2",
"requires_grad": false,
"config": {
"in_channels": 512,
"channels": 256,
"c_mults": [
6
],
"strides": [
16
],
"latent_dim": 256,
"transformer_depths": [
12
],
"use_snake": false,
"use_dilated_conv": false,
"checkpointing": true,
"conformer": false,
"layer_scale": false,
"differential": true,
"conv_bias": false,
"mapping_style": "none",
"dim_heads": 64,
"enable_inner_layer_dropout": false,
"sliding_window": [
1,
1
],
"variable_stride": true,
"use_flash": true,
"mask_noise": 0.001
}
},
"decoder": {
"type": "taae_v2",
"requires_grad": false,
"config": {
"out_channels": 512,
"channels": 256,
"c_mults": [
6
],
"strides": [
16
],
"latent_dim": 256,
"transformer_depths": [
12
],
"sinusoidal_blocks": [
8
],
"use_snake": false,
"use_dilated_conv": false,
"checkpointing": false,
"conformer": false,
"layer_scale": false,
"differential": true,
"conv_bias": false,
"mapping_style": "none",
"dim_heads": 64,
"enable_inner_layer_dropout": false,
"sliding_window": [
1,
1
],
"variable_stride": true,
"use_flash": true,
"mask_noise": 0.1
}
},
"bottleneck": {
"type": "softnorm",
"config": {
"dim": 256,
"noise_augment_dim": 0,
"noise_regularize": true,
"auto_scale": true
}
},
"latent_dim": 256,
"downsampling_ratio": 4096,
"io_channels": 2
}
},
"conditioning": {
"configs": [
{
"id": "prompt",
"type": "t5gemma",
"config": {
"max_length": 256,
"padding_mode": "learned",
"repo_id": "stabilityai/stable-audio-3-medium",
"subfolder": "t5gemma-b-b-ul2"
}
},
{
"id": "seconds_total",
"type": "number",
"config": {
"min_val": 0,
"max_val": 384,
"fourier_features_type": "expo"
}
}
],
"cond_dim": 768
},
"diffusion": {
"cross_attention_cond_ids": [
"prompt",
"seconds_total"
],
"global_cond_ids": [
"seconds_total"
],
"local_add_cond_ids": [
"inpaint_mask",
"inpaint_masked_input"
],
"type": "dit",
"diffusion_objective": "rectified_flow",
"mask_padding_attention": true,
"use_effective_length_for_schedule": true,
"distribution_shift_options": {
"min_length": 256,
"max_length": 4096
},
"config": {
"io_channels": 256,
"embed_dim": 1536,
"depth": 24,
"num_heads": 24,
"cond_token_dim": 768,
"global_cond_dim": 768,
"local_add_cond_dim": 257,
"global_cond_type": "adaLN",
"timestep_features_type": "expo",
"attn_kwargs": {
"qk_norm": "rms",
"differential": true
},
"norm_type": "rms_norm",
"norm_kwargs": {
"force_fp32": true
},
"ff_kwargs": {
"mult": 4.0
},
"num_memory_tokens": 64
}
},
"io_channels": 256
},
"training": {
"use_ema": true,
"log_loss_info": false,
"pre_encoded": true,
"ot_coupling": true,
"silence_extension_scale_seconds": 4.0,
"timestep_sampler": "trunc_logit_normal",
"mask_loss_weight": 1.0,
"inpainting": {
"mask_kwargs": {
"mask_type_probabilities": [
0.1,
0.8,
0.1
]
}
},
"optimizer_configs": {
"diffusion": {
"optimizer": {
"type": "MuonAdamW",
"config": {
"muon_lr": 0.001,
"muon_momentum": 0.95,
"adam_lr": 5e-05,
"adam_betas": [
0.9,
0.95
],
"adam_weight_decay": 0.01,
"fused_layer_patterns": [
"*.to_qkv.*",
"*.to_kv.*",
"*.to_q.*",
"*.ff.*.proj.*"
]
}
},
"scheduler": {
"type": "InverseLR",
"config": {
"inv_gamma": 1000000,
"power": 0.5,
"warmup": 0.995
}
}
}
},
"demo": {
"demo_every": 500,
"demo_steps": 50,
"num_demos": 4,
"demo_cfg_scales": [
2,
4,
7
]
}
}
}