stable-audio-3-small-sfx-base / model_config.json
mattricesound's picture
Initial commit
1fd03d2
{
"model_type": "diffusion_cond_inpaint",
"sample_size": 5324800,
"sample_rate": 44100,
"audio_channels": 2,
"model": {
"pretransform": {
"type": "autoencoder",
"iterate_batch": false,
"chunked": true,
"enable_grad": false,
"scale": 1.0,
"config": {
"pretransform": {
"type": "patched",
"enable_grad": false,
"config": {
"patch_size": 256,
"channels": 2
}
},
"encoder": {
"type": "taae_v2",
"requires_grad": false,
"config": {
"in_channels": 512,
"channels": 128,
"c_mults": [
6
],
"strides": [
16
],
"latent_dim": 256,
"transformer_depths": [
6
],
"checkpointing": false,
"differential": true,
"mapping_style": "none",
"dim_heads": 64,
"variable_stride": true,
"use_flash": true,
"dyt": true,
"chunk_size": 32,
"chunk_midpoint_shift": true,
"conv_mapping": false,
"mask_noise": 0.0
}
},
"decoder": {
"type": "taae_v2",
"requires_grad": false,
"config": {
"out_channels": 512,
"channels": 128,
"c_mults": [
6
],
"strides": [
16
],
"latent_dim": 256,
"transformer_depths": [
6
],
"sinusoidal_blocks": [
0
],
"checkpointing": false,
"differential": true,
"mapping_style": "none",
"dim_heads": 64,
"variable_stride": true,
"use_flash": true,
"dyt": true,
"chunk_size": 32,
"conv_mapping": true,
"chunk_midpoint_shift": true,
"freeze_backbone": false,
"mask_noise": 0.01
}
},
"bottleneck": {
"type": "softnorm",
"requires_grad": false,
"config": {
"dim": 256,
"noise_augment_dim": 0,
"noise_regularize": true,
"auto_scale": true,
"freeze": true
}
},
"latent_dim": 256,
"downsampling_ratio": 4096,
"io_channels": 2
}
},
"conditioning": {
"configs": [
{
"id": "prompt",
"type": "t5gemma",
"config": {
"max_length": 256,
"padding_mode": "learned",
"repo_id": "stabilityai/stable-audio-3-small-sfx",
"subfolder": "t5gemma-b-b-ul2"
}
},
{
"id": "seconds_total",
"type": "number",
"config": {
"min_val": 0,
"max_val": 384,
"fourier_features_type": "expo"
}
}
],
"cond_dim": 768
},
"diffusion": {
"cross_attention_cond_ids": [
"prompt",
"seconds_total"
],
"global_cond_ids": [
"seconds_total"
],
"local_add_cond_ids": [
"inpaint_mask",
"inpaint_masked_input"
],
"type": "dit",
"diffusion_objective": "rectified_flow",
"use_effective_length_for_schedule": true,
"mask_padding_attention": true,
"distribution_shift_options": {
"type": "full",
"min_length": 256,
"max_length": 4096
},
"config": {
"io_channels": 256,
"embed_dim": 1024,
"depth": 20,
"num_heads": 16,
"cond_token_dim": 768,
"global_cond_dim": 768,
"local_add_cond_dim": 257,
"global_cond_type": "adaLN",
"timestep_features_type": "expo",
"timestep_features_logsnr": false,
"attn_kwargs": {
"qk_norm": "rms",
"differential": false
},
"norm_type": "rms_norm",
"norm_kwargs": {
"force_fp32": true
},
"ff_kwargs": {
"mult": 4.0
},
"num_memory_tokens": 64
}
},
"io_channels": 256
},
"training": {
"use_ema": true,
"log_loss_info": false,
"pre_encoded": true,
"timestep_sampler": "trunc_logit_normal",
"mask_loss_weight": 1.0,
"log_every_n_steps": 100,
"silence_extension_scale_seconds": 4.0,
"ot_coupling": true,
"inpainting": {
"mask_kwargs": {
"mask_type_probabilities": [
0.2,
0.6,
0.2
]
}
},
"optimizer_configs": {
"diffusion": {
"optimizer": {
"type": "MuonAdamW",
"config": {
"muon_lr": 0.001,
"muon_momentum": 0.95,
"fused_layer_patterns": [
"*.to_qkv.*",
"*.to_kv.*",
"*.to_q.*",
"*.ff.*.proj.*"
],
"adam_lr": 5e-05,
"adam_betas": [
0.9,
0.95
],
"adam_weight_decay": 0.01
}
},
"scheduler": {
"type": "InverseLR",
"config": {
"inv_gamma": 1000000,
"power": 0.5,
"warmup": 0.995
}
}
}
},
"demo": {
"demo_every": 500,
"demo_steps": 50,
"demo_cfg_scales": [
2,
4,
7
],
"demo_cond": [
{
"prompt": "A beautiful piano arpeggio grows into a grand cinematic climax",
"seconds_total": 119
},
{
"prompt": "Elegant and sophisticated Latin jazz piece with a Cuban base and a whispered melodic female voice",
"seconds_total": 100
},
{
"prompt": "Amen break 174 BPM",
"seconds_total": 9
},
{
"prompt": "lofi house loop",
"seconds_total": 35
}
],
"inpaint_demos": {
"num_random_spans": 2,
"num_causal": 1
},
"log_snr_sampling": true
}
}
}