stable-audio-3-mirrors / small-sfx /model_config.json
AEmotionStudio's picture
Mirror stabilityai/stable-audio-3-small-sfx → small-sfx/
5d369c2 verified
{
"model_type": "diffusion_cond_inpaint",
"sample_size": 5292032,
"sample_rate": 44100,
"audio_channels": 2,
"model": {
"pretransform": {
"type": "autoencoder",
"iterate_batch": false,
"chunked": true,
"enable_grad": false,
"config": {
"pretransform": {
"type": "patched",
"enable_grad": false,
"config": {
"patch_size": 256,
"channels": 2
}
},
"encoder": {
"type": "taae_v2",
"requires_grad": false,
"config": {
"in_channels": 512,
"channels": 128,
"c_mults": [
6
],
"strides": [
16
],
"latent_dim": 256,
"transformer_depths": [
6
],
"checkpointing": false,
"differential": true,
"mapping_style": "none",
"dim_heads": 64,
"variable_stride": true,
"use_flash": true,
"dyt": true,
"chunk_size": 32,
"chunk_midpoint_shift": true,
"conv_mapping": false,
"mask_noise": 0.0
}
},
"decoder": {
"type": "taae_v2",
"requires_grad": false,
"config": {
"out_channels": 512,
"channels": 128,
"c_mults": [
6
],
"strides": [
16
],
"latent_dim": 256,
"transformer_depths": [
6
],
"sinusoidal_blocks": [
0
],
"checkpointing": false,
"differential": true,
"mapping_style": "none",
"dim_heads": 64,
"variable_stride": true,
"use_flash": true,
"dyt": true,
"chunk_size": 32,
"conv_mapping": true,
"chunk_midpoint_shift": true,
"freeze_backbone": false,
"mask_noise": 0.01
}
},
"bottleneck": {
"type": "softnorm",
"requires_grad": false,
"config": {
"dim": 256,
"noise_augment_dim": 0,
"noise_regularize": true,
"auto_scale": true,
"freeze": true
}
},
"latent_dim": 256,
"downsampling_ratio": 4096,
"io_channels": 2
}
},
"conditioning": {
"configs": [
{
"id": "prompt",
"type": "t5gemma",
"config": {
"max_length": 256,
"padding_mode": "learned",
"repo_id": "stabilityai/stable-audio-3-small-sfx",
"subfolder": "t5gemma-b-b-ul2"
}
},
{
"id": "seconds_total",
"type": "number",
"config": {
"min_val": 0,
"max_val": 384,
"fourier_features_type": "expo"
}
}
],
"cond_dim": 768
},
"diffusion": {
"cross_attention_cond_ids": [
"prompt",
"seconds_total"
],
"local_add_cond_ids": [
"inpaint_mask",
"inpaint_masked_input"
],
"global_cond_ids": [
"seconds_total"
],
"type": "dit",
"diffusion_objective": "rf_denoiser",
"use_effective_length_for_schedule": true,
"mask_padding_attention": true,
"distribution_shift_options": {
"type": "full",
"min_length": 256,
"max_length": 4096
},
"config": {
"io_channels": 256,
"embed_dim": 1024,
"depth": 20,
"num_heads": 16,
"cond_token_dim": 768,
"global_cond_dim": 768,
"local_add_cond_dim": 257,
"global_cond_type": "adaLN",
"timestep_features_type": "expo",
"timestep_features_logsnr": false,
"attn_kwargs": {
"qk_norm": "rms",
"differential": false
},
"norm_type": "rms_norm",
"norm_kwargs": {
"force_fp32": true
},
"ff_kwargs": {
"mult": 4.0
},
"num_memory_tokens": 64
}
},
"io_channels": 256
},
"training": {
"use_ema": true,
"log_loss_info": false,
"pre_encoded": true,
"silence_extension_scale_seconds": 4.0,
"cfg_dropout_prob": 0.1,
"mask_loss_weight": 0.2,
"timestep_sampler": "trunc_logit_normal",
"inpainting": {
"mask_kwargs": {
"mask_type_probabilities": [
0.1,
0.8,
0.1
]
}
},
"log_every_n_steps": 100,
"arc": {
"noise_dist": {
"generator": "trunc_logit_normal",
"discriminator": "logit_normal"
},
"use_model_as_discriminator": true,
"discriminator_base_ckpt": "/path/to/discriminator/ckpt.pt",
"disc_update_interval": 2,
"discriminator": {
"type": "dilated_conv",
"freeze_backbone": true,
"dit_hidden_layer": [
14
],
"weights": {
"generator": 1.0,
"discriminator": 1.0
},
"loss_type": "relativistic",
"config": {
"dilations": [
1,
1,
1,
1,
1
],
"hidden_dim": 1024
},
"disc_hinge_loss": false,
"contrastive": true,
"include_grad_penalties": false
}
},
"optimizer_configs": {
"diffusion": {
"optimizer": {
"type": "MuonAdamW",
"config": {
"muon_lr": 1e-05,
"muon_momentum": 0.95,
"fused_layer_patterns": [
"*.to_qkv.*",
"*.to_kv.*",
"*.to_q.*",
"*.ff.*.proj.*"
],
"adam_lr": 1e-06,
"adam_betas": [
0.9,
0.95
],
"adam_weight_decay": 0.01
}
},
"scheduler": {
"type": "InverseLR",
"config": {
"inv_gamma": 1000000,
"power": 0.5,
"warmup": 0.9
}
}
},
"discriminator": {
"optimizer": {
"type": "MuonAdamW",
"config": {
"muon_lr": 1e-05,
"muon_momentum": 0.95,
"fused_layer_patterns": [
"*.to_qkv.*",
"*.to_kv.*",
"*.to_q.*",
"*.ff.*.proj.*"
],
"adam_lr": 1e-06,
"adam_betas": [
0.9,
0.95
],
"adam_weight_decay": 0.01
}
},
"scheduler": {
"type": "InverseLR",
"config": {
"inv_gamma": 1000000,
"power": 0.5,
"warmup": 0.9
}
}
}
},
"demo": {
"demo_every": 1000,
"demo_steps": 8,
"demo_cfg_scales": [
1
],
"demo_cond": [
{
"prompt": "Futuristic laser blast, sharp energy pulse, stereo movement, arcade style",
"seconds_total": 10
},
{
"prompt": "Dog barking next to a waterfall",
"seconds_total": 10
},
{
"prompt": "Sparkling fantasy energy swirl, mystical shimmer, rising magical burst",
"seconds_total": 9
},
{
"prompt": "Running footsteps on pavement, fast pace, urban street environment, energetic motion sound",
"seconds_total": 35
}
]
}
}
}