stable-audio-3-mirrors / medium /model_config.json
AEmotionStudio's picture
Mirror stabilityai/stable-audio-3-medium → medium/
71462fd verified
{
"model_type": "diffusion_cond_inpaint",
"sample_size": 16777216,
"sample_rate": 44100,
"audio_channels": 2,
"model": {
"pretransform": {
"type": "autoencoder",
"iterate_batch": false,
"chunked": true,
"config": {
"pretransform": {
"type": "patched",
"config": {
"patch_size": 256,
"channels": 2
}
},
"encoder": {
"type": "taae_v2",
"requires_grad": false,
"config": {
"in_channels": 512,
"channels": 256,
"c_mults": [
6
],
"strides": [
16
],
"latent_dim": 256,
"transformer_depths": [
12
],
"use_snake": false,
"use_dilated_conv": false,
"checkpointing": true,
"conformer": false,
"layer_scale": false,
"differential": true,
"conv_bias": false,
"mapping_style": "none",
"dim_heads": 64,
"enable_inner_layer_dropout": false,
"sliding_window": [
1,
1
],
"variable_stride": true,
"use_flash": true,
"mask_noise": 0.001
}
},
"decoder": {
"type": "taae_v2",
"requires_grad": false,
"config": {
"out_channels": 512,
"channels": 256,
"c_mults": [
6
],
"strides": [
16
],
"latent_dim": 256,
"transformer_depths": [
12
],
"sinusoidal_blocks": [
8
],
"use_snake": false,
"use_dilated_conv": false,
"checkpointing": false,
"conformer": false,
"layer_scale": false,
"differential": true,
"conv_bias": false,
"mapping_style": "none",
"dim_heads": 64,
"enable_inner_layer_dropout": false,
"sliding_window": [
1,
1
],
"variable_stride": true,
"use_flash": true,
"mask_noise": 0.1
}
},
"bottleneck": {
"type": "softnorm",
"config": {
"dim": 256,
"noise_augment_dim": 0,
"noise_regularize": true,
"auto_scale": true
}
},
"latent_dim": 256,
"downsampling_ratio": 4096,
"io_channels": 2
}
},
"conditioning": {
"configs": [
{
"id": "prompt",
"type": "t5gemma",
"config": {
"max_length": 256,
"padding_mode": "learned",
"repo_id": "stabilityai/stable-audio-3-medium",
"subfolder": "t5gemma-b-b-ul2"
}
},
{
"id": "seconds_total",
"type": "number",
"config": {
"min_val": 0,
"max_val": 384,
"fourier_features_type": "expo"
}
}
],
"cond_dim": 768
},
"diffusion": {
"cross_attention_cond_ids": [
"prompt",
"seconds_total"
],
"global_cond_ids": [
"seconds_total"
],
"local_add_cond_ids": [
"inpaint_mask",
"inpaint_masked_input"
],
"type": "dit",
"diffusion_objective": "rf_denoiser",
"mask_padding_attention": true,
"use_effective_length_for_schedule": true,
"distribution_shift_options": {
"min_length": 256,
"max_length": 4096
},
"config": {
"io_channels": 256,
"embed_dim": 1536,
"depth": 24,
"num_heads": 24,
"cond_token_dim": 768,
"global_cond_dim": 768,
"local_add_cond_dim": 257,
"global_cond_type": "adaLN",
"timestep_features_type": "expo",
"attn_kwargs": {
"qk_norm": "rms",
"differential": true
},
"norm_type": "rms_norm",
"norm_kwargs": {
"force_fp32": true
},
"ff_kwargs": {
"mult": 4.0
},
"num_memory_tokens": 64
}
},
"io_channels": 256
},
"training": {
"use_ema": true,
"log_loss_info": false,
"pre_encoded": true,
"ot_coupling": true,
"silence_extension_scale_seconds": 4.0,
"timestep_sampler": "trunc_logit_normal",
"mask_loss_weight": 1.0,
"cfg_dropout_prob": 0.1,
"inpainting": {
"mask_kwargs": {
"mask_type_probabilities": [
0.1,
0.8,
0.1
]
}
},
"arc": {
"noise_dist": {
"generator": "trunc_logit_normal",
"discriminator": "logit_normal"
},
"disc_update_interval": 2,
"use_model_as_discriminator": true,
"discriminator_base_ckpt": "/path/to/discriminator/ckpt.pt",
"discriminator": {
"type": "dilated_conv",
"dit_hidden_layer": [
18
],
"weights": {
"generator": 1.0,
"discriminator": 1.0
},
"reset_every": 250,
"loss_type": "relativistic",
"config": {
"hidden_dim": 1024,
"dilations": [
1,
1,
1,
1,
1
]
},
"disc_hinge_loss": false,
"contrastive": true,
"include_grad_penalties": false
}
},
"optimizer_configs": {
"diffusion": {
"optimizer": {
"type": "MuonAdamW",
"config": {
"muon_lr": 1e-05,
"muon_momentum": 0.95,
"adam_lr": 1e-06,
"adam_betas": [
0.9,
0.95
],
"adam_weight_decay": 0.01,
"fused_layer_patterns": [
"*.to_qkv.*",
"*.to_kv.*",
"*.to_q.*",
"*.ff.*.proj.*"
]
}
},
"scheduler": {
"type": "InverseLR",
"config": {
"inv_gamma": 1000000,
"power": 0.5,
"warmup": 0.95
}
}
},
"discriminator": {
"optimizer": {
"type": "MuonAdamW",
"config": {
"muon_lr": 1e-05,
"muon_momentum": 0.95,
"adam_lr": 1e-06,
"adam_betas": [
0.9,
0.95
],
"adam_weight_decay": 0.01,
"fused_layer_patterns": [
"*.to_qkv.*",
"*.to_kv.*",
"*.to_q.*",
"*.ff.*.proj.*"
]
}
},
"scheduler": {
"type": "InverseLR",
"config": {
"inv_gamma": 1000000,
"power": 0.5,
"warmup": 0.9
}
}
}
},
"demo": {
"demo_every": 500,
"demo_steps": 8,
"num_demos": 2,
"demo_cond": [
{
"prompt": "Meditative lo-fi ambient piano jazz, soft acoustic drum kit",
"seconds_total": 190
},
{
"prompt": "A tropical house track with upbeat melodies, a driving bassline, and cheery vibes",
"seconds_total": 180
}
],
"demo_cfg_scales": [
1
]
}
}
}