File size: 7,802 Bytes
f233a16
 
 
 
 
 
 
 
d709234
f233a16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
{
    "model_type": "diffusion_cond_inpaint",
    "sample_size": 16777216,
    "sample_rate": 44100,
    "audio_channels": 2,
    "model": {
        "pretransform": {
            "type": "autoencoder",
            "iterate_batch": false,
            "chunked": true,
            "config": {
                "pretransform": {
                    "type": "patched",
                    "config": {
                        "patch_size": 256,
                        "channels": 2
                    }
                },
                "encoder": {
                    "type": "taae_v2",
                    "requires_grad": false,
                    "config": {
                        "in_channels": 512,
                        "channels": 256,
                        "c_mults": [
                            6
                        ],
                        "strides": [
                            16
                        ],
                        "latent_dim": 256,
                        "transformer_depths": [
                            12
                        ],
                        "use_snake": false,
                        "use_dilated_conv": false,
                        "checkpointing": true,
                        "conformer": false,
                        "layer_scale": false,
                        "differential": true,
                        "conv_bias": false,
                        "mapping_style": "none",
                        "dim_heads": 64,
                        "enable_inner_layer_dropout": false,
                        "sliding_window": [
                            1,
                            1
                        ],
                        "variable_stride": true,
                        "use_flash": true,
                        "mask_noise": 0.001
                    }
                },
                "decoder": {
                    "type": "taae_v2",
                    "requires_grad": false,
                    "config": {
                        "out_channels": 512,
                        "channels": 256,
                        "c_mults": [
                            6
                        ],
                        "strides": [
                            16
                        ],
                        "latent_dim": 256,
                        "transformer_depths": [
                            12
                        ],
                        "sinusoidal_blocks": [
                            8
                        ],
                        "use_snake": false,
                        "use_dilated_conv": false,
                        "checkpointing": false,
                        "conformer": false,
                        "layer_scale": false,
                        "differential": true,
                        "conv_bias": false,
                        "mapping_style": "none",
                        "dim_heads": 64,
                        "enable_inner_layer_dropout": false,
                        "sliding_window": [
                            1,
                            1
                        ],
                        "variable_stride": true,
                        "use_flash": true,
                        "mask_noise": 0.1
                    }
                },
                "bottleneck": {
                    "type": "softnorm",
                    "config": {
                        "dim": 256,
                        "noise_augment_dim": 0,
                        "noise_regularize": true,
                        "auto_scale": true
                    }
                },
                "latent_dim": 256,
                "downsampling_ratio": 4096,
                "io_channels": 2
            }
        },
        "conditioning": {
            "configs": [
                {
                    "id": "prompt",
                    "type": "t5gemma",
                    "config": {
                        "max_length": 256,
                        "padding_mode": "learned",
                        "repo_id": "stabilityai/stable-audio-3-medium",
                        "subfolder": "t5gemma-b-b-ul2"
                    }
                },
                {
                    "id": "seconds_total",
                    "type": "number",
                    "config": {
                        "min_val": 0,
                        "max_val": 384,
                        "fourier_features_type": "expo"
                    }
                }
            ],
            "cond_dim": 768
        },
        "diffusion": {
            "cross_attention_cond_ids": [
                "prompt",
                "seconds_total"
            ],
            "global_cond_ids": [
                "seconds_total"
            ],
            "local_add_cond_ids": [
                "inpaint_mask",
                "inpaint_masked_input"
            ],
            "type": "dit",
            "diffusion_objective": "rectified_flow",
            "mask_padding_attention": true,
            "use_effective_length_for_schedule": true,
            "distribution_shift_options": {
                "min_length": 256,
                "max_length": 4096
            },
            "config": {
                "io_channels": 256,
                "embed_dim": 1536,
                "depth": 24,
                "num_heads": 24,
                "cond_token_dim": 768,
                "global_cond_dim": 768,
                "local_add_cond_dim": 257,
                "global_cond_type": "adaLN",
                "timestep_features_type": "expo",
                "attn_kwargs": {
                    "qk_norm": "rms",
                    "differential": true
                },
                "norm_type": "rms_norm",
                "norm_kwargs": {
                    "force_fp32": true
                },
                "ff_kwargs": {
                    "mult": 4.0
                },
                "num_memory_tokens": 64
            }
        },
        "io_channels": 256
    },
    "training": {
        "use_ema": true,
        "log_loss_info": false,
        "pre_encoded": true,
        "ot_coupling": true,
        "silence_extension_scale_seconds": 4.0,
        "timestep_sampler": "trunc_logit_normal",
        "mask_loss_weight": 1.0,
        "inpainting": {
            "mask_kwargs": {
                "mask_type_probabilities": [
                    0.1,
                    0.8,
                    0.1
                ]
            }
        },
        "optimizer_configs": {
            "diffusion": {
                "optimizer": {
                    "type": "MuonAdamW",
                    "config": {
                        "muon_lr": 0.001,
                        "muon_momentum": 0.95,
                        "adam_lr": 5e-05,
                        "adam_betas": [
                            0.9,
                            0.95
                        ],
                        "adam_weight_decay": 0.01,
                        "fused_layer_patterns": [
                            "*.to_qkv.*",
                            "*.to_kv.*",
                            "*.to_q.*",
                            "*.ff.*.proj.*"
                        ]
                    }
                },
                "scheduler": {
                    "type": "InverseLR",
                    "config": {
                        "inv_gamma": 1000000,
                        "power": 0.5,
                        "warmup": 0.995
                    }
                }
            }
        },
        "demo": {
            "demo_every": 500,
            "demo_steps": 50,
            "num_demos": 4,
            "demo_cfg_scales": [
                2,
                4,
                7
            ]
        }
    }
}