{
  "model_type": "autoencoder",
  "sample_size": 196608,
  "sample_rate": 44100,
  "audio_channels": 2,
  "model": {
    "pretransform": {
      "type": "patched",
      "config": {
        "patch_size": 256,
        "channels": 2
      }
    },
    "encoder": {
      "type": "same",
      "requires_grad": false,
      "config": {
        "in_channels": 512,
        "channels": 256,
        "c_mults": [6],
        "strides": [16],
        "latent_dim": 256,
        "transformer_depths": [12],
        "checkpointing": true,
        "differential": true,
        "dyt": true,
        "dim_heads": 64,
        "sliding_window": [1,1],
        "variable_stride": true,
        "mask_noise": 0.001
      }
    },
    "decoder": {
      "type": "same",
      "requires_grad": false,
      "config": {
        "out_channels": 512,
        "channels": 256,
        "c_mults": [6],
        "strides": [16],
        "latent_dim": 256,
        "transformer_depths": [12],
        "sinusoidal_blocks": [8],
        "checkpointing": false,
        "differential": true,
        "dyt": true,
        "dim_heads": 64,
        "sliding_window": [1,1],
        "variable_stride": true,
        "mask_noise": 0.1
      }
    },
    "bottleneck": {
      "type": "softnorm",
      "config": {
        "dim": 256,
        "noise_augment_dim": 0,
        "noise_regularize": true,
        "auto_scale": true,
        "freeze": true
      }
    },
    "latent_dim": 256,
    "downsampling_ratio": 4096,
    "io_channels": 2
  }
}