{
  "model_type": "autoencoder",
  "sample_size": 24576,
  "sample_rate": 44100,
  "audio_channels": 2,
  "model": {
    "pretransform": {
      "type": "patched",
      "config": {
        "patch_size": 256,
        "channels": 2
      }
    },
    "encoder": {
      "type": "same",
      "requires_grad": false,
      "config": {
        "in_channels": 512,
        "channels": 128,
        "c_mults": [6],
        "strides": [16],
        "latent_dim": 256,
        "transformer_depths": [6],
        "checkpointing": false,
        "differential": true,
        "dyt": true,
        "dim_heads": 64,
        "variable_stride": true,
        "chunk_size": 32,
        "chunk_midpoint_shift": true,
        "mask_noise": 0.0
      }
    },
    "decoder": {
      "type": "same",
      "requires_grad": false,
      "config": {
        "out_channels": 512,
        "channels": 128,
        "c_mults": [6],
        "strides": [16],
        "latent_dim": 256,
        "transformer_depths": [6],
        "sinusoidal_blocks": [0],
        "checkpointing": false,
        "differential": true,
        "dyt": true,
        "dim_heads": 64,
        "variable_stride": true,
        "chunk_size": 32,
        "chunk_midpoint_shift": true,
        "conv_mapping": true,
        "mask_noise": 0.01
      }
    },
    "bottleneck": {
      "type": "softnorm",
      "config": {
        "dim": 256,
        "noise_augment_dim": 0,
        "noise_regularize": true,
        "auto_scale": true,
        "freeze": true
      }
    },
    "latent_dim": 256,
    "downsampling_ratio": 4096,
    "io_channels": 2
  }
}