{
    "model_type": "stft_autoencoder",
    "sample_size": 25344,
    "sample_rate": 16000,
    "audio_channels": 1,
    "model": {
        "encoder": {
            "type": "oobleck",
            "config": {
                "in_channels": 1,
                "latent_dim": 256,
                "n_fft": 512,
                "hop_length": 256,
                "win_length": 512,
                "hidden_channels": 256,
                "n_head": 4,
                "approx_qk_dim": 512,
                "emb_dim": 128,
                "emb_ks": 1,
                "emb_hs": 1,
                "num_layers": 3
            }
        },
        "decoder": {
            "type": "oobleck",
            "config": {
                "out_channels": 1,
                "latent_dim": 128,
                "n_fft": 512,
                "hop_length": 256,
                "win_length": 512,
                "hidden_channels": 256,
                "n_head": 4,
                "approx_qk_dim": 512,
                "emb_dim": 128,
                "emb_ks": 1,
                "emb_hs": 1,
                "num_layers": 3
            }
        },
        "bottleneck": {
            "type": "vae"
        },
        "latent_dim": 128,
        "downsampling_ratio": 256,
        "io_channels": 1
    },
    "training": {
        "learning_rate": 1.5e-4,
        "warmup_steps": 0,
        "use_ema": false,
        "optimizer_configs": {
            "autoencoder": {
                "optimizer": {
                    "type": "AdamW",
                    "config": {
                        "betas": [0.8, 0.99],
                        "lr": 1.5e-4,
                        "weight_decay": 1e-3
                    }
                },
                "scheduler": {
                    "type": "InverseLR",
                    "config": {
                        "inv_gamma": 200000,
                        "power": 0.5,
                        "warmup": 0.999
                    }
                }
            },
            "discriminator": {
                "optimizer": {
                    "type": "AdamW",
                    "config": {
                        "betas": [0.8, 0.99],
                        "lr": 3e-4,
                        "weight_decay": 1e-3
                    }
                },
                "scheduler": {
                    "type": "InverseLR",
                    "config": {
                        "inv_gamma": 200000,
                        "power": 0.5,
                        "warmup": 0.999
                    }
                }
            }
        },
        "loss_configs": {
            "discriminator": {
                "type": "encodec",
                "config": {
                    "filters": 64,
                    "n_ffts": [1280, 640, 320, 160, 80],
                    "hop_lengths": [320, 160, 80, 40, 20],
                    "win_lengths": [1280, 640, 320, 160, 80]
                },
                "weights": {
                    "adversarial": 0.1,
                    "feature_matching": 5.0
                }
            },
            "spectral": {
                "type": "mrstft",
                "config": {
                    "fft_sizes": [1280, 640, 320, 160, 80, 40, 20],
                    "hop_sizes": [320, 160, 80, 40, 20, 10, 5],
                    "win_lengths": [1280, 640, 320, 160, 80, 40, 20],
                    "perceptual_weighting": true
                },
                "weights": {
                    "mrstft": 1.0
                }
            },
            "time": {
                "type": "l1",
                "weights": {
                    "l1": 0.0
                }
            },
            "bottleneck": {
                "type": "kl",
                "weights": {
                    "kl": 1e-4
                }
            }
        },
        "demo": {
            "demo_every": 10000
        }
    }
}