{
  "encoder_name": "WavLM",
  "encoder_config": {
    "hidden_dims": [512, 512, 512, 512, 512, 512, 512],
    "kernel_sizes": [10, 3, 3, 3, 3, 2, 2],
    "strides": [5, 2, 2, 2, 2, 2, 2],
    "num_layers": 6,
    "dim": 1024,
    "ffn_dim": 4096,
    "num_heads": 16,
    "num_buckets": 320,
    "max_distance": 800,
    "dropout": 0.0,
    "conv_pos": 128,
    "conv_pos_groups": 16
  },
  "compressor_name": "FocalEncoder",
  "compressor_config": {
    "input_dim": 1024,
    "output_dim": 13,
    "hidden_dims": [1024, 512, 256],
    "downscale_factors": [2, 1, 1],
    "focal_window": 7,
    "focal_level": 2,
    "focal_factor": 2,
    "dropout": 0.0,
    "use_post_norm": false,
    "use_layerscale": false,
    "layerscale_init": 0.0001,
    "normalize_modulator": false
  },
  "quantizer_name": "BinarySphericalQuantizer",
  "quantizer_config": {
    "codebook_size": 8192
  },
  "decompressor_name": "FocalDecoder",
  "decompressor_config": {
    "input_dim": 13,
    "output_dim": 1024,
    "hidden_dims": [256, 512, 1024],
    "upscale_factors": [1, 1, 2],
    "focal_window": 7,
    "focal_level": 2,
    "focal_factor": 2,
    "dropout": 0.0,
    "use_post_norm": false,
    "use_layerscale": false,
    "layerscale_init": 0.0001,
    "normalize_modulator": false
  },
  "decoder_name": "Vocos",
  "decoder_config": {
    "input_channels": 1024,
    "num_layers": 8,
    "dim": 512,
    "ffn_dim": 1536,
    "kernel_size": 7,
    "padding": 3,
    "layerscale_init": null,
    "n_fft": 1024,
    "hop_length": 98
  }
}