{
  "model": {
    "extractor_mode": "layer_norm",
    "extractor_conv_bias": false,
    "extractor_conv_layer_config": [
      [
        512,
        10,
        5
      ],
      [
        512,
        3,
        2
      ],
      [
        512,
        3,
        2
      ],
      [
        512,
        3,
        2
      ],
      [
        512,
        3,
        2
      ],
      [
        512,
        2,
        2
      ],
      [
        512,
        2,
        2
      ]
    ],
    "encoder_embed_dim": 768,
    "encoder_projection_dropout": 0,
    "encoder_pos_conv_kernel": 95,
    "encoder_pos_conv_groups": 16,
    "encoder_pos_conv_depth": 5,
    "encoder_num_layers": 12,
    "encoder_num_heads": 12,
    "encoder_attention_dropout": 0.1,
    "encoder_ff_interm_features": 3072,
    "encoder_ff_interm_dropout": 0.0,
    "encoder_dropout": 0.1,
    "encoder_layer_norm_first": false,
    "encoder_layer_drop": 0.0,
    "encoder_qkv_bias": false,
    "codebook_size": 256,
    "codebook_decay": 0.9,
    "num_codebooks": 8,
    "ema_start_decay": 0.999,
    "ema_final_decay": 0.9999,
    "ema_final_step": 30000,
    "ema_exclude_layers": [
      "pos_conv_embed"
    ],
    "freeze_step": 200000,
    "ema_timescale": 20000,
    "ema_threshold": 1e-07
  }
}
|