{ "model": { "extractor_mode": "layer_norm", "extractor_conv_bias": false, "extractor_conv_layer_config": [ [ 512, 10, 5 ], [ 512, 3, 2 ], [ 512, 3, 2 ], [ 512, 3, 2 ], [ 512, 3, 2 ], [ 512, 2, 2 ], [ 512, 2, 2 ] ], "encoder_embed_dim": 768, "encoder_projection_dropout": 0, "encoder_pos_conv_kernel": 95, "encoder_pos_conv_groups": 16, "encoder_pos_conv_depth": 5, "encoder_num_layers": 12, "encoder_num_heads": 12, "encoder_attention_dropout": 0.1, "encoder_ff_interm_features": 3072, "encoder_ff_interm_dropout": 0.0, "encoder_dropout": 0.1, "encoder_layer_norm_first": false, "encoder_layer_drop": 0.0, "encoder_qkv_bias": false, "codebook_size": 256, "codebook_decay": 0.9, "num_codebooks": 8, "ema_start_decay": 0.999, "ema_final_decay": 0.9999, "ema_final_step": 30000, "ema_exclude_layers": [ "pos_conv_embed" ], "freeze_step": 200000, "ema_timescale": 20000, "ema_threshold": 1e-07 } }