{
  "model_type": "dolphin",
  "task": "audio_visual_speech_separation",
  "framework": "pytorch",
  "license": "apache-2.0",
  "tags": [
    "audio",
    "speech-separation",
    "audio-visual",
    "pytorch",
    "dolphin"
  ],
  "architectures": [
    "Dolphin"
  ],
  "auto_map": {
    "AutoModel": "dolphin.Dolphin"
  },
  "num_stages": 4,
  "sample_rate": 16000,
  "vpre_channels": 3872,
  "vmid_channels": 512,
  "vin_channels": 64,
  "vout_channels": 64,
  "module_audio_enc": {
    "in_channels": 1,
    "out_channels": 256,
    "kernel_size": 16,
    "stride": 4,
    "groups": 1,
    "bias": false
  },
  "module_feature_projector": {
    "num_channels": 256,
    "in_channels": 256,
    "out_channels": 128,
    "kernel_size": 1,
    "bias": false
  },
  "module_separator": {
    "num_stages": 4,
    "relative_positional_encoding": {
      "in_channels": 128,
      "num_heads": 8,
      "maxlen": 2000,
      "embed_v": false
    },
    "enc_stage": {
      "global_blocks": {
        "in_channels": 128,
        "num_mha_heads": 8,
        "dropout_rate": 0.05
      },
      "local_blocks": {
        "in_channels": 128,
        "kernel_size": 65,
        "dropout_rate": 0.05
      },
      "down_conv_layer": {
        "in_channels": 128,
        "samp_kernel_size": 5
      }
    },
    "simple_fusion": {
      "out_channels": 128
    },
    "dec_stage": {
      "global_blocks": {
        "in_channels": 128,
        "num_mha_heads": 8,
        "dropout_rate": 0.05
      },
      "local_blocks": {
        "in_channels": 128,
        "kernel_size": 65,
        "dropout_rate": 0.05
      },
      "spk_attention": {
        "in_channels": 128,
        "num_mha_heads": 8,
        "dropout_rate": 0.05
      }
    }
  },
  "module_output_layer": {
    "in_channels": 256,
    "out_channels": 128
  },
  "module_audio_dec": {
    "in_channels": 256,
    "out_channels": 1,
    "kernel_size": 16,
    "stride": 4,
    "bias": false
  },
  "video_encoder_params": {
    "layers": [
      "residual",
      "compress_space",
      "consecutive_residual",
      "compress_space",
      "consecutive_residual",
      "linear_attend_space",
      "compress_space",
      "consecutive_residual",
      "attend_space"
    ],
    "image_size": 88,
    "in_channel": 1,
    "init_channel": 4,
    "max_dim": 32,
    "input_conv_kernel_size": [
      7,
      7,
      7
    ],
    "output_conv_kernel_size": [
      3,
      3,
      3
    ],
    "residual_conv_kernel_size": 3,
    "pad_mode": "constant",
    "attn_dim_head": 32,
    "attn_heads": 8,
    "attn_dropout": 0.0,
    "flash_attn": true,
    "linear_attn_dim_head": 8,
    "linear_attn_heads": 16,
    "num_quantizers": 1,
    "codebook_size": 256,
    "codebook_dim": 64,
    "commitment_cost": 1.0,
    "distill_cost": 1.0
  }
}