JusperLee
/

Dolphin

+---
+tags:
+- model_hub_mixin
+- pytorch_model_hub_mixin
+---
+This model has been pushed to the Hub using the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
+- Code: [More Information Needed]
+- Paper: [More Information Needed]
+- Docs: [More Information Needed]

config.json ADDED Viewed

	@@ -0,0 +1,138 @@

+{
+  "architectures": [
+    "Dolphin"
+  ],
+  "auto_map": {
+    "AutoModel": "dolphin.Dolphin"
+  },
+  "framework": "pytorch",
+  "license": "apache-2.0",
+  "model_config": {
+    "module_audio_dec": {
+      "bias": false,
+      "in_channels": 256,
+      "kernel_size": 16,
+      "out_channels": 1,
+      "stride": 4
+    },
+    "module_audio_enc": {
+      "bias": false,
+      "groups": 1,
+      "in_channels": 1,
+      "kernel_size": 16,
+      "out_channels": 256,
+      "stride": 4
+    },
+    "module_feature_projector": {
+      "bias": false,
+      "in_channels": 256,
+      "kernel_size": 1,
+      "num_channels": 256,
+      "out_channels": 128
+    },
+    "module_output_layer": {
+      "in_channels": 256,
+      "out_channels": 128
+    },
+    "module_separator": {
+      "dec_stage": {
+        "global_blocks": {
+          "dropout_rate": 0.05,
+          "in_channels": 128,
+          "num_mha_heads": 8
+        },
+        "local_blocks": {
+          "dropout_rate": 0.05,
+          "in_channels": 128,
+          "kernel_size": 65
+        },
+        "spk_attention": {
+          "dropout_rate": 0.05,
+          "in_channels": 128,
+          "num_mha_heads": 8
+        }
+      },
+      "enc_stage": {
+        "down_conv_layer": {
+          "in_channels": 128,
+          "samp_kernel_size": 5
+        },
+        "global_blocks": {
+          "dropout_rate": 0.05,
+          "in_channels": 128,
+          "num_mha_heads": 8
+        },
+        "local_blocks": {
+          "dropout_rate": 0.05,
+          "in_channels": 128,
+          "kernel_size": 65
+        }
+      },
+      "num_stages": 4,
+      "relative_positional_encoding": {
+        "embed_v": false,
+        "in_channels": 128,
+        "maxlen": 2000,
+        "num_heads": 8
+      },
+      "simple_fusion": {
+        "out_channels": 128
+      }
+    },
+    "num_stages": 4,
+    "sample_rate": 16000,
+    "video_encoder_params": {
+      "attn_dim_head": 32,
+      "attn_dropout": 0.0,
+      "attn_heads": 8,
+      "codebook_dim": 64,
+      "codebook_size": 256,
+      "commitment_cost": 1.0,
+      "distill_cost": 1.0,
+      "flash_attn": true,
+      "image_size": 88,
+      "in_channel": 1,
+      "init_channel": 4,
+      "input_conv_kernel_size": [
+        7,
+        7,
+        7
+      ],
+      "layers": [
+        "residual",
+        "compress_space",
+        "consecutive_residual",
+        "compress_space",
+        "consecutive_residual",
+        "linear_attend_space",
+        "compress_space",
+        "consecutive_residual",
+        "attend_space"
+      ],
+      "linear_attn_dim_head": 8,
+      "linear_attn_heads": 16,
+      "max_dim": 32,
+      "num_quantizers": 1,
+      "output_conv_kernel_size": [
+        3,
+        3,
+        3
+      ],
+      "pad_mode": "constant",
+      "residual_conv_kernel_size": 3
+    },
+    "vin_channels": 64,
+    "vmid_channels": 512,
+    "vout_channels": 64,
+    "vpre_channels": 3872
+  },
+  "model_type": "dolphin",
+  "tags": [
+    "audio",
+    "speech-separation",
+    "audio-visual",
+    "pytorch",
+    "dolphin"
+  ],
+  "task": "audio_visual_speech_separation"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9be694e4150588ca0af8447fae184b6262a3cf43587928bd6001eee5b4eefb8a
+size 28391276