IIANet (code, models, paper)

Browse files

Files changed (10) hide show

.gitattributes +1 -0
IIANet. An Intra- and Inter-Modality Attention Network for Audio-Visual Speech Separation.pdf +3 -0
code/IIANet [RickyQzh] +2.zip +3 -0
code/IIANet.zip +3 -0
models/LRS2-IIANet.yml +77 -0
models/LRS3-IIANet.yml +77 -0
models/Vox2-IIANet.yml +77 -0
models/lrs2.zip +3 -0
models/lrs3.zip +3 -0
models/vox2.zip +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+IIANet.[[:space:]]An[[:space:]]Intra-[[:space:]]and[[:space:]]Inter-Modality[[:space:]]Attention[[:space:]]Network[[:space:]]for[[:space:]]Audio-Visual[[:space:]]Speech[[:space:]]Separation.pdf filter=lfs diff=lfs merge=lfs -text

IIANet. An Intra- and Inter-Modality Attention Network for Audio-Visual Speech Separation.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e360c88b42e3e94a2a93f5903e4a214bd8f10ee113f54c1bf699a44b8f5b7b15
+size 2306127

code/IIANet [RickyQzh] +2.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:33aa1a27cadba1ba71f367d74956db158ad3dfb723440d255c7ce130782a2d4a
+size 227656344

code/IIANet.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ef8b1c1d57140ce3f7ca09f9abdb2bd2435a06d20fb00837b8a37b4b72d05ba5
+size 227617861

models/LRS2-IIANet.yml ADDED Viewed

	@@ -0,0 +1,77 @@

+# Network config
+audionet:
+  audionet_name: IIANet
+  audionet_config:
+    out_channels: 128
+    in_channels: 512
+    vpre_channels: 512
+    vin_channels: 64
+    vout_channels: 64
+    num_blocks: 16
+    upsampling_depth: 5
+    enc_kernel_size: 1 # ms
+    num_sources: 1  # same value as datamodule.data_config.n_src below
+videonet:
+  videonet_name: ResNetVideoModel
+  videonet_config:
+    pretrain: pretrain_zoo/lrw_resnet18_mstcn_adamw_s3.pth.tar  # relative path - TODO confirm what it is resolved against
+# Loss config
+loss:
+  train:
+    loss_func: PITLossWrapper
+    sdr_type: pairwise_neg_snr  # train and val share loss_func but use different sdr_type values
+    config:
+      pit_from: pw_mtx
+      threshold_byloss: false
+  val:
+    loss_func: PITLossWrapper
+    sdr_type: pairwise_neg_sisdr  # differs from train (pairwise_neg_snr)
+    config:
+      pit_from: pw_mtx
+      threshold_byloss: false
+# Training config
+training:
+  system: AudioVisualLightningModule
+  gpus: [0,1,2,3,4,5,6,7]  # eight device indices; adjust to the local machine
+  parallel: ddp
+  epochs: 500
+  early_stop:
+    monitor: val_loss/dataloader_idx_0
+    mode: min
+    patience: 20  # twice scheduler.sche_config.patience (10)
+    verbose: true
+# Optim config
+optimizer:
+  optim_name: adam
+  lr: 0.001
+  weight_decay: 0.  # trailing-dot float literal, i.e. 0.0
+# Sche config
+scheduler:
+  sche_name: ReduceLROnPlateau
+  sche_config:
+    patience: 10  # scheduler patience; separate from training.early_stop.patience (20)
+    factor: 0.5
+# Data config
+datamodule:
+  data_name: AVSpeechDyanmicDataModule  # NOTE(review): "Dyanmic" presumably mirrors the class name in code - verify before renaming
+  data_config:
+    train_dir: DataPreProcess/LRS2/tr  # relative paths; tr / cv / tt back the train / valid / test keys
+    valid_dir: DataPreProcess/LRS2/cv
+    test_dir: DataPreProcess/LRS2/tt
+    n_src: 1
+    sample_rate: 16000  # presumably Hz - TODO confirm
+    segment: 2.0  # presumably seconds - TODO confirm
+    normalize_audio: false
+    batch_size: 3  # NOTE(review): unclear whether per-device or global under ddp - confirm
+    num_workers: 24
+    pin_memory: true
+    persistent_workers: false
+exp:
+  exp_name: LRS2-IIANet  # matches this config's file name (LRS2-IIANet.yml)

models/LRS3-IIANet.yml ADDED Viewed

	@@ -0,0 +1,77 @@

+# Network config
+audionet:
+  audionet_name: IIANet
+  audionet_config:
+    out_channels: 128
+    in_channels: 512
+    vpre_channels: 512
+    vin_channels: 64
+    vout_channels: 64
+    num_blocks: 16
+    upsampling_depth: 5
+    enc_kernel_size: 1 # ms
+    num_sources: 1  # same value as datamodule.data_config.n_src below
+videonet:
+  videonet_name: ResNetVideoModel
+  videonet_config:
+    pretrain: pretrain_zoo/lrw_resnet18_mstcn_adamw_s3.pth.tar  # relative path - TODO confirm what it is resolved against
+# Loss config
+loss:
+  train:
+    loss_func: PITLossWrapper
+    sdr_type: pairwise_neg_snr  # train and val share loss_func but use different sdr_type values
+    config:
+      pit_from: pw_mtx
+      threshold_byloss: false
+  val:
+    loss_func: PITLossWrapper
+    sdr_type: pairwise_neg_sisdr  # differs from train (pairwise_neg_snr)
+    config:
+      pit_from: pw_mtx
+      threshold_byloss: false
+# Training config
+training:
+  system: AudioVisualLightningModule
+  gpus: [0,1,2,3,4,5,6,7]  # eight device indices; adjust to the local machine
+  parallel: ddp
+  epochs: 500
+  early_stop:
+    monitor: val_loss/dataloader_idx_0
+    mode: min
+    patience: 20  # twice scheduler.sche_config.patience (10)
+    verbose: true
+# Optim config
+optimizer:
+  optim_name: adam
+  lr: 0.001
+  weight_decay: 0.  # trailing-dot float literal, i.e. 0.0
+# Sche config
+scheduler:
+  sche_name: ReduceLROnPlateau
+  sche_config:
+    patience: 10  # scheduler patience; separate from training.early_stop.patience (20)
+    factor: 0.5
+# Data config
+datamodule:
+  data_name: AVSpeechDyanmicDataModule  # NOTE(review): "Dyanmic" presumably mirrors the class name in code - verify before renaming
+  data_config:
+    train_dir: DataPreProcess/LRS3/tr  # relative paths; tr / cv / tt back the train / valid / test keys
+    valid_dir: DataPreProcess/LRS3/cv
+    test_dir: DataPreProcess/LRS3/tt
+    n_src: 1
+    sample_rate: 16000  # presumably Hz - TODO confirm
+    segment: 2.0  # presumably seconds - TODO confirm
+    normalize_audio: false
+    batch_size: 3  # NOTE(review): unclear whether per-device or global under ddp - confirm
+    num_workers: 24
+    pin_memory: true
+    persistent_workers: false
+exp:
+  exp_name: LRS3-IIANet  # matches this config's file name (LRS3-IIANet.yml)

models/Vox2-IIANet.yml ADDED Viewed

	@@ -0,0 +1,77 @@

+# Network config
+audionet:
+  audionet_name: IIANet
+  audionet_config:
+    out_channels: 128
+    in_channels: 512
+    vpre_channels: 512
+    vin_channels: 64
+    vout_channels: 64
+    num_blocks: 16
+    upsampling_depth: 5
+    enc_kernel_size: 1 # ms
+    num_sources: 1  # same value as datamodule.data_config.n_src below
+videonet:
+  videonet_name: ResNetVideoModel
+  videonet_config:
+    pretrain: pretrain_zoo/lrw_resnet18_mstcn_adamw_s3.pth.tar  # relative path - TODO confirm what it is resolved against
+# Loss config
+loss:
+  train:
+    loss_func: PITLossWrapper
+    sdr_type: pairwise_neg_snr  # train and val share loss_func but use different sdr_type values
+    config:
+      pit_from: pw_mtx
+      threshold_byloss: false
+  val:
+    loss_func: PITLossWrapper
+    sdr_type: pairwise_neg_sisdr  # differs from train (pairwise_neg_snr)
+    config:
+      pit_from: pw_mtx
+      threshold_byloss: false
+# Training config
+training:
+  system: AudioVisualLightningModule
+  gpus: [0,1,2,3,4,5,6,7]  # eight device indices; adjust to the local machine
+  parallel: ddp
+  epochs: 500
+  early_stop:
+    monitor: val_loss/dataloader_idx_0
+    mode: min
+    patience: 20  # twice scheduler.sche_config.patience (10)
+    verbose: true
+# Optim config
+optimizer:
+  optim_name: adam
+  lr: 0.001
+  weight_decay: 0.  # trailing-dot float literal, i.e. 0.0
+# Sche config
+scheduler:
+  sche_name: ReduceLROnPlateau
+  sche_config:
+    patience: 10  # scheduler patience; separate from training.early_stop.patience (20)
+    factor: 0.5
+# Data config
+datamodule:
+  data_name: AVSpeechDyanmicDataModule  # NOTE(review): "Dyanmic" presumably mirrors the class name in code - verify before renaming
+  data_config:
+    train_dir: DataPreProcess/Vox2/tr  # relative paths; tr / cv / tt back the train / valid / test keys
+    valid_dir: DataPreProcess/Vox2/cv
+    test_dir: DataPreProcess/Vox2/tt
+    n_src: 1
+    sample_rate: 16000  # presumably Hz - TODO confirm
+    segment: 2.0  # presumably seconds - TODO confirm
+    normalize_audio: false
+    batch_size: 3  # NOTE(review): unclear whether per-device or global under ddp - confirm
+    num_workers: 24
+    pin_memory: true
+    persistent_workers: false
+exp:
+  exp_name: Vox2-IIANet  # matches this config's file name (Vox2-IIANet.yml)

models/lrs2.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7d45cb6e0c8e0a03eceb57e269c456014d863496d9ff3cca399c1fed538a790d
+size 11322210

models/lrs3.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d30164409cb9e43cec2203c16383b6476910d80f366e34ec748b11ab09e8ca3
+size 33866445

models/vox2.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0455bb8b633129811243d61a0fa3d3deb29f05bfb7385751d9228ed3cb7999b8
+size 11316948