niobures committed on
Commit
d7ab592
·
verified ·
1 Parent(s): 69b1609

IIANet (code, models, paper)

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ IIANet.[[:space:]]An[[:space:]]Intra-[[:space:]]and[[:space:]]Inter-Modality[[:space:]]Attention[[:space:]]Network[[:space:]]for[[:space:]]Audio-Visual[[:space:]]Speech[[:space:]]Separation.pdf filter=lfs diff=lfs merge=lfs -text
IIANet. An Intra- and Inter-Modality Attention Network for Audio-Visual Speech Separation.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e360c88b42e3e94a2a93f5903e4a214bd8f10ee113f54c1bf699a44b8f5b7b15
3
+ size 2306127
code/IIANet [RickyQzh] +2.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33aa1a27cadba1ba71f367d74956db158ad3dfb723440d255c7ce130782a2d4a
3
+ size 227656344
code/IIANet.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef8b1c1d57140ce3f7ca09f9abdb2bd2435a06d20fb00837b8a37b4b72d05ba5
3
+ size 227617861
models/LRS2-IIANet.yml ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Network config
2
+ audionet:
3
+ audionet_name: IIANet
4
+ audionet_config:
5
+ out_channels: 128
6
+ in_channels: 512
7
+ vpre_channels: 512
8
+ vin_channels: 64
9
+ vout_channels: 64
10
+ num_blocks: 16
11
+ upsampling_depth: 5
12
+ enc_kernel_size: 1 # ms
13
+ num_sources: 1
14
+
15
+ videonet:
16
+ videonet_name: ResNetVideoModel
17
+ videonet_config:
18
+ pretrain: pretrain_zoo/lrw_resnet18_mstcn_adamw_s3.pth.tar
19
+
20
+ # Loss config
21
+ loss:
22
+ train:
23
+ loss_func: PITLossWrapper
24
+ sdr_type: pairwise_neg_snr
25
+ config:
26
+ pit_from: pw_mtx
27
+ threshold_byloss: false
28
+ val:
29
+ loss_func: PITLossWrapper
30
+ sdr_type: pairwise_neg_sisdr
31
+ config:
32
+ pit_from: pw_mtx
33
+ threshold_byloss: false
34
+
35
+ # Training config
36
+ training:
37
+ system: AudioVisualLightningModule
38
+ gpus: [0,1,2,3,4,5,6,7]
39
+ parallel: ddp
40
+ epochs: 500
41
+ early_stop:
42
+ monitor: val_loss/dataloader_idx_0
43
+ mode: min
44
+ patience: 20
45
+ verbose: true
46
+
47
+ # Optim config
48
+ optimizer:
49
+ optim_name: adam
50
+ lr: 0.001
51
+ weight_decay: 0.
52
+
53
+ # Scheduler config
54
+ scheduler:
55
+ sche_name: ReduceLROnPlateau
56
+ sche_config:
57
+ patience: 10
58
+ factor: 0.5
59
+
60
+ # Data config
61
+ datamodule:
62
+ data_name: AVSpeechDyanmicDataModule
63
+ data_config:
64
+ train_dir: DataPreProcess/LRS2/tr
65
+ valid_dir: DataPreProcess/LRS2/cv
66
+ test_dir: DataPreProcess/LRS2/tt
67
+ n_src: 1
68
+ sample_rate: 16000
69
+ segment: 2.0
70
+ normalize_audio: false
71
+ batch_size: 3
72
+ num_workers: 24
73
+ pin_memory: true
74
+ persistent_workers: false
75
+
76
+ exp:
77
+ exp_name: LRS2-IIANet
models/LRS3-IIANet.yml ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Network config
2
+ audionet:
3
+ audionet_name: IIANet
4
+ audionet_config:
5
+ out_channels: 128
6
+ in_channels: 512
7
+ vpre_channels: 512
8
+ vin_channels: 64
9
+ vout_channels: 64
10
+ num_blocks: 16
11
+ upsampling_depth: 5
12
+ enc_kernel_size: 1 # ms
13
+ num_sources: 1
14
+
15
+ videonet:
16
+ videonet_name: ResNetVideoModel
17
+ videonet_config:
18
+ pretrain: pretrain_zoo/lrw_resnet18_mstcn_adamw_s3.pth.tar
19
+
20
+ # Loss config
21
+ loss:
22
+ train:
23
+ loss_func: PITLossWrapper
24
+ sdr_type: pairwise_neg_snr
25
+ config:
26
+ pit_from: pw_mtx
27
+ threshold_byloss: false
28
+ val:
29
+ loss_func: PITLossWrapper
30
+ sdr_type: pairwise_neg_sisdr
31
+ config:
32
+ pit_from: pw_mtx
33
+ threshold_byloss: false
34
+
35
+ # Training config
36
+ training:
37
+ system: AudioVisualLightningModule
38
+ gpus: [0,1,2,3,4,5,6,7]
39
+ parallel: ddp
40
+ epochs: 500
41
+ early_stop:
42
+ monitor: val_loss/dataloader_idx_0
43
+ mode: min
44
+ patience: 20
45
+ verbose: true
46
+
47
+ # Optim config
48
+ optimizer:
49
+ optim_name: adam
50
+ lr: 0.001
51
+ weight_decay: 0.
52
+
53
+ # Scheduler config
54
+ scheduler:
55
+ sche_name: ReduceLROnPlateau
56
+ sche_config:
57
+ patience: 10
58
+ factor: 0.5
59
+
60
+ # Data config
61
+ datamodule:
62
+ data_name: AVSpeechDyanmicDataModule
63
+ data_config:
64
+ train_dir: DataPreProcess/LRS3/tr
65
+ valid_dir: DataPreProcess/LRS3/cv
66
+ test_dir: DataPreProcess/LRS3/tt
67
+ n_src: 1
68
+ sample_rate: 16000
69
+ segment: 2.0
70
+ normalize_audio: false
71
+ batch_size: 3
72
+ num_workers: 24
73
+ pin_memory: true
74
+ persistent_workers: false
75
+
76
+ exp:
77
+ exp_name: LRS3-IIANet
models/Vox2-IIANet.yml ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Network config
2
+ audionet:
3
+ audionet_name: IIANet
4
+ audionet_config:
5
+ out_channels: 128
6
+ in_channels: 512
7
+ vpre_channels: 512
8
+ vin_channels: 64
9
+ vout_channels: 64
10
+ num_blocks: 16
11
+ upsampling_depth: 5
12
+ enc_kernel_size: 1 # ms
13
+ num_sources: 1
14
+
15
+ videonet:
16
+ videonet_name: ResNetVideoModel
17
+ videonet_config:
18
+ pretrain: pretrain_zoo/lrw_resnet18_mstcn_adamw_s3.pth.tar
19
+
20
+ # Loss config
21
+ loss:
22
+ train:
23
+ loss_func: PITLossWrapper
24
+ sdr_type: pairwise_neg_snr
25
+ config:
26
+ pit_from: pw_mtx
27
+ threshold_byloss: false
28
+ val:
29
+ loss_func: PITLossWrapper
30
+ sdr_type: pairwise_neg_sisdr
31
+ config:
32
+ pit_from: pw_mtx
33
+ threshold_byloss: false
34
+
35
+ # Training config
36
+ training:
37
+ system: AudioVisualLightningModule
38
+ gpus: [0,1,2,3,4,5,6,7]
39
+ parallel: ddp
40
+ epochs: 500
41
+ early_stop:
42
+ monitor: val_loss/dataloader_idx_0
43
+ mode: min
44
+ patience: 20
45
+ verbose: true
46
+
47
+ # Optim config
48
+ optimizer:
49
+ optim_name: adam
50
+ lr: 0.001
51
+ weight_decay: 0.
52
+
53
+ # Scheduler config
54
+ scheduler:
55
+ sche_name: ReduceLROnPlateau
56
+ sche_config:
57
+ patience: 10
58
+ factor: 0.5
59
+
60
+ # Data config
61
+ datamodule:
62
+ data_name: AVSpeechDyanmicDataModule
63
+ data_config:
64
+ train_dir: DataPreProcess/Vox2/tr
65
+ valid_dir: DataPreProcess/Vox2/cv
66
+ test_dir: DataPreProcess/Vox2/tt
67
+ n_src: 1
68
+ sample_rate: 16000
69
+ segment: 2.0
70
+ normalize_audio: false
71
+ batch_size: 3
72
+ num_workers: 24
73
+ pin_memory: true
74
+ persistent_workers: false
75
+
76
+ exp:
77
+ exp_name: Vox2-IIANet
models/lrs2.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d45cb6e0c8e0a03eceb57e269c456014d863496d9ff3cca399c1fed538a790d
3
+ size 11322210
models/lrs3.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d30164409cb9e43cec2203c16383b6476910d80f366e34ec748b11ab09e8ca3
3
+ size 33866445
models/vox2.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0455bb8b633129811243d61a0fa3d3deb29f05bfb7385751d9228ed3cb7999b8
3
+ size 11316948