SepReformer (code, models, paper)
Browse files- .gitattributes +1 -0
- Separate and Reconstruct. Asymmetric Encoder-Decoder for Speech Separation.pdf +3 -0
- code/SepReformer.zip +3 -0
- code/sepformer-tse.zip +3 -0
- models/SepReformer/SepReformer_Base_WSJ0/configs.yaml +139 -0
- models/SepReformer/SepReformer_Base_WSJ0/dataset.py +165 -0
- models/SepReformer/SepReformer_Base_WSJ0/engine.py +216 -0
- models/SepReformer/SepReformer_Base_WSJ0/log/scratch_weights/epoch.0180.pth +3 -0
- models/SepReformer/SepReformer_Base_WSJ0/main.py +47 -0
- models/SepReformer/SepReformer_Base_WSJ0/model.py +53 -0
- models/SepReformer/SepReformer_Base_WSJ0/modules/module.py +283 -0
- models/SepReformer/SepReformer_Base_WSJ0/modules/network.py +252 -0
- models/SepReformer/SepReformer_Large_DM_WHAM/configs.yaml +129 -0
- models/SepReformer/SepReformer_Large_DM_WHAM/dataset.py +177 -0
- models/SepReformer/SepReformer_Large_DM_WHAM/engine.py +192 -0
- models/SepReformer/SepReformer_Large_DM_WHAM/main.py +44 -0
- models/SepReformer/SepReformer_Large_DM_WHAM/model.py +53 -0
- models/SepReformer/SepReformer_Large_DM_WHAM/modules/module.py +286 -0
- models/SepReformer/SepReformer_Large_DM_WHAM/modules/network.py +252 -0
- models/SepReformer/SepReformer_Large_DM_WHAMR/configs.yaml +131 -0
- models/SepReformer/SepReformer_Large_DM_WHAMR/dataset.py +187 -0
- models/SepReformer/SepReformer_Large_DM_WHAMR/engine.py +192 -0
- models/SepReformer/SepReformer_Large_DM_WHAMR/main.py +44 -0
- models/SepReformer/SepReformer_Large_DM_WHAMR/model.py +53 -0
- models/SepReformer/SepReformer_Large_DM_WHAMR/modules/__pycache__/module.cpython-310.pyc +0 -0
- models/SepReformer/SepReformer_Large_DM_WHAMR/modules/__pycache__/module.cpython-38.pyc +0 -0
- models/SepReformer/SepReformer_Large_DM_WHAMR/modules/__pycache__/network.cpython-310.pyc +0 -0
- models/SepReformer/SepReformer_Large_DM_WHAMR/modules/__pycache__/network.cpython-38.pyc +0 -0
- models/SepReformer/SepReformer_Large_DM_WHAMR/modules/module.py +283 -0
- models/SepReformer/SepReformer_Large_DM_WHAMR/modules/network.py +252 -0
- models/SepReformer/SepReformer_Large_DM_WSJ0/configs.yaml +128 -0
- models/SepReformer/SepReformer_Large_DM_WSJ0/dataset.py +171 -0
- models/SepReformer/SepReformer_Large_DM_WSJ0/engine.py +192 -0
- models/SepReformer/SepReformer_Large_DM_WSJ0/main.py +44 -0
- models/SepReformer/SepReformer_Large_DM_WSJ0/model.py +53 -0
- models/SepReformer/SepReformer_Large_DM_WSJ0/modules/module.py +283 -0
- models/SepReformer/SepReformer_Large_DM_WSJ0/modules/network.py +252 -0
- models/SepReformer/source.txt +1 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
Separate[[:space:]]and[[:space:]]Reconstruct.[[:space:]]Asymmetric[[:space:]]Encoder-Decoder[[:space:]]for[[:space:]]Speech[[:space:]]Separation.pdf filter=lfs diff=lfs merge=lfs -text
|
Separate and Reconstruct. Asymmetric Encoder-Decoder for Speech Separation.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:97766a617509db55e816689f3f3e8e52c03b06ebf04f98f03e298a5556a4e898
|
| 3 |
+
size 1952305
|
code/SepReformer.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dc9b31d464b79b6ac037879b160445f53a4de9e4a411cce0954fc24c0ff7706d
|
| 3 |
+
size 16944535
|
code/sepformer-tse.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:13905e009d88354fcb21579e11f5dcebe3114eb2d238eab0fa4cc11f9cc237ea
|
| 3 |
+
size 114198916
|
models/SepReformer/SepReformer_Base_WSJ0/configs.yaml
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
project: "[Project] SepReformer" ### Dont't change
|
| 2 |
+
notes: "SepReformer final version" ### Insert schanges(plz write details !!!)
|
| 3 |
+
# ------------------------------------------------------------------------------------------------------------------------------ #
|
| 4 |
+
config:
|
| 5 |
+
# ------------------------------------------------------------ #
|
| 6 |
+
dataset:
|
| 7 |
+
max_len: 32000
|
| 8 |
+
sampling_rate: 8000
|
| 9 |
+
scp_dir: "data/scp_ss_8k"
|
| 10 |
+
train:
|
| 11 |
+
mixture: "tr_mix.scp"
|
| 12 |
+
spk1: "tr_s1.scp"
|
| 13 |
+
spk2: "tr_s2.scp"
|
| 14 |
+
dynamic_mixing: false
|
| 15 |
+
valid:
|
| 16 |
+
mixture: "cv_mix.scp"
|
| 17 |
+
spk1: "cv_s1.scp"
|
| 18 |
+
spk2: "cv_s2.scp"
|
| 19 |
+
test:
|
| 20 |
+
mixture: "tt_mix.scp"
|
| 21 |
+
spk1: "tt_s1.scp"
|
| 22 |
+
spk2: "tt_s2.scp"
|
| 23 |
+
# ------------------------------------------------------------ #
|
| 24 |
+
dataloader:
|
| 25 |
+
batch_size: 2
|
| 26 |
+
pin_memory: false
|
| 27 |
+
num_workers: 12
|
| 28 |
+
drop_last: false
|
| 29 |
+
# ------------------------------------------------------------ #
|
| 30 |
+
model:
|
| 31 |
+
num_stages: &var_model_num_stages 4 # R
|
| 32 |
+
num_spks: &var_model_num_spks 2
|
| 33 |
+
module_audio_enc:
|
| 34 |
+
in_channels: 1
|
| 35 |
+
out_channels: &var_model_audio_enc_out_channels 256
|
| 36 |
+
kernel_size: &var_model_audio_enc_kernel_size 16 # L
|
| 37 |
+
stride: &var_model_audio_enc_stride 4 # S
|
| 38 |
+
groups: 1
|
| 39 |
+
bias: false
|
| 40 |
+
module_feature_projector:
|
| 41 |
+
num_channels: *var_model_audio_enc_out_channels
|
| 42 |
+
in_channels: *var_model_audio_enc_out_channels
|
| 43 |
+
out_channels: &feature_projector_out_channels 128 # F
|
| 44 |
+
kernel_size: 1
|
| 45 |
+
bias: false
|
| 46 |
+
module_separator:
|
| 47 |
+
num_stages: *var_model_num_stages
|
| 48 |
+
relative_positional_encoding:
|
| 49 |
+
in_channels: *feature_projector_out_channels
|
| 50 |
+
num_heads: 8
|
| 51 |
+
maxlen: 2000
|
| 52 |
+
embed_v: false
|
| 53 |
+
enc_stage:
|
| 54 |
+
global_blocks:
|
| 55 |
+
in_channels: *feature_projector_out_channels
|
| 56 |
+
num_mha_heads: 8
|
| 57 |
+
dropout_rate: 0.05
|
| 58 |
+
local_blocks:
|
| 59 |
+
in_channels: *feature_projector_out_channels
|
| 60 |
+
kernel_size: 65
|
| 61 |
+
dropout_rate: 0.05
|
| 62 |
+
down_conv_layer:
|
| 63 |
+
in_channels: *feature_projector_out_channels
|
| 64 |
+
samp_kernel_size: &var_model_samp_kernel_size 5
|
| 65 |
+
spk_split_stage:
|
| 66 |
+
in_channels: *feature_projector_out_channels
|
| 67 |
+
num_spks: *var_model_num_spks
|
| 68 |
+
simple_fusion:
|
| 69 |
+
out_channels: *feature_projector_out_channels
|
| 70 |
+
dec_stage:
|
| 71 |
+
num_spks: *var_model_num_spks
|
| 72 |
+
global_blocks:
|
| 73 |
+
in_channels: *feature_projector_out_channels
|
| 74 |
+
num_mha_heads: 8
|
| 75 |
+
dropout_rate: 0.05
|
| 76 |
+
local_blocks:
|
| 77 |
+
in_channels: *feature_projector_out_channels
|
| 78 |
+
kernel_size: 65
|
| 79 |
+
dropout_rate: 0.05
|
| 80 |
+
spk_attention:
|
| 81 |
+
in_channels: *feature_projector_out_channels
|
| 82 |
+
num_mha_heads: 8
|
| 83 |
+
dropout_rate: 0.05
|
| 84 |
+
module_output_layer:
|
| 85 |
+
in_channels: *var_model_audio_enc_out_channels
|
| 86 |
+
out_channels: *feature_projector_out_channels
|
| 87 |
+
num_spks: *var_model_num_spks
|
| 88 |
+
module_audio_dec:
|
| 89 |
+
in_channels: *var_model_audio_enc_out_channels
|
| 90 |
+
out_channels: 1
|
| 91 |
+
kernel_size: *var_model_audio_enc_kernel_size
|
| 92 |
+
stride: *var_model_audio_enc_stride
|
| 93 |
+
bias: false
|
| 94 |
+
# ------------------------------------------------------------ #
|
| 95 |
+
criterion:
|
| 96 |
+
name: ["PIT_SISNR_mag", "PIT_SISNR_time", "PIT_SISNRi", "PIT_SDRi"] ### Choose a torch.nn's loss function class(=attribute) e.g. ["L1Loss", "MSELoss", "CrossEntropyLoss", ...]
|
| 97 |
+
PIT_SISNR_mag:
|
| 98 |
+
frame_length: 512
|
| 99 |
+
frame_shift: 128
|
| 100 |
+
window: 'hann'
|
| 101 |
+
num_stages: *var_model_num_stages
|
| 102 |
+
num_spks: *var_model_num_spks
|
| 103 |
+
scale_inv: true
|
| 104 |
+
mel_opt: false
|
| 105 |
+
PIT_SISNR_time:
|
| 106 |
+
num_spks: *var_model_num_spks
|
| 107 |
+
scale_inv: true
|
| 108 |
+
PIT_SISNRi:
|
| 109 |
+
num_spks: *var_model_num_spks
|
| 110 |
+
scale_inv: true
|
| 111 |
+
PIT_SDRi:
|
| 112 |
+
dump: 0
|
| 113 |
+
# ------------------------------------------------------------ #
|
| 114 |
+
optimizer:
|
| 115 |
+
name: ["AdamW"] ### Choose a torch.optim's class(=attribute) e.g. ["Adam", "AdamW", "SGD", ...]
|
| 116 |
+
AdamW:
|
| 117 |
+
lr: 1.0e-3
|
| 118 |
+
weight_decay: 1.0e-2
|
| 119 |
+
# ------------------------------------------------------------ #
|
| 120 |
+
scheduler:
|
| 121 |
+
name: ["ReduceLROnPlateau", "WarmupConstantSchedule"] ### Choose a torch.optim.lr_scheduler's class(=attribute) e.g. ["StepLR", "ReduceLROnPlateau", "Custom"]
|
| 122 |
+
ReduceLROnPlateau:
|
| 123 |
+
mode: "min"
|
| 124 |
+
min_lr: 1.0e-10
|
| 125 |
+
factor: 0.8
|
| 126 |
+
patience: 2
|
| 127 |
+
WarmupConstantSchedule:
|
| 128 |
+
warmup_steps: 1000
|
| 129 |
+
# ------------------------------------------------------------ #
|
| 130 |
+
check_computations:
|
| 131 |
+
dummy_len: 16000
|
| 132 |
+
# ------------------------------------------------------------ #
|
| 133 |
+
engine:
|
| 134 |
+
max_epoch: 200
|
| 135 |
+
gpuid: "0" ### "0"(single-gpu) or "0, 1" (multi-gpu)
|
| 136 |
+
mvn: false
|
| 137 |
+
clip_norm: 5
|
| 138 |
+
start_scheduling: 50
|
| 139 |
+
test_epochs: [100, 120, 150, 170]
|
models/SepReformer/SepReformer_Base_WSJ0/dataset.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import random
|
| 4 |
+
import librosa as audio_lib
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
from utils import util_dataset
|
| 8 |
+
from utils.decorators import *
|
| 9 |
+
from loguru import logger
|
| 10 |
+
from torch.utils.data import Dataset, DataLoader
|
| 11 |
+
|
| 12 |
+
@logger_wraps()
|
| 13 |
+
def get_dataloaders(args, dataset_config, loader_config):
|
| 14 |
+
# create dataset object for each partition
|
| 15 |
+
partitions = ["test"] if "test" in args.engine_mode else ["train", "valid", "test"]
|
| 16 |
+
dataloaders = {}
|
| 17 |
+
for partition in partitions:
|
| 18 |
+
scp_config_mix = os.path.join(dataset_config["scp_dir"], dataset_config[partition]['mixture'])
|
| 19 |
+
scp_config_spk = [os.path.join(dataset_config["scp_dir"], dataset_config[partition][spk_key]) for spk_key in dataset_config[partition] if spk_key.startswith('spk')]
|
| 20 |
+
dynamic_mixing = dataset_config[partition]["dynamic_mixing"] if partition == 'train' else False
|
| 21 |
+
dataset = MyDataset(
|
| 22 |
+
max_len = dataset_config['max_len'],
|
| 23 |
+
fs = dataset_config['sampling_rate'],
|
| 24 |
+
partition = partition,
|
| 25 |
+
wave_scp_srcs = scp_config_spk,
|
| 26 |
+
wave_scp_mix = scp_config_mix,
|
| 27 |
+
dynamic_mixing = dynamic_mixing)
|
| 28 |
+
dataloader = DataLoader(
|
| 29 |
+
dataset = dataset,
|
| 30 |
+
batch_size = 1 if partition == 'test' else loader_config["batch_size"],
|
| 31 |
+
shuffle = True, # only train: (partition == 'train') / all: True
|
| 32 |
+
pin_memory = loader_config["pin_memory"],
|
| 33 |
+
num_workers = loader_config["num_workers"],
|
| 34 |
+
drop_last = loader_config["drop_last"],
|
| 35 |
+
collate_fn = _collate)
|
| 36 |
+
dataloaders[partition] = dataloader
|
| 37 |
+
return dataloaders
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _collate(egs):
|
| 41 |
+
"""
|
| 42 |
+
Transform utterance index into a minbatch
|
| 43 |
+
|
| 44 |
+
Arguments:
|
| 45 |
+
index: a list type [{},{},{}]
|
| 46 |
+
|
| 47 |
+
Returns:
|
| 48 |
+
input_sizes: a tensor correspond to utterance length
|
| 49 |
+
input_feats: packed sequence to feed networks
|
| 50 |
+
source_attr/target_attr: dictionary contains spectrogram/phase needed in loss computation
|
| 51 |
+
"""
|
| 52 |
+
def __prepare_target_rir(dict_lsit, index):
|
| 53 |
+
return torch.nn.utils.rnn.pad_sequence([torch.tensor(d["src"][index], dtype=torch.float32) for d in dict_lsit], batch_first=True)
|
| 54 |
+
if type(egs) is not list: raise ValueError("Unsupported index type({})".format(type(egs)))
|
| 55 |
+
num_spks = 2 # you need to set this paramater by yourself
|
| 56 |
+
dict_list = sorted([eg for eg in egs], key=lambda x: x['num_sample'], reverse=True)
|
| 57 |
+
mixture = torch.nn.utils.rnn.pad_sequence([torch.tensor(d['mix'], dtype=torch.float32) for d in dict_list], batch_first=True)
|
| 58 |
+
src = [__prepare_target_rir(dict_list, index) for index in range(num_spks)]
|
| 59 |
+
input_sizes = torch.tensor([d['num_sample'] for d in dict_list], dtype=torch.float32)
|
| 60 |
+
key = [d['key'] for d in dict_list]
|
| 61 |
+
return input_sizes, mixture, src, key
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@logger_wraps()
|
| 65 |
+
class MyDataset(Dataset):
|
| 66 |
+
def __init__(self, max_len, fs, partition, wave_scp_srcs, wave_scp_mix, wave_scp_noise=None, dynamic_mixing=False, speed_list=None):
|
| 67 |
+
self.partition = partition
|
| 68 |
+
for wave_scp_src in wave_scp_srcs:
|
| 69 |
+
if not os.path.exists(wave_scp_src): raise FileNotFoundError(f"Could not find file {wave_scp_src}")
|
| 70 |
+
self.max_len = max_len
|
| 71 |
+
self.fs = fs
|
| 72 |
+
self.wave_dict_srcs = [util_dataset.parse_scps(wave_scp_src) for wave_scp_src in wave_scp_srcs]
|
| 73 |
+
self.wave_dict_mix = util_dataset.parse_scps(wave_scp_mix)
|
| 74 |
+
self.wave_dict_noise = util_dataset.parse_scps(wave_scp_noise) if wave_scp_noise else None
|
| 75 |
+
self.wave_keys = list(self.wave_dict_mix.keys())
|
| 76 |
+
logger.info(f"Create MyDataset for {wave_scp_mix} with {len(self.wave_dict_mix)} utterances")
|
| 77 |
+
self.dynamic_mixing = dynamic_mixing
|
| 78 |
+
|
| 79 |
+
def __len__(self):
|
| 80 |
+
return len(self.wave_dict_mix)
|
| 81 |
+
|
| 82 |
+
def __contains__(self, key):
|
| 83 |
+
return key in self.wave_dict_mix
|
| 84 |
+
|
| 85 |
+
def _dynamic_mixing(self, key):
|
| 86 |
+
def __match_length(wav, len_data) :
|
| 87 |
+
leftover = len(wav) - len_data
|
| 88 |
+
idx = random.randint(0,leftover)
|
| 89 |
+
wav = wav[idx:idx+len_data]
|
| 90 |
+
return wav
|
| 91 |
+
|
| 92 |
+
samps_src = []
|
| 93 |
+
src_len = []
|
| 94 |
+
# dyanmic source choice
|
| 95 |
+
# checking whether it is the same speaker
|
| 96 |
+
while True:
|
| 97 |
+
key_random = random.choice(list(self.wave_dict_srcs[0].keys()))
|
| 98 |
+
tmp1 = key.split('_')[1][:3] != key_random.split('_')[3][:3]
|
| 99 |
+
tmp2 = key.split('_')[3][:3] != key_random.split('_')[1][:3]
|
| 100 |
+
if tmp1 and tmp2: break
|
| 101 |
+
|
| 102 |
+
idx1, idx2 = (0, 1) if random.random() > 0.5 else (1, 0)
|
| 103 |
+
files = [self.wave_dict_srcs[idx1][key], self.wave_dict_srcs[idx2][key_random]]
|
| 104 |
+
|
| 105 |
+
# load
|
| 106 |
+
for file in files:
|
| 107 |
+
if not os.path.exists(file): raise FileNotFoundError("Input file {} do not exists!".format(file))
|
| 108 |
+
samps_tmp, _ = audio_lib.load(file, sr=self.fs)
|
| 109 |
+
# mixing with random gains
|
| 110 |
+
gain = pow(10,-random.uniform(-2.5,2.5)/20)
|
| 111 |
+
# Speed Augmentation
|
| 112 |
+
samps_tmp = np.array(self.speed_aug(torch.tensor(samps_tmp))[0])
|
| 113 |
+
samps_src.append(gain*samps_tmp)
|
| 114 |
+
src_len.append(len(samps_tmp))
|
| 115 |
+
|
| 116 |
+
# matching the audio length
|
| 117 |
+
min_len = min(src_len)
|
| 118 |
+
|
| 119 |
+
samps_src = [__match_length(s, min_len) for s in samps_src]
|
| 120 |
+
samps_mix = sum(samps_src)
|
| 121 |
+
|
| 122 |
+
# ! truncated along to the sample Length "L"
|
| 123 |
+
if len(samps_mix)%4 != 0:
|
| 124 |
+
remains = len(samps_mix)%4
|
| 125 |
+
samps_mix = samps_mix[:-remains]
|
| 126 |
+
samps_src = [s[:-remains] for s in samps_src]
|
| 127 |
+
|
| 128 |
+
if self.partition != "test":
|
| 129 |
+
if len(samps_mix) > self.max_len:
|
| 130 |
+
start = random.randint(0, len(samps_mix)-self.max_len)
|
| 131 |
+
samps_mix = samps_mix[start:start+self.max_len]
|
| 132 |
+
samps_src = [s[start:start+self.max_len] for s in samps_src]
|
| 133 |
+
return samps_mix, samps_src
|
| 134 |
+
|
| 135 |
+
def _direct_load(self, key):
|
| 136 |
+
samps_src = []
|
| 137 |
+
files = [wave_dict_src[key] for wave_dict_src in self.wave_dict_srcs]
|
| 138 |
+
for file in files:
|
| 139 |
+
if not os.path.exists(file): raise FileNotFoundError(f"Input file {file} do not exists!")
|
| 140 |
+
samps_tmp, _ = audio_lib.load(file, sr=self.fs)
|
| 141 |
+
samps_src.append(samps_tmp)
|
| 142 |
+
|
| 143 |
+
file = self.wave_dict_mix[key]
|
| 144 |
+
if not os.path.exists(file): raise FileNotFoundError(f"Input file {file} do not exists!")
|
| 145 |
+
samps_mix, _ = audio_lib.load(file, sr=self.fs)
|
| 146 |
+
|
| 147 |
+
# Truncate samples as needed
|
| 148 |
+
if len(samps_mix) % 4 != 0:
|
| 149 |
+
remains = len(samps_mix) % 4
|
| 150 |
+
samps_mix = samps_mix[:-remains]
|
| 151 |
+
samps_src = [s[:-remains] for s in samps_src]
|
| 152 |
+
|
| 153 |
+
if self.partition != "test":
|
| 154 |
+
if len(samps_mix) > self.max_len:
|
| 155 |
+
start = random.randint(0,len(samps_mix)-self.max_len)
|
| 156 |
+
samps_mix = samps_mix[start:start+self.max_len]
|
| 157 |
+
samps_src = [s[start:start+self.max_len] for s in samps_src]
|
| 158 |
+
|
| 159 |
+
return samps_mix, samps_src
|
| 160 |
+
|
| 161 |
+
def __getitem__(self, index):
|
| 162 |
+
key = self.wave_keys[index]
|
| 163 |
+
if any(key not in self.wave_dict_srcs[i] for i in range(len(self.wave_dict_srcs))) or key not in self.wave_dict_mix: raise KeyError(f"Could not find utterance {key}")
|
| 164 |
+
samps_mix, samps_src = self._dynamic_mixing(key) if self.dynamic_mixing else self._direct_load(key)
|
| 165 |
+
return {"num_sample": samps_mix.shape[0], "mix": samps_mix, "src": samps_src, "key": key}
|
models/SepReformer/SepReformer_Base_WSJ0/engine.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import csv
|
| 4 |
+
import time
|
| 5 |
+
import soundfile as sf
|
| 6 |
+
import librosa
|
| 7 |
+
from loguru import logger
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from utils import util_engine, functions
|
| 10 |
+
from utils.decorators import *
|
| 11 |
+
from torch.utils.tensorboard import SummaryWriter
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@logger_wraps()
|
| 15 |
+
class Engine(object):
|
| 16 |
+
def __init__(self, args, config, model, dataloaders, criterions, optimizers, schedulers, gpuid, device):
|
| 17 |
+
|
| 18 |
+
''' Default setting '''
|
| 19 |
+
self.engine_mode = args.engine_mode
|
| 20 |
+
self.out_wav_dir = args.out_wav_dir
|
| 21 |
+
self.config = config
|
| 22 |
+
self.gpuid = gpuid
|
| 23 |
+
self.device = device
|
| 24 |
+
self.model = model.to(self.device)
|
| 25 |
+
self.dataloaders = dataloaders # self.dataloaders['train'] or ['valid'] or ['test']
|
| 26 |
+
self.PIT_SISNR_mag_loss, self.PIT_SISNR_time_loss, self.PIT_SISNRi_loss, self.PIT_SDRi_loss = criterions
|
| 27 |
+
self.main_optimizer = optimizers[0]
|
| 28 |
+
self.main_scheduler, self.warmup_scheduler = schedulers
|
| 29 |
+
|
| 30 |
+
self.pretrain_weights_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log", "pretrain_weights")
|
| 31 |
+
os.makedirs(self.pretrain_weights_path, exist_ok=True)
|
| 32 |
+
self.scratch_weights_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log", "scratch_weights")
|
| 33 |
+
os.makedirs(self.scratch_weights_path, exist_ok=True)
|
| 34 |
+
|
| 35 |
+
self.checkpoint_path = self.pretrain_weights_path if any(file.endswith(('.pt', '.pt', '.pkl')) for file in os.listdir(self.pretrain_weights_path)) else self.scratch_weights_path
|
| 36 |
+
self.start_epoch = util_engine.load_last_checkpoint_n_get_epoch(self.checkpoint_path, self.model, self.main_optimizer, location=self.device)
|
| 37 |
+
|
| 38 |
+
# Logging
|
| 39 |
+
util_engine.model_params_mac_summary(
|
| 40 |
+
model=self.model,
|
| 41 |
+
input=torch.randn(1, self.config['check_computations']['dummy_len']).to(self.device),
|
| 42 |
+
dummy_input=torch.rand(1, self.config['check_computations']['dummy_len']).to(self.device),
|
| 43 |
+
metrics=['ptflops', 'thop', 'torchinfo']
|
| 44 |
+
# metrics=['ptflops']
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
logger.info(f"Clip gradient by 2-norm {self.config['engine']['clip_norm']}")
|
| 48 |
+
|
| 49 |
+
@logger_wraps()
|
| 50 |
+
def _train(self, dataloader, epoch):
|
| 51 |
+
self.model.train()
|
| 52 |
+
tot_loss_freq = [0 for _ in range(self.model.num_stages)]
|
| 53 |
+
tot_loss_time, num_batch = 0, 0
|
| 54 |
+
pbar = tqdm(total=len(dataloader), unit='batches', bar_format='{l_bar}{bar:25}{r_bar}{bar:-10b}', colour="YELLOW", dynamic_ncols=True)
|
| 55 |
+
for input_sizes, mixture, src, _ in dataloader:
|
| 56 |
+
nnet_input = mixture
|
| 57 |
+
nnet_input = functions.apply_cmvn(nnet_input) if self.config['engine']['mvn'] else nnet_input
|
| 58 |
+
num_batch += 1
|
| 59 |
+
pbar.update(1)
|
| 60 |
+
# Scheduler learning rate for warm-up (Iteration-based update for transformers)
|
| 61 |
+
if epoch == 1: self.warmup_scheduler.step()
|
| 62 |
+
nnet_input = nnet_input.to(self.device)
|
| 63 |
+
self.main_optimizer.zero_grad()
|
| 64 |
+
estim_src, estim_src_bn = torch.nn.parallel.data_parallel(self.model, nnet_input, device_ids=self.gpuid)
|
| 65 |
+
cur_loss_s_bn = 0
|
| 66 |
+
cur_loss_s_bn = []
|
| 67 |
+
for idx, estim_src_value in enumerate(estim_src_bn):
|
| 68 |
+
cur_loss_s_bn.append(self.PIT_SISNR_mag_loss(estims=estim_src_value, idx=idx, input_sizes=input_sizes, target_attr=src))
|
| 69 |
+
tot_loss_freq[idx] += cur_loss_s_bn[idx].item() / (self.config['model']['num_spks'])
|
| 70 |
+
cur_loss_s = self.PIT_SISNR_time_loss(estims=estim_src, input_sizes=input_sizes, target_attr=src)
|
| 71 |
+
tot_loss_time += cur_loss_s.item() / self.config['model']['num_spks']
|
| 72 |
+
alpha = 0.4 * 0.8**(1+(epoch-101)//5) if epoch > 100 else 0.4
|
| 73 |
+
cur_loss = (1-alpha) * cur_loss_s + alpha * sum(cur_loss_s_bn) / len(cur_loss_s_bn)
|
| 74 |
+
cur_loss = cur_loss / self.config['model']['num_spks']
|
| 75 |
+
cur_loss.backward()
|
| 76 |
+
if self.config['engine']['clip_norm']: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config['engine']['clip_norm'])
|
| 77 |
+
self.main_optimizer.step()
|
| 78 |
+
dict_loss = {"T_Loss": tot_loss_time / num_batch}
|
| 79 |
+
dict_loss.update({'F_Loss_' + str(idx): loss / num_batch for idx, loss in enumerate(tot_loss_freq)})
|
| 80 |
+
pbar.set_postfix(dict_loss)
|
| 81 |
+
pbar.close()
|
| 82 |
+
tot_loss_freq = sum(tot_loss_freq) / len(tot_loss_freq)
|
| 83 |
+
return tot_loss_time / num_batch, tot_loss_freq / num_batch, num_batch
|
| 84 |
+
|
| 85 |
+
@logger_wraps()
|
| 86 |
+
def _validate(self, dataloader):
|
| 87 |
+
self.model.eval()
|
| 88 |
+
tot_loss_freq = [0 for _ in range(self.model.num_stages)]
|
| 89 |
+
tot_loss_time, num_batch = 0, 0
|
| 90 |
+
pbar = tqdm(total=len(dataloader), unit='batches', bar_format='{l_bar}{bar:5}{r_bar}{bar:-10b}', colour="RED", dynamic_ncols=True)
|
| 91 |
+
with torch.inference_mode():
|
| 92 |
+
for input_sizes, mixture, src, _ in dataloader:
|
| 93 |
+
nnet_input = mixture
|
| 94 |
+
nnet_input = functions.apply_cmvn(nnet_input) if self.config['engine']['mvn'] else nnet_input
|
| 95 |
+
nnet_input = nnet_input.to(self.device)
|
| 96 |
+
num_batch += 1
|
| 97 |
+
pbar.update(1)
|
| 98 |
+
estim_src, estim_src_bn = torch.nn.parallel.data_parallel(self.model, nnet_input, device_ids=self.gpuid)
|
| 99 |
+
cur_loss_s_bn = []
|
| 100 |
+
for idx, estim_src_value in enumerate(estim_src_bn):
|
| 101 |
+
cur_loss_s_bn.append(self.PIT_SISNR_mag_loss(estims=estim_src_value, idx=idx, input_sizes=input_sizes, target_attr=src))
|
| 102 |
+
tot_loss_freq[idx] += cur_loss_s_bn[idx].item() / (self.config['model']['num_spks'])
|
| 103 |
+
cur_loss_s_SDR = self.PIT_SISNR_time_loss(estims=estim_src, input_sizes=input_sizes, target_attr=src)
|
| 104 |
+
tot_loss_time += cur_loss_s_SDR.item() / self.config['model']['num_spks']
|
| 105 |
+
dict_loss = {"T_Loss":tot_loss_time / num_batch}
|
| 106 |
+
dict_loss.update({'F_Loss_' + str(idx): loss / num_batch for idx, loss in enumerate(tot_loss_freq)})
|
| 107 |
+
pbar.set_postfix(dict_loss)
|
| 108 |
+
pbar.close()
|
| 109 |
+
tot_loss_freq = sum(tot_loss_freq) / len(tot_loss_freq)
|
| 110 |
+
return tot_loss_time / num_batch, tot_loss_freq / num_batch, num_batch
|
| 111 |
+
|
| 112 |
+
@logger_wraps()
|
| 113 |
+
def _test(self, dataloader, wav_dir=None):
|
| 114 |
+
self.model.eval()
|
| 115 |
+
total_loss_SISNRi, total_loss_SDRi, num_batch = 0, 0, 0
|
| 116 |
+
pbar = tqdm(total=len(dataloader), unit='batches', bar_format='{l_bar}{bar:5}{r_bar}{bar:-10b}', colour="grey", dynamic_ncols=True)
|
| 117 |
+
with torch.inference_mode():
|
| 118 |
+
csv_file_name_sisnr = os.path.join(os.path.dirname(__file__),'test_SISNRi_value.csv')
|
| 119 |
+
csv_file_name_sdr = os.path.join(os.path.dirname(__file__),'test_SDRi_value.csv')
|
| 120 |
+
with open(csv_file_name_sisnr, 'w', newline='') as csvfile_sisnr, open(csv_file_name_sdr, 'w', newline='') as csvfile_sdr:
|
| 121 |
+
idx = 0
|
| 122 |
+
writer_sisnr = csv.writer(csvfile_sisnr, quotechar='|', quoting=csv.QUOTE_MINIMAL)
|
| 123 |
+
writer_sdr = csv.writer(csvfile_sdr, quotechar='|', quoting=csv.QUOTE_MINIMAL)
|
| 124 |
+
for input_sizes, mixture, src, key in dataloader:
|
| 125 |
+
if len(key) > 1:
|
| 126 |
+
raise("batch size is not one!!")
|
| 127 |
+
nnet_input = mixture.to(self.device)
|
| 128 |
+
num_batch += 1
|
| 129 |
+
pbar.update(1)
|
| 130 |
+
estim_src, _ = torch.nn.parallel.data_parallel(self.model, nnet_input, device_ids=self.gpuid)
|
| 131 |
+
cur_loss_SISNRi, cur_loss_SISNRi_src = self.PIT_SISNRi_loss(estims=estim_src, mixture=mixture, input_sizes=input_sizes, target_attr=src, eps=1.0e-15)
|
| 132 |
+
total_loss_SISNRi += cur_loss_SISNRi.item() / self.config['model']['num_spks']
|
| 133 |
+
cur_loss_SDRi, cur_loss_SDRi_src = self.PIT_SDRi_loss(estims=estim_src, mixture=mixture, input_sizes=input_sizes, target_attr=src)
|
| 134 |
+
total_loss_SDRi += cur_loss_SDRi.item() / self.config['model']['num_spks']
|
| 135 |
+
writer_sisnr.writerow([key[0][:-4]] + [cur_loss_SISNRi_src[i].item() for i in range(self.config['model']['num_spks'])])
|
| 136 |
+
writer_sdr.writerow([key[0][:-4]] + [cur_loss_SDRi_src[i].item() for i in range(self.config['model']['num_spks'])])
|
| 137 |
+
if self.engine_mode == "test_save":
|
| 138 |
+
if wav_dir == None: wav_dir = os.path.join(os.path.dirname(__file__),"wav_out")
|
| 139 |
+
if wav_dir and not os.path.exists(wav_dir): os.makedirs(wav_dir)
|
| 140 |
+
mixture = torch.squeeze(mixture).cpu().data.numpy()
|
| 141 |
+
sf.write(os.path.join(wav_dir,key[0][:-4]+str(idx)+'_mixture.wav'), 0.5*mixture/max(abs(mixture)), 8000)
|
| 142 |
+
for i in range(self.config['model']['num_spks']):
|
| 143 |
+
src = torch.squeeze(estim_src[i]).cpu().data.numpy()
|
| 144 |
+
sf.write(os.path.join(wav_dir,key[0][:-4]+str(idx)+'_out_'+str(i)+'.wav'), 0.5*src/max(abs(src)), 8000)
|
| 145 |
+
idx += 1
|
| 146 |
+
dict_loss = {"SiSNRi": total_loss_SISNRi/num_batch, "SDRi": total_loss_SDRi/num_batch}
|
| 147 |
+
pbar.set_postfix(dict_loss)
|
| 148 |
+
pbar.close()
|
| 149 |
+
return total_loss_SISNRi/num_batch, total_loss_SDRi/num_batch, num_batch
|
| 150 |
+
|
| 151 |
+
@logger_wraps()
|
| 152 |
+
def _inference_sample(self, sample):
|
| 153 |
+
self.model.eval()
|
| 154 |
+
self.fs = self.config["dataset"]["sampling_rate"]
|
| 155 |
+
mixture, _ = librosa.load(sample,sr=self.fs)
|
| 156 |
+
mixture = torch.tensor(mixture, dtype=torch.float32)[None]
|
| 157 |
+
self.stride = self.config["model"]["module_audio_enc"]["stride"]
|
| 158 |
+
remains = mixture.shape[-1] % self.stride
|
| 159 |
+
if remains != 0:
|
| 160 |
+
padding = self.stride - remains
|
| 161 |
+
mixture_padded = torch.nn.functional.pad(mixture, (0, padding), "constant", 0)
|
| 162 |
+
else:
|
| 163 |
+
mixture_padded = mixture
|
| 164 |
+
|
| 165 |
+
with torch.inference_mode():
|
| 166 |
+
nnet_input = mixture_padded.to(self.device)
|
| 167 |
+
estim_src, _ = torch.nn.parallel.data_parallel(self.model, nnet_input, device_ids=self.gpuid)
|
| 168 |
+
mixture = torch.squeeze(mixture).cpu().numpy()
|
| 169 |
+
sf.write(sample[:-4]+'_in.wav', 0.9*mixture/max(abs(mixture)), self.fs)
|
| 170 |
+
for i in range(self.config['model']['num_spks']):
|
| 171 |
+
src = torch.squeeze(estim_src[i][...,:mixture.shape[-1]]).cpu().data.numpy()
|
| 172 |
+
sf.write(sample[:-4]+'_out_'+str(i)+'.wav', 0.9*src/max(abs(src)), self.fs)
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
@logger_wraps()
def run(self):
    # Top-level entry point: run a test pass when engine_mode contains
    # "test"; otherwise run the train/validate loop with LR scheduling,
    # best-checkpoint saving, and TensorBoard logging.
    with torch.cuda.device(self.device):
        writer_src = SummaryWriter(os.path.join(os.path.dirname(os.path.abspath(__file__)), "log/tensorboard"))
        if "test" in self.engine_mode:
            on_test_start = time.time()
            test_loss_src_time_1, test_loss_src_time_2, test_num_batch = self._test(self.dataloaders['test'], self.out_wav_dir)
            on_test_end = time.time()
            logger.info(f"[TEST] Loss(time/mini-batch) \n - Epoch {self.start_epoch:2d}: SISNRi = {test_loss_src_time_1:.4f} dB | SDRi = {test_loss_src_time_2:.4f} dB | Speed = ({on_test_end - on_test_start:.2f}s/{test_num_batch:d})")
            logger.info(f"Testing done!")
        else:
            # Measure the initial validation loss when resuming from a checkpoint.
            start_time = time.time()
            if self.start_epoch > 1:
                init_loss_time, init_loss_freq, valid_num_batch = self._validate(self.dataloaders['valid'])
            else:
                init_loss_time, init_loss_freq = 0, 0
            end_time = time.time()
            logger.info(f"[INIT] Loss(time/mini-batch) \n - Epoch {self.start_epoch:2d}: Loss_t = {init_loss_time:.4f} dB | Loss_f = {init_loss_freq:.4f} dB | Speed = ({end_time-start_time:.2f}s)")
            # FIX: track the best validation loss ACROSS epochs. The original
            # re-assigned `valid_loss_best = init_loss_time` inside the loop,
            # so the updated value returned by save_checkpoint_per_best was
            # discarded and every epoch compared against the initial loss.
            valid_loss_best = init_loss_time
            for epoch in range(self.start_epoch, self.config['engine']['max_epoch']):
                train_start_time = time.time()
                train_loss_src_time, train_loss_src_freq, train_num_batch = self._train(self.dataloaders['train'], epoch)
                train_end_time = time.time()
                valid_start_time = time.time()
                valid_loss_src_time, valid_loss_src_freq, valid_num_batch = self._validate(self.dataloaders['valid'])
                valid_end_time = time.time()
                # Plateau-style LR scheduling on the validation time-domain loss.
                if epoch > self.config['engine']['start_scheduling']: self.main_scheduler.step(valid_loss_src_time)
                logger.info(f"[TRAIN] Loss(time/mini-batch) \n - Epoch {epoch:2d}: Loss_t = {train_loss_src_time:.4f} dB | Loss_f = {train_loss_src_freq:.4f} dB | Speed = ({train_end_time - train_start_time:.2f}s/{train_num_batch:d})")
                logger.info(f"[VALID] Loss(time/mini-batch) \n - Epoch {epoch:2d}: Loss_t = {valid_loss_src_time:.4f} dB | Loss_f = {valid_loss_src_freq:.4f} dB | Speed = ({valid_end_time - valid_start_time:.2f}s/{valid_num_batch:d})")
                if epoch in self.config['engine']['test_epochs']:
                    on_test_start = time.time()
                    test_loss_src_time_1, test_loss_src_time_2, test_num_batch = self._test(self.dataloaders['test'])
                    on_test_end = time.time()
                    logger.info(f"[TEST] Loss(time/mini-batch) \n - Epoch {epoch:2d}: SISNRi = {test_loss_src_time_1:.4f} dB | SDRi = {test_loss_src_time_2:.4f} dB | Speed = ({on_test_end - on_test_start:.2f}s/{test_num_batch:d})")
                valid_loss_best = util_engine.save_checkpoint_per_best(valid_loss_best, valid_loss_src_time, train_loss_src_time, epoch, self.model, self.main_optimizer, self.checkpoint_path, self.wandb_run)
                # Logging to monitoring tools (Tensorboard && Wandb)
                writer_src.add_scalars("Metrics", {
                    'Loss_train_time': train_loss_src_time,
                    'Loss_valid_time': valid_loss_src_time}, epoch)
                # FIX: a single scalar must be logged with add_scalar();
                # add_scalars() expects a dict of tag->value and raises on a
                # raw float like param_groups[0]['lr'].
                writer_src.add_scalar("Learning Rate", self.main_optimizer.param_groups[0]['lr'], epoch)
                writer_src.flush()
            logger.info(f"Training for {self.config['engine']['max_epoch']} epoches done!")
|
models/SepReformer/SepReformer_Base_WSJ0/log/scratch_weights/epoch.0180.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:14569febefb19900026a350c7b31ca6a927ce4bac7fa83269902d8c6437f0d11
|
| 3 |
+
size 134
|
models/SepReformer/SepReformer_Base_WSJ0/main.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
from loguru import logger
|
| 4 |
+
from .dataset import get_dataloaders
|
| 5 |
+
from .model import Model
|
| 6 |
+
from .engine import Engine
|
| 7 |
+
from utils import util_system, util_implement
|
| 8 |
+
from utils.decorators import *
|
| 9 |
+
|
| 10 |
+
# Setup logger
|
| 11 |
+
log_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log/system_log.log")
|
| 12 |
+
logger.add(log_file_path, level="DEBUG", mode="w")
|
| 13 |
+
|
| 14 |
+
@logger_wraps()
def main(args):
    """Build dataloaders, model, and engine from configs.yaml, then run."""

    # --- Configuration -----------------------------------------------------
    # Load configs.yaml located next to this file.
    config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "configs.yaml")
    parsed_yaml = util_system.parse_yaml(config_path)
    config = parsed_yaml["config"]  # wandb login success or fail

    # --- Data: train / valid / test / etc... -------------------------------
    dataloaders = get_dataloaders(args, config["dataset"], config["dataloader"])

    # --- Model --------------------------------------------------------------
    model = Model(**config["model"])

    # --- Devices: parse the comma-separated GPU id list ---------------------
    gpuid = tuple(int(gid) for gid in config["engine"]["gpuid"].split(','))
    device = torch.device(f'cuda:{gpuid[0]}')

    # --- Criterion / optimizer / scheduler factories ------------------------
    criterions = util_implement.CriterionFactory(config["criterion"], device).get_criterions()
    optimizers = util_implement.OptimizerFactory(config["optimizer"], model.parameters()).get_optimizers()
    schedulers = util_implement.SchedulerFactory(config["scheduler"], optimizers).get_schedulers()

    # --- Engine: single-sample inference or the full run --------------------
    engine = Engine(args, config, model, dataloaders, criterions, optimizers, schedulers, gpuid, device)
    if args.engine_mode == 'infer_sample':
        engine._inference_sample(args.sample_file)
    else:
        engine.run()
|
models/SepReformer/SepReformer_Base_WSJ0/model.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
sys.path.append('../')
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import warnings
|
| 6 |
+
warnings.filterwarnings('ignore')
|
| 7 |
+
|
| 8 |
+
from utils.decorators import *
|
| 9 |
+
from .modules.module import *
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@logger_wraps()
class Model(torch.nn.Module):
    """
    SepReformer separation model.

    Pipeline: audio encoder -> feature projector -> separator ->
    output layer -> audio decoder. Each separator stage also feeds an
    auxiliary output head (mask + decoder) for multi-stage supervision.
    """
    def __init__(self,
                 num_stages: int,
                 num_spks: int,
                 module_audio_enc: dict,
                 module_feature_projector: dict,
                 module_separator: dict,
                 module_output_layer: dict,
                 module_audio_dec: dict):
        super().__init__()
        self.num_stages = num_stages
        self.num_spks = num_spks
        self.audio_encoder = AudioEncoder(**module_audio_enc)
        self.feature_projector = FeatureProjector(**module_feature_projector)
        self.separator = Separator(**module_separator)
        self.out_layer = OutputLayer(**module_output_layer)
        self.audio_decoder = AudioDecoder(**module_audio_dec)

        # Aux_loss: one masking output layer + one decoder per separator stage.
        self.out_layer_bn = torch.nn.ModuleList([])
        self.decoder_bn = torch.nn.ModuleList([])
        for _ in range(self.num_stages):
            self.out_layer_bn.append(OutputLayer(**module_output_layer, masking=True))
            self.decoder_bn.append(AudioDecoder(**module_audio_dec))

    def forward(self, x):
        """Separate mixture `x` into per-speaker audio plus per-stage aux audio.

        Returns (audio, audio_aux): a list of num_spks waveforms and, per
        stage, a list of num_spks auxiliary waveforms trimmed to len(x).
        """
        encoder_output = self.audio_encoder(x)
        projected_feature = self.feature_projector(encoder_output)
        last_stage_output, each_stage_outputs = self.separator(projected_feature)
        out_layer_output = self.out_layer(last_stage_output, encoder_output)
        each_spk_output = [out_layer_output[idx] for idx in range(self.num_spks)]
        audio = [self.audio_decoder(each_spk_output[idx]) for idx in range(self.num_spks)]

        # Aux_loss: bring each intermediate stage up to the encoder resolution,
        # mask the encoder output with it, decode, and trim to the input length.
        audio_aux = []
        for idx, each_stage_output in enumerate(each_stage_outputs):
            # FIX: interpolate() replaces the deprecated F.upsample alias
            # (same operation; default nearest-neighbor mode).
            each_stage_output = self.out_layer_bn[idx](torch.nn.functional.interpolate(each_stage_output, encoder_output.shape[-1]), encoder_output)
            out_aux = [each_stage_output[jdx] for jdx in range(self.num_spks)]
            audio_aux.append([self.decoder_bn[idx](out_aux[jdx])[...,:x.shape[-1]] for jdx in range(self.num_spks)])

        return audio, audio_aux
|
models/SepReformer/SepReformer_Base_WSJ0/modules/module.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
sys.path.append('../')
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import warnings
|
| 6 |
+
warnings.filterwarnings('ignore')
|
| 7 |
+
|
| 8 |
+
from utils.decorators import *
|
| 9 |
+
from .network import *
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class AudioEncoder(torch.nn.Module):
    """Convolutional front-end: strided 1-D convolution followed by GELU.

    Accepts a raw waveform of shape [T] or [B, T] and inserts the channel
    axis itself before convolving.
    """
    def __init__(self, in_channels: int, out_channels: int, kernel_size: int, stride: int, groups: int, bias: bool):
        super().__init__()
        self.conv1d = torch.nn.Conv1d(
            in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, groups=groups, bias=bias)
        self.gelu = torch.nn.GELU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Insert the channel axis: [T] -> [1, T], or [B, T] -> [B, 1, T].
        channel_dim = 0 if x.dim() == 1 else 1
        features = self.conv1d(x.unsqueeze(channel_dim))
        return self.gelu(features)
|
| 24 |
+
|
| 25 |
+
class FeatureProjector(torch.nn.Module):
    """Global layer norm (GroupNorm with one group) followed by a 1x1-style
    Conv1d projection of the encoder features."""
    def __init__(self, num_channels: int, in_channels: int, out_channels: int, kernel_size: int, bias: bool):
        super().__init__()
        self.norm = torch.nn.GroupNorm(num_groups=1, num_channels=num_channels, eps=1e-8)
        self.conv1d = torch.nn.Conv1d(
            in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, bias=bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize, then project channel dimension.
        return self.conv1d(self.norm(x))
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class Separator(torch.nn.Module):
    # U-Net style separator: `num_stages` contracting stages halve the time
    # resolution, a bottleneck splits features per speaker, and `num_stages`
    # expanding stages upsample and fuse the speaker-split skip connections.
    def __init__(self, num_stages: int, relative_positional_encoding: dict, enc_stage: dict, spk_split_stage: dict, simple_fusion:dict, dec_stage: dict):
        super().__init__()

        # NOTE: the building-block classes are declared inside __init__, so
        # they stay local to Separator; they capture nothing from this scope.
        class RelativePositionalEncoding(torch.nn.Module):
            # Learned relative-position embeddings (keys, and optionally
            # values) indexed by clamped pairwise time distances.
            def __init__(self, in_channels: int, num_heads: int, maxlen: int, embed_v=False):
                super().__init__()
                self.in_channels = in_channels
                self.num_heads = num_heads
                self.embedding_dim = self.in_channels // self.num_heads  # per-head dim
                self.maxlen = maxlen
                self.pe_k = torch.nn.Embedding(num_embeddings=2*maxlen, embedding_dim=self.embedding_dim)
                self.pe_v = torch.nn.Embedding(num_embeddings=2*maxlen, embedding_dim=self.embedding_dim) if embed_v else None

            def forward(self, pos_seq: torch.Tensor):
                # Clamp distances to [-maxlen, maxlen-1] and shift into the
                # embedding index range [0, 2*maxlen). NOTE: both operations
                # are in-place and mutate the caller's pos_seq tensor.
                pos_seq.clamp_(-self.maxlen, self.maxlen - 1)
                pos_seq += self.maxlen
                pe_k_output = self.pe_k(pos_seq)
                pe_v_output = self.pe_v(pos_seq) if self.pe_v is not None else None
                return pe_k_output, pe_v_output

        class SepEncStage(torch.nn.Module):
            # One contracting stage: two (global, local) block pairs, then an
            # optional stride-2 depthwise down-convolution that halves T.
            def __init__(self, global_blocks: dict, local_blocks: dict, down_conv_layer: dict, down_conv=True):
                super().__init__()

                class DownConvLayer(torch.nn.Module):
                    def __init__(self, in_channels: int, samp_kernel_size: int):
                        """Construct an EncoderLayer object."""
                        super().__init__()
                        # Depthwise conv with stride 2; padding keeps T/2 length.
                        self.down_conv = torch.nn.Conv1d(
                            in_channels=in_channels, out_channels=in_channels, kernel_size=samp_kernel_size, stride=2, padding=(samp_kernel_size-1)//2, groups=in_channels)
                        self.BN = torch.nn.BatchNorm1d(num_features=in_channels)
                        self.gelu = torch.nn.GELU()

                    def forward(self, x: torch.Tensor):
                        # Input arrives channel-last here; conv/BN need (B, C, T).
                        x = x.permute([0, 2, 1])
                        x = self.down_conv(x)
                        x = self.BN(x)
                        x = self.gelu(x)
                        x = x.permute([0, 2, 1])
                        return x

                self.g_block_1 = GlobalBlock(**global_blocks)
                self.l_block_1 = LocalBlock(**local_blocks)

                self.g_block_2 = GlobalBlock(**global_blocks)
                self.l_block_2 = LocalBlock(**local_blocks)

                self.downconv = DownConvLayer(**down_conv_layer) if down_conv == True else None

            def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
                '''
                x: [B, N, T]
                '''
                # NOTE(review): the permutes below alternate between (B, N, T)
                # and channel-last layouts around the global/local blocks —
                # verify against GlobalBlock/LocalBlock layout conventions.
                x = self.g_block_1(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_1(x)
                x = x.permute(0, 2, 1).contiguous()

                x = self.g_block_2(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_2(x)
                x = x.permute(0, 2, 1).contiguous()

                # Skip connection is taken BEFORE downsampling (full resolution).
                skip = x
                if self.downconv:
                    x = x.permute(0, 2, 1).contiguous()
                    x = self.downconv(x)
                    x = x.permute(0, 2, 1).contiguous()
                # [BK, S, N]
                return x, skip

        class SpkSplitStage(torch.nn.Module):
            # Expands features to num_spks copies via a gated 1x1 bottleneck,
            # then folds the speaker axis into the batch dimension.
            def __init__(self, in_channels: int, num_spks: int):
                super().__init__()
                self.linear = torch.nn.Sequential(
                    torch.nn.Conv1d(in_channels, 4*in_channels*num_spks, kernel_size=1),
                    torch.nn.GLU(dim=-2),
                    torch.nn.Conv1d(2*in_channels*num_spks, in_channels*num_spks, kernel_size=1))
                self.norm = torch.nn.GroupNorm(1, in_channels, eps=1e-8)
                self.num_spks = num_spks

            def forward(self, x: torch.Tensor):
                x = self.linear(x)
                B, _, T = x.shape
                # [B, num_spks*C, T] -> [B*num_spks, C, T]
                x = x.view(B*self.num_spks,-1, T).contiguous()
                x = self.norm(x)
                return x

        class SepDecStage(torch.nn.Module):
            # One expanding stage: three (global, local, speaker-attention)
            # triples operating on the speaker-folded batch.
            def __init__(self, num_spks: int, global_blocks: dict, local_blocks: dict, spk_attention: dict):
                super().__init__()

                self.g_block_1 = GlobalBlock(**global_blocks)
                self.l_block_1 = LocalBlock(**local_blocks)
                self.spk_attn_1 = SpkAttention(**spk_attention)

                self.g_block_2 = GlobalBlock(**global_blocks)
                self.l_block_2 = LocalBlock(**local_blocks)
                self.spk_attn_2 = SpkAttention(**spk_attention)

                self.g_block_3 = GlobalBlock(**global_blocks)
                self.l_block_3 = LocalBlock(**local_blocks)
                self.spk_attn_3 = SpkAttention(**spk_attention)

                self.num_spk = num_spks

            def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
                '''
                x: [B, N, T]
                '''
                # [BS, K, H]
                x = self.g_block_1(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_1(x)
                x = x.permute(0, 2, 1).contiguous()
                x = self.spk_attn_1(x, self.num_spk)

                x = self.g_block_2(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_2(x)
                x = x.permute(0, 2, 1).contiguous()
                x = self.spk_attn_2(x, self.num_spk)

                x = self.g_block_3(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_3(x)
                x = x.permute(0, 2, 1).contiguous()
                x = self.spk_attn_3(x, self.num_spk)

                skip = x

                return x, skip

        self.num_stages = num_stages
        self.pos_emb = RelativePositionalEncoding(**relative_positional_encoding)

        # Temporal Contracting Part
        self.enc_stages = torch.nn.ModuleList([])
        for _ in range(self.num_stages):
            self.enc_stages.append(SepEncStage(**enc_stage, down_conv=True))

        # Bottleneck keeps the time resolution (no down-conv).
        self.bottleneck_G = SepEncStage(**enc_stage, down_conv=False)
        self.spk_split_block = SpkSplitStage(**spk_split_stage)

        # Temporal Expanding Part
        self.simple_fusion = torch.nn.ModuleList([])
        self.dec_stages = torch.nn.ModuleList([])
        for _ in range(self.num_stages):
            # 1x1 conv merges the upsampled path with the matching skip.
            self.simple_fusion.append(torch.nn.Conv1d(in_channels=simple_fusion['out_channels']*2,out_channels=simple_fusion['out_channels'], kernel_size=1))
            self.dec_stages.append(SepDecStage(**dec_stage))

    def forward(self, input: torch.Tensor):
        '''input: [B, N, L]'''
        # feature projection
        x, _ = self.pad_signal(input)
        len_x = x.shape[-1]
        # Temporal Contracting Part
        # Pairwise relative distances at the bottleneck time resolution
        # (len_x / 2**num_stages); shared by every attention block.
        pos_seq = torch.arange(0, len_x//2**self.num_stages).long().to(x.device)
        pos_seq = pos_seq[:, None] - pos_seq[None, :]
        pos_k, _ = self.pos_emb(pos_seq)
        skip = []
        for idx in range(self.num_stages):
            x, skip_ = self.enc_stages[idx](x, pos_k)
            skip_ = self.spk_split_block(skip_)  # speaker-split the skip too
            skip.append(skip_)
        x, _ = self.bottleneck_G(x, pos_k)
        x = self.spk_split_block(x)  # B, 2F, T

        each_stage_outputs = []
        # Temporal Expanding Part
        for idx in range(self.num_stages):
            # Pre-upsample output of each stage feeds the auxiliary losses.
            each_stage_outputs.append(x)
            idx_en = self.num_stages - (idx + 1)  # matching contracting stage
            # NOTE: F.upsample is a deprecated alias of F.interpolate.
            x = torch.nn.functional.upsample(x, skip[idx_en].shape[-1])
            x = torch.cat([x,skip[idx_en]],dim=1)
            x = self.simple_fusion[idx](x)
            x, _ = self.dec_stages[idx](x, pos_k)

        last_stage_output = x
        return last_stage_output, each_stage_outputs

    def pad_signal(self, input: torch.Tensor):
        # Zero-pad the time axis so its length is divisible by 2**num_stages
        # (each contracting stage halves T). Returns (padded, pad_amount).
        # (B, T) or (B, 1, T)
        # NOTE(review): a 1-D input is unsqueezed to 2-D but then indexed with
        # size(2) below, which would fail; callers appear to pass 3-D only.
        if input.dim() == 1: input = input.unsqueeze(0)
        elif input.dim() not in [2, 3]: raise RuntimeError("Input can only be 2 or 3 dimensional.")
        elif input.dim() == 2: input = input.unsqueeze(1)
        L = 2**self.num_stages
        batch_size = input.size(0)
        ndim = input.size(1)
        nframe = input.size(2)
        padded_len = (nframe//L + 1)*L
        rest = 0 if nframe%L == 0 else padded_len - nframe
        if rest > 0:
            # torch.autograd.Variable is a legacy no-op wrapper, kept as-is.
            pad = torch.autograd.Variable(torch.zeros(batch_size, ndim, rest)).type(input.type()).to(input.device)
            input = torch.cat([input, pad], dim=-1)
        return input, rest
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
class OutputLayer(torch.nn.Module):
    # Projects separator features back to the encoder dimension and, when
    # `masking` is set, applies them as a multiplicative (ReLU-gated) mask
    # on the encoder output. Returns features with a leading speaker axis.
    def __init__(self, in_channels: int, out_channels: int, num_spks: int, masking: bool = False):
        super().__init__()
        # feature expansion back
        self.masking = masking
        self.spe_block = Masking(in_channels, Activation_mask="ReLU", concat_opt=None)
        self.num_spks = num_spks
        # Gated channel expansion: out_channels -> 4x -> GLU halves -> in_channels.
        self.end_conv1x1 = torch.nn.Sequential(
            torch.nn.Linear(out_channels, 4*out_channels),
            torch.nn.GLU(),
            torch.nn.Linear(2*out_channels, in_channels))

    def forward(self, x: torch.Tensor, input: torch.Tensor):
        # x: separator output with speakers folded into the batch,
        #    presumably [B*num_spks, out_channels, T'] — TODO confirm.
        # input: encoder output (mixture features) [B, in_channels, L].
        x = x[...,:input.shape[-1]]   # trim any padding beyond encoder length
        x = x.permute([0, 2, 1])      # channel-last for the Linear layers
        x = self.end_conv1x1(x)
        x = x.permute([0, 2, 1])
        B, N, L = x.shape
        B = B // self.num_spks        # recover the true batch size

        if self.masking:
            # Tile the mixture features once per speaker, matching x's
            # speaker-folded batch layout, then gate them with the mask.
            input = input.expand(self.num_spks, B, N, L).transpose(0,1).contiguous()
            input = input.view(B*self.num_spks, N, L)
            x = self.spe_block(x, input)

        x = x.view(B, self.num_spks, N, L)
        # [spks, B, N, L]
        x = x.transpose(0, 1)
        return x
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
class AudioDecoder(torch.nn.ConvTranspose1d):
    '''
    Decoder of the TasNet
    This module can be seen as the gradient of Conv1d with respect to its input.
    It is also known as a fractionally-strided convolution
    or a deconvolution (although it is not an actual deconvolution operation).
    '''
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, x):
        # x: [B, N, L] (or [N, L]); returns the reconstructed waveform with
        # singleton dimensions squeezed away.
        if x.dim() not in [2, 3]:
            # FIX: instances have no `__name__` attribute, so the original
            # `self.__name__` raised AttributeError instead of the intended
            # RuntimeError; the message also wrongly said "3/4D".
            raise RuntimeError("{} accepts 2/3D tensor as input".format(type(self).__name__))
        x = super().forward(x if x.dim() == 3 else torch.unsqueeze(x, 1))
        # Squeeze the channel axis; keep batch axis unless output is a single frame.
        x = torch.squeeze(x, dim=1) if torch.squeeze(x).dim() == 1 else torch.squeeze(x)
        return x
|
models/SepReformer/SepReformer_Base_WSJ0/modules/network.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import math
|
| 3 |
+
import numpy
|
| 4 |
+
from utils.decorators import *
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class LayerScale(torch.nn.Module):
    """Learnable per-channel residual scaling (LayerScale).

    `dims` selects how many leading broadcast axes precede the channel axis
    (1 -> [C], 2 -> [1, C], 3 -> [1, 1, C]); every channel starts at
    `Layer_scale_init`.
    """
    def __init__(self, dims, input_size, Layer_scale_init=1.0e-5):
        super().__init__()
        if dims == 1:
            shape = (input_size,)
        elif dims == 2:
            shape = (1, input_size)
        elif dims == 3:
            shape = (1, 1, input_size)
        self.layer_scale = torch.nn.Parameter(torch.ones(*shape) * Layer_scale_init, requires_grad=True)

    def forward(self, x):
        # Broadcast-multiply the input by the learned scales.
        return x * self.layer_scale
|
| 19 |
+
|
| 20 |
+
class Masking(torch.nn.Module):
    """Gated masking block: apply an activation gate to `x` (optionally after
    concatenating it with `skip` through a pointwise conv) and multiply the
    gate onto `skip`.

    :param input_dim: channel count of `x` and `skip`
    :param Activation_mask: 'Sigmoid' or 'ReLU' gate nonlinearity
    :param options: may contain 'concat_opt'; truthy enables the concat path
    """
    def __init__(self, input_dim, Activation_mask='Sigmoid', **options):
        super(Masking, self).__init__()

        self.options = options
        # FIX: .get() tolerates a missing 'concat_opt' key; the original
        # subscript raised KeyError when the kwarg was omitted.
        if self.options.get('concat_opt'):
            self.pw_conv = torch.nn.Conv1d(input_dim*2, input_dim, 1, stride=1, padding=0)

        if Activation_mask == 'Sigmoid':
            self.gate_act = torch.nn.Sigmoid()
        elif Activation_mask == 'ReLU':
            self.gate_act = torch.nn.ReLU()

    def forward(self, x, skip):
        # Optional fusion of x and skip before gating.
        if self.options.get('concat_opt'):
            y = torch.cat([x, skip], dim=-2)  # concatenate along channels
            y = self.pw_conv(y)
        else:
            y = x
        # Gate value in [0, inf) or [0, 1] multiplies the skip features.
        y = self.gate_act(y) * skip

        return y
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class GCFN(torch.nn.Module):
    """Gated convolutional feed-forward network.

    Channel-last input is expanded 6x, passed through a depthwise temporal
    convolution, gated (GLU) back down, projected to the input width, and
    added as a LayerScale-weighted residual.
    """
    def __init__(self, in_channels, dropout_rate, Layer_scale_init=1.0e-5):
        super().__init__()
        self.net1 = torch.nn.Sequential(
            torch.nn.LayerNorm(in_channels),
            torch.nn.Linear(in_channels, in_channels*6))
        self.depthwise = torch.nn.Conv1d(in_channels*6, in_channels*6, 3, padding=1, groups=in_channels*6)
        self.net2 = torch.nn.Sequential(
            torch.nn.GLU(),
            torch.nn.Dropout(dropout_rate),
            torch.nn.Linear(in_channels*3, in_channels),
            torch.nn.Dropout(dropout_rate))
        self.Layer_scale = LayerScale(dims=3, input_size=in_channels, Layer_scale_init=Layer_scale_init)

    def forward(self, x):
        hidden = self.net1(x)
        # The depthwise conv needs (B, C, T); the rest of the block is channel-last.
        hidden = self.depthwise(hidden.transpose(1, 2).contiguous())
        hidden = self.net2(hidden.transpose(1, 2).contiguous())
        # Scaled residual connection.
        return x + self.Layer_scale(hidden)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class MultiHeadAttention(torch.nn.Module):
    """
    Multi-Head Attention layer with optional relative positional bias
    and a LayerScale on the output projection (pre-norm, no residual).
    :param int n_head: the number of head s
    :param int n_feat: the number of features
    :param float dropout_rate: dropout rate
    """
    def __init__(self, n_head: int, in_channels: int, dropout_rate: float, Layer_scale_init=1.0e-5):
        super().__init__()
        assert in_channels % n_head == 0
        self.d_k = in_channels // n_head  # We assume d_v always equals d_k
        self.h = n_head
        self.layer_norm = torch.nn.LayerNorm(in_channels)
        self.linear_q = torch.nn.Linear(in_channels, in_channels)
        self.linear_k = torch.nn.Linear(in_channels, in_channels)
        self.linear_v = torch.nn.Linear(in_channels, in_channels)
        self.linear_out = torch.nn.Linear(in_channels, in_channels)
        self.attn = None  # last attention map, kept for inspection/debugging
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.Layer_scale = LayerScale(dims=3, input_size=in_channels, Layer_scale_init=Layer_scale_init)

    def forward(self, x, pos_k, mask):
        """
        Compute 'Scaled Dot Product Attention'.
        :param torch.Tensor x: input (batch, time1, d_model); layer-normed here
        :param torch.Tensor pos_k: relative position key embeddings, or None;
            the matmul/view below implies shape (time1, time2, d_k) — TODO confirm
        :param torch.Tensor mask: (batch, time1, time2)
        :param torch.nn.Dropout dropout:
        :return torch.Tensor: attentined and transformed `value` (batch, time1, d_model)
            weighted by the query dot key attention (batch, head, time1, time2)
        """
        n_batch = x.size(0)
        x = self.layer_norm(x)
        q = self.linear_q(x).view(n_batch, -1, self.h, self.d_k) #(b, t, d)
        k = self.linear_k(x).view(n_batch, -1, self.h, self.d_k) #(b, t, d)
        v = self.linear_v(x).view(n_batch, -1, self.h, self.d_k)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)  # (batch, head, time2, d_k)
        v = v.transpose(1, 2)  # (batch, head, time2, d_k)
        # Content term: q·kᵀ per head.
        A = torch.matmul(q, k.transpose(-2, -1))
        # Fold batch and heads together so the positional term can be computed
        # with one batched matmul against pos_k: (time1, batch*head, d_k).
        reshape_q = q.contiguous().view(n_batch * self.h, -1, self.d_k).transpose(0,1)
        if pos_k is not None:
            # Positional term q·pos_kᵀ, reshaped back to (batch, head, t1, t2).
            B = torch.matmul(reshape_q, pos_k.transpose(-2, -1))
            B = B.transpose(0, 1).view(n_batch, self.h, pos_k.size(0), pos_k.size(1))
            scores = (A + B) / math.sqrt(self.d_k)
        else:
            scores = A / math.sqrt(self.d_k)
        if mask is not None:
            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, time1, time2)
            # Most negative finite value for the score dtype, so masked
            # positions vanish after softmax.
            min_value = float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min)
            scores = scores.masked_fill(mask, min_value)
            self.attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0)  # (batch, head, time1, time2)
        else:
            self.attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)
        p_attn = self.dropout(self.attn)
        x = torch.matmul(p_attn, v)  # (batch, head, time1, d_k)
        x = x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)  # (batch, time1, d_model)
        return self.Layer_scale(self.dropout(self.linear_out(x)))  # (batch, time1, d_model)
|
| 125 |
+
|
| 126 |
+
class EGA(torch.nn.Module):
    """Efficient global attention: run self-attention at a pooled (coarse)
    time resolution fixed by `pos_k`, upsample the result back, and gate it
    onto the input through a learned sigmoid gate."""
    def __init__(self, in_channels: int, num_mha_heads: int, dropout_rate: float):
        super().__init__()
        self.block = torch.nn.ModuleDict({
            'self_attn': MultiHeadAttention(
                n_head=num_mha_heads, in_channels=in_channels, dropout_rate=dropout_rate),
            'linear': torch.nn.Sequential(
                torch.nn.LayerNorm(normalized_shape=in_channels),
                torch.nn.Linear(in_features=in_channels, out_features=in_channels),
                torch.nn.Sigmoid())
        })

    def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
        """
        Compute encoded features.
        :param torch.Tensor x: encoded source features (batch, size, time)
        :param torch.Tensor pos_k: relative position embeddings; its first
            dimension fixes the pooled attention length
        :return torch.Tensor: gated output, channel-last (batch, time, size)
        """
        down_len = pos_k.shape[0]
        # Pool the time axis down to the attention resolution.
        x_down = torch.nn.functional.adaptive_avg_pool1d(input=x, output_size=down_len)
        x = x.permute([0, 2, 1])
        x_down = x_down.permute([0, 2, 1])
        x_down = self.block['self_attn'](x_down, pos_k, None)
        x_down = x_down.permute([0, 2, 1])
        # FIX: interpolate() replaces the deprecated F.upsample alias
        # (identical behavior; default nearest-neighbor mode).
        x_downup = torch.nn.functional.interpolate(input=x_down, size=x.shape[1])
        x_downup = x_downup.permute([0, 2, 1])
        # Sigmoid gate decides, per element, how much global context to mix in.
        x = x + self.block['linear'](x) * x_downup

        return x
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class CLA(torch.nn.Module):
    # Convolutional Local Attention block: GLU-gated pointwise expansion, a
    # depthwise 1-D convolution over time, BatchNorm, and a GELU projection
    # back to the input width, added residually through a learnable LayerScale.
    def __init__(self, in_channels, kernel_size, dropout_rate, Layer_scale_init=1.0e-5):
        super().__init__()
        self.layer_norm = torch.nn.LayerNorm(in_channels)
        self.linear1 = torch.nn.Linear(in_channels, in_channels*2)
        self.GLU = torch.nn.GLU()
        # padding='same' keeps the time length; groups=in_channels => depthwise.
        self.dw_conv_1d = torch.nn.Conv1d(in_channels, in_channels, kernel_size, padding='same', groups=in_channels)
        self.linear2 = torch.nn.Linear(in_channels, 2*in_channels)
        self.BN = torch.nn.BatchNorm1d(2*in_channels)
        self.linear3 = torch.nn.Sequential(
            torch.nn.GELU(),
            torch.nn.Linear(2*in_channels, in_channels),
            torch.nn.Dropout(dropout_rate))
        self.Layer_scale = LayerScale(dims=3, input_size=in_channels, Layer_scale_init=Layer_scale_init)

    def forward(self, x):
        # x: (B, T, F)
        y = self.layer_norm(x)
        y = self.linear1(y)  # (B, T, 2F)
        y = self.GLU(y)  # gate halves channels back to (B, T, F)
        y = y.permute([0, 2, 1]) # B, F, T
        y = self.dw_conv_1d(y)
        y = y.permute(0, 2, 1) # B, T, F  (original comment said 2F; conv keeps F)
        y = self.linear2(y)  # (B, T, 2F)
        y = y.permute(0, 2, 1) # B, 2F, T -- BatchNorm1d normalizes the channel dim
        y = self.BN(y)
        y = y.permute(0, 2, 1) # B, T, 2F
        y = self.linear3(y)  # back to (B, T, F)

        return x + self.Layer_scale(y)
|
| 188 |
+
|
| 189 |
+
class GlobalBlock(torch.nn.Module):
    """Global transformer block: efficient global attention (EGA) followed by
    a gated convolutional feed-forward network (GCFN)."""

    def __init__(self, in_channels: int, num_mha_heads: int, dropout_rate: float):
        super().__init__()
        self.block = torch.nn.ModuleDict({
            'ega': EGA(
                num_mha_heads=num_mha_heads, in_channels=in_channels, dropout_rate=dropout_rate),
            'gcfn': GCFN(in_channels=in_channels, dropout_rate=dropout_rate)
        })

    def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
        """
        Compute encoded features.
        :param torch.Tensor x: encoded source features
        :param torch.Tensor pos_k: relative positional encoding for EGA
        :rtype: torch.Tensor permuted to (batch, size, time)
        """
        attended = self.block['ega'](x, pos_k)
        refined = self.block['gcfn'](attended)
        return refined.permute([0, 2, 1])
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
class LocalBlock(torch.nn.Module):
    """Local block: convolutional local attention (CLA) followed by a gated
    convolutional feed-forward network (GCFN)."""

    def __init__(self, in_channels: int, kernel_size: int, dropout_rate: float):
        super().__init__()
        self.block = torch.nn.ModuleDict({
            'cla': CLA(in_channels, kernel_size, dropout_rate),
            'gcfn': GCFN(in_channels, dropout_rate)
        })

    def forward(self, x: torch.Tensor):
        """Run the input through CLA then GCFN and return the result."""
        return self.block['gcfn'](self.block['cla'](x))
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
class SpkAttention(torch.nn.Module):
    # Speaker-wise attention: attends across the speaker axis at every time
    # step, then refines with a GCFN feed-forward network.
    def __init__(self, in_channels: int, num_mha_heads: int, dropout_rate: float):
        super().__init__()
        self.self_attn = MultiHeadAttention(n_head=num_mha_heads, in_channels=in_channels, dropout_rate=dropout_rate)
        self.feed_forward = GCFN(in_channels=in_channels, dropout_rate=dropout_rate)

    def forward(self, x: torch.Tensor, num_spk: int):
        """
        Attend over the speaker dimension.
        :param torch.Tensor x: features (batch*num_spk, size, time); the batch
            axis is assumed to fold num_spk speaker streams per utterance
            (B must be divisible by num_spk) -- TODO confirm against caller
        :param int num_spk: number of speaker streams folded into the batch
        :rtype: torch.Tensor with the same (batch*num_spk, size, time) shape
        """
        B, F, T = x.shape
        # Unfold speakers out of the batch, then flatten (batch, time) so the
        # attention sequence length becomes num_spk.
        x = x.view(B//num_spk, num_spk, F, T).contiguous()
        x = x.permute([0, 3, 1, 2]).contiguous()
        x = x.view(-1, num_spk, F).contiguous()
        x = x + self.self_attn(x, None, None)  # residual speaker attention
        # Restore the original (batch*num_spk, size, time) layout.
        x = x.view(B//num_spk, T, num_spk, F).contiguous()
        x = x.permute([0, 2, 3, 1]).contiguous()
        x = x.view(B, F, T).contiguous()
        # GCFN operates on (batch, time, size); permute in and back out.
        x = x.permute([0, 2, 1])
        x = self.feed_forward(x)
        x = x.permute([0, 2, 1])

        return x
|
models/SepReformer/SepReformer_Large_DM_WHAM/configs.yaml
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
config:
|
| 2 |
+
dataset:
|
| 3 |
+
max_len : 32000
|
| 4 |
+
sampling_rate: 8000
|
| 5 |
+
scp_dir: "data/scp_ss_8k_wham"
|
| 6 |
+
train:
|
| 7 |
+
mixture: "tr_mix.scp"
|
| 8 |
+
spk1: "tr_s1.scp"
|
| 9 |
+
spk2: "tr_s2.scp"
|
| 10 |
+
noise: "tr_n.scp"
|
| 11 |
+
dynamic_mixing: true
|
| 12 |
+
valid:
|
| 13 |
+
mixture: "cv_mix.scp"
|
| 14 |
+
spk1: "cv_s1.scp"
|
| 15 |
+
spk2: "cv_s2.scp"
|
| 16 |
+
test:
|
| 17 |
+
mixture: "tt_mix.scp"
|
| 18 |
+
spk1: "tt_s1.scp"
|
| 19 |
+
spk2: "tt_s2.scp"
|
| 20 |
+
dataloader:
|
| 21 |
+
batch_size: 2
|
| 22 |
+
pin_memory: false
|
| 23 |
+
num_workers: 12
|
| 24 |
+
drop_last: false
|
| 25 |
+
model:
|
| 26 |
+
num_stages: &var_model_num_stages 4 # R
|
| 27 |
+
num_spks: &var_model_num_spks 2
|
| 28 |
+
module_audio_enc:
|
| 29 |
+
in_channels: 1
|
| 30 |
+
out_channels: &var_model_audio_enc_out_channels 256
|
| 31 |
+
kernel_size: &var_model_audio_enc_kernel_size 16 # L
|
| 32 |
+
stride: &var_model_audio_enc_stride 4 # S
|
| 33 |
+
groups: 1
|
| 34 |
+
bias: false
|
| 35 |
+
module_feature_projector:
|
| 36 |
+
num_channels: *var_model_audio_enc_out_channels
|
| 37 |
+
in_channels: *var_model_audio_enc_out_channels
|
| 38 |
+
out_channels: &feature_projector_out_channels 256 # F
|
| 39 |
+
kernel_size: 1
|
| 40 |
+
bias: false
|
| 41 |
+
module_separator:
|
| 42 |
+
num_stages: *var_model_num_stages
|
| 43 |
+
relative_positional_encoding:
|
| 44 |
+
in_channels: *feature_projector_out_channels
|
| 45 |
+
num_heads: 8
|
| 46 |
+
maxlen: 2000
|
| 47 |
+
embed_v: false
|
| 48 |
+
enc_stage:
|
| 49 |
+
global_blocks:
|
| 50 |
+
in_channels: *feature_projector_out_channels
|
| 51 |
+
num_mha_heads: 8
|
| 52 |
+
dropout_rate: 0.1
|
| 53 |
+
local_blocks:
|
| 54 |
+
in_channels: *feature_projector_out_channels
|
| 55 |
+
kernel_size: 65
|
| 56 |
+
dropout_rate: 0.1
|
| 57 |
+
down_conv_layer:
|
| 58 |
+
in_channels: *feature_projector_out_channels
|
| 59 |
+
samp_kernel_size: &var_model_samp_kernel_size 5
|
| 60 |
+
spk_split_stage:
|
| 61 |
+
in_channels: *feature_projector_out_channels
|
| 62 |
+
num_spks: *var_model_num_spks
|
| 63 |
+
simple_fusion:
|
| 64 |
+
out_channels: *feature_projector_out_channels
|
| 65 |
+
dec_stage:
|
| 66 |
+
num_spks: *var_model_num_spks
|
| 67 |
+
global_blocks:
|
| 68 |
+
in_channels: *feature_projector_out_channels
|
| 69 |
+
num_mha_heads: 8
|
| 70 |
+
dropout_rate: 0.1
|
| 71 |
+
local_blocks:
|
| 72 |
+
in_channels: *feature_projector_out_channels
|
| 73 |
+
kernel_size: 65
|
| 74 |
+
dropout_rate: 0.1
|
| 75 |
+
spk_attention:
|
| 76 |
+
in_channels: *feature_projector_out_channels
|
| 77 |
+
num_mha_heads: 8
|
| 78 |
+
dropout_rate: 0.1
|
| 79 |
+
module_output_layer:
|
| 80 |
+
in_channels: *var_model_audio_enc_out_channels
|
| 81 |
+
out_channels: *feature_projector_out_channels
|
| 82 |
+
num_spks: *var_model_num_spks
|
| 83 |
+
module_audio_dec:
|
| 84 |
+
in_channels: *var_model_audio_enc_out_channels
|
| 85 |
+
out_channels: 1
|
| 86 |
+
kernel_size: *var_model_audio_enc_kernel_size
|
| 87 |
+
stride: *var_model_audio_enc_stride
|
| 88 |
+
bias: false
|
| 89 |
+
criterion: ### Ref: https://pytorch.org/docs/stable/nn.html#loss-functions
|
| 90 |
+
name: ["PIT_SISNR_mag", "PIT_SISNR_time", "PIT_SISNRi", "PIT_SDRi"] ### Choose a torch.nn's loss function class(=attribute) e.g. ["L1Loss", "MSELoss", "CrossEntropyLoss", ...] / You can also build your optimizer :)
|
| 91 |
+
PIT_SISNR_mag:
|
| 92 |
+
frame_length: 512
|
| 93 |
+
frame_shift: 128
|
| 94 |
+
window: 'hann'
|
| 95 |
+
num_stages: *var_model_num_stages
|
| 96 |
+
num_spks: *var_model_num_spks
|
| 97 |
+
scale_inv: true
|
| 98 |
+
mel_opt: false
|
| 99 |
+
PIT_SISNR_time:
|
| 100 |
+
num_spks: *var_model_num_spks
|
| 101 |
+
scale_inv: true
|
| 102 |
+
PIT_SISNRi:
|
| 103 |
+
num_spks: *var_model_num_spks
|
| 104 |
+
scale_inv: true
|
| 105 |
+
PIT_SDRi:
|
| 106 |
+
dump: 0
|
| 107 |
+
optimizer: ### Ref: https://pytorch.org/docs/stable/optim.html#algorithms
|
| 108 |
+
name: ["AdamW"] ### Choose a torch.optim's class(=attribute) e.g. ["Adam", "AdamW", "SGD", ...] / You can also build your optimizer :)
|
| 109 |
+
AdamW:
|
| 110 |
+
lr: 2.0e-4
|
| 111 |
+
weight_decay: 1.0e-2
|
| 112 |
+
scheduler: ### Ref(+ find "How to adjust learning rate"): https://pytorch.org/docs/stable/optim.html#algorithms
|
| 113 |
+
name: ["ReduceLROnPlateau", "WarmupConstantSchedule"] ### Choose a torch.optim.lr_scheduler's class(=attribute) e.g. ["StepLR", "ReduceLROnPlateau", "Custom"] / You can also build your scheduler :)
|
| 114 |
+
ReduceLROnPlateau:
|
| 115 |
+
mode: "min"
|
| 116 |
+
min_lr: 1.0e-10
|
| 117 |
+
factor: 0.8
|
| 118 |
+
patience: 3
|
| 119 |
+
WarmupConstantSchedule:
|
| 120 |
+
warmup_steps: 1000
|
| 121 |
+
check_computations:
|
| 122 |
+
dummy_len: 16000
|
| 123 |
+
engine:
|
| 124 |
+
max_epoch: 200
|
| 125 |
+
gpuid: "1" ### "0"(single-gpu) or "0, 1" (multi-gpu)
|
| 126 |
+
mvn: false
|
| 127 |
+
clip_norm: 5
|
| 128 |
+
start_scheduling: 50
|
| 129 |
+
test_epochs: [50, 80, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 199]
|
models/SepReformer/SepReformer_Large_DM_WHAM/dataset.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import random
|
| 4 |
+
import librosa as audio_lib
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
from utils import util_dataset
|
| 8 |
+
from utils.decorators import *
|
| 9 |
+
from loguru import logger
|
| 10 |
+
from torch.utils.data import Dataset, DataLoader
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@logger_wraps()
def get_dataloaders(args, dataset_config, loader_config):
    """Build one DataLoader per partition.

    Only the "test" partition is built when the engine mode contains "test";
    otherwise "train", "valid" and "test" are all created. Test loaders use
    batch size 1 and no shuffling; dynamic mixing applies to training only.
    """
    partitions = ["test"] if "test" in args.engine_mode else ["train", "valid", "test"]
    dataloaders = {}
    for part in partitions:
        part_cfg = dataset_config[part]
        scp_dir = dataset_config["scp_dir"]
        mix_scp = os.path.join(scp_dir, part_cfg['mixture'])
        src_scps = [os.path.join(scp_dir, part_cfg[k]) for k in part_cfg if k.startswith('spk')]
        noise_scp = os.path.join(scp_dir, part_cfg['noise']) if 'noise' in part_cfg else None
        use_dm = part_cfg["dynamic_mixing"] if part == 'train' else False
        is_test = (part == 'test')
        dataset = MyDataset(
            max_len=dataset_config['max_len'],
            fs=dataset_config['sampling_rate'],
            partition=part,
            wave_scp_srcs=src_scps,
            wave_scp_mix=mix_scp,
            wave_scp_noise=noise_scp,
            dynamic_mixing=use_dm)
        dataloaders[part] = DataLoader(
            dataset=dataset,
            batch_size=1 if is_test else loader_config["batch_size"],
            shuffle=not is_test,  # only train: (partition == 'train') / all: True
            pin_memory=loader_config["pin_memory"],
            num_workers=loader_config["num_workers"],
            drop_last=loader_config["drop_last"],
            collate_fn=_collate)
    return dataloaders
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _collate(egs):
|
| 44 |
+
"""
|
| 45 |
+
Transform utterance index into a minbatch
|
| 46 |
+
|
| 47 |
+
Arguments:
|
| 48 |
+
index: a list type [{},{},{}]
|
| 49 |
+
|
| 50 |
+
Returns:
|
| 51 |
+
input_sizes: a tensor correspond to utterance length
|
| 52 |
+
input_feats: packed sequence to feed networks
|
| 53 |
+
source_attr/target_attr: dictionary contains spectrogram/phase needed in loss computation
|
| 54 |
+
"""
|
| 55 |
+
def __prepare_target_rir(dict_lsit, index):
|
| 56 |
+
return torch.nn.utils.rnn.pad_sequence([torch.tensor(d["src"][index], dtype=torch.float32) for d in dict_lsit], batch_first=True)
|
| 57 |
+
if type(egs) is not list: raise ValueError("Unsupported index type({})".format(type(egs)))
|
| 58 |
+
num_spks = 2 # you need to set this paramater by yourself
|
| 59 |
+
dict_list = sorted([eg for eg in egs], key=lambda x: x['num_sample'], reverse=True)
|
| 60 |
+
mixture = torch.nn.utils.rnn.pad_sequence([torch.tensor(d['mix'], dtype=torch.float32) for d in dict_list], batch_first=True)
|
| 61 |
+
src = [__prepare_target_rir(dict_list, index) for index in range(num_spks)]
|
| 62 |
+
input_sizes = torch.tensor([d['num_sample'] for d in dict_list], dtype=torch.float32)
|
| 63 |
+
key = [d['key'] for d in dict_list]
|
| 64 |
+
return input_sizes, mixture, src, key
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@logger_wraps()
class MyDataset(Dataset):
    # Speech-separation dataset: loads mixture / per-speaker source (and
    # optional noise) waveforms listed in .scp files, optionally synthesizing
    # training mixtures on the fly ("dynamic mixing").
    def __init__(self, max_len, fs, partition, wave_scp_srcs, wave_scp_mix, wave_scp_noise=None, dynamic_mixing=False, speed_list=None):
        # max_len: cap (in samples) on train/valid utterance length
        # fs: target sampling rate passed to librosa.load
        # partition: "train" | "valid" | "test" (test skips random cropping)
        # speed_list: accepted but unused in this implementation
        self.partition = partition
        for wave_scp_src in wave_scp_srcs:
            if not os.path.exists(wave_scp_src): raise FileNotFoundError(f"Could not find file {wave_scp_src}")
        self.max_len = max_len
        self.fs = fs
        # key -> wav-path mappings parsed from the scp files
        self.wave_dict_srcs = [util_dataset.parse_scps(wave_scp_src) for wave_scp_src in wave_scp_srcs]
        self.wave_dict_mix = util_dataset.parse_scps(wave_scp_mix)
        self.wave_dict_noise = util_dataset.parse_scps(wave_scp_noise) if wave_scp_noise else None
        self.wave_keys = list(self.wave_dict_mix.keys())
        logger.info(f"Create MyDataset for {wave_scp_mix} with {len(self.wave_dict_mix)} utterances")
        self.dynamic_mixing = dynamic_mixing

    def __len__(self):
        return len(self.wave_dict_mix)

    def __contains__(self, key):
        return key in self.wave_dict_mix


    def _dynamic_mixing(self, key):
        # Synthesize a fresh 2-speaker mixture: speaker 1 comes from `key`,
        # speaker 2 from a randomly drawn utterance; both get random gains,
        # plus the noise track associated with `key`.
        def __match_length(wav, len_data) :
            # Random crop of `wav` to exactly `len_data` samples.
            # NOTE(review): assumes len(wav) >= len_data -- verify upstream.
            leftover = len(wav) - len_data
            idx = random.randint(0,leftover)
            wav = wav[idx:idx+len_data]
            return wav

        samps_src = []
        # Seed the length list with max_len so the final crop never exceeds it.
        src_len = [self.max_len]

        # dynamic source choice (second speaker drawn at random)
        key_random = random.choice(list(self.wave_dict_srcs[0].keys()))

        # Randomly swap which scp list provides which speaker.
        idx1, idx2 = (0, 1) if random.random() > 0.5 else (1, 0)
        files = [self.wave_dict_srcs[idx1][key], self.wave_dict_srcs[idx2][key_random]]

        # load
        for idx, file in enumerate(files):
            if not os.path.exists(file): raise FileNotFoundError("Input file {} do not exists!".format(file))
            samps_tmp, _ = audio_lib.load(file, sr=self.fs)

            # RMS-normalize every source to the level of the first one.
            if idx == 0: ref_rms = np.sqrt(np.mean(np.square(samps_tmp)))
            curr_rms = np.sqrt(np.mean(np.square(samps_tmp)))

            norm_factor = ref_rms / curr_rms
            samps_tmp *= norm_factor

            # mixing with random gains (uniform in [-5, 5] dB)
            gain = pow(10,-random.uniform(-5,5)/20)
            samps_tmp = np.array(torch.tensor(samps_tmp))
            samps_src.append(gain*samps_tmp)
            src_len.append(len(samps_tmp))

        # matching the audio length
        min_len = min(src_len)

        # add noise source (tied to the original utterance key)
        file_noise = self.wave_dict_noise[key]
        samps_noise, _ = audio_lib.load(file_noise, sr=self.fs)
        curr_rms = np.sqrt(np.mean(np.square(samps_noise)))
        norm_factor = ref_rms / curr_rms
        samps_noise *= norm_factor
        gain_noise = pow(10,-random.uniform(-5,5)/20)
        samps_noise = samps_noise*gain_noise
        src_len.append(len(samps_noise))

        # truncate all tracks to the shortest length (capped at max_len)
        min_len = min(src_len)
        samps_src = [__match_length(s, min_len) for s in samps_src]
        samps_noise = __match_length(samps_noise, min_len)
        samps_mix = sum(samps_src) + samps_noise

        # Keep length divisible by 4 -- presumably the encoder stride; confirm.
        if len(samps_mix)%4 != 0:
            remains = len(samps_mix)%4
            samps_mix = samps_mix[:-remains]
            samps_src = [s[:-remains] for s in samps_src]

        return samps_mix, samps_src

    def _direct_load(self, key):
        # Load the pre-mixed mixture and its reference sources from disk.
        samps_src = []
        files = [wave_dict_src[key] for wave_dict_src in self.wave_dict_srcs]
        for file in files:
            if not os.path.exists(file): raise FileNotFoundError(f"Input file {file} do not exists!")
            samps_tmp, _ = audio_lib.load(file, sr=self.fs)
            samps_src.append(samps_tmp)

        file = self.wave_dict_mix[key]
        if not os.path.exists(file): raise FileNotFoundError(f"Input file {file} do not exists!")
        samps_mix, _ = audio_lib.load(file, sr=self.fs)
        # Truncate samples as needed (length must be divisible by 4)
        if len(samps_mix) % 4 != 0:
            remains = len(samps_mix) % 4
            samps_mix = samps_mix[:-remains]
            samps_src = [s[:-remains] for s in samps_src]

        # Random fixed-length crop for train/valid; test keeps full utterances.
        if self.partition != "test":
            if len(samps_mix) > self.max_len:
                start = random.randint(0,len(samps_mix)-self.max_len)
                samps_mix = samps_mix[start:start+self.max_len]
                samps_src = [s[start:start+self.max_len] for s in samps_src]

        return samps_mix, samps_src

    def __getitem__(self, index):
        # Returns the dict consumed by _collate: length, mixture, sources, key.
        key = self.wave_keys[index]
        if any(key not in self.wave_dict_srcs[i] for i in range(len(self.wave_dict_srcs))) or key not in self.wave_dict_mix: raise KeyError(f"Could not find utterance {key}")
        samps_mix, samps_src = self._dynamic_mixing(key) if self.dynamic_mixing else self._direct_load(key)
        return {"num_sample": samps_mix.shape[0], "mix": samps_mix, "src": samps_src, "key": key}
|
models/SepReformer/SepReformer_Large_DM_WHAM/engine.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import csv
|
| 4 |
+
import time
|
| 5 |
+
import soundfile as sf
|
| 6 |
+
|
| 7 |
+
from loguru import logger
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from utils import util_engine, functions
|
| 10 |
+
from utils.decorators import *
|
| 11 |
+
from torch.utils.tensorboard import SummaryWriter
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@logger_wraps()
|
| 15 |
+
class Engine(object):
|
| 16 |
+
def __init__(self, args, config, model, dataloaders, criterions, optimizers, schedulers, gpuid, device):
|
| 17 |
+
|
| 18 |
+
''' Default setting '''
|
| 19 |
+
self.engine_mode = args.engine_mode
|
| 20 |
+
self.out_wav_dir = args.out_wav_dir
|
| 21 |
+
self.config = config
|
| 22 |
+
self.gpuid = gpuid
|
| 23 |
+
self.device = device
|
| 24 |
+
self.model = model.to(self.device)
|
| 25 |
+
self.dataloaders = dataloaders # self.dataloaders['train'] or ['valid'] or ['test']
|
| 26 |
+
self.PIT_SISNR_mag_loss, self.PIT_SISNR_time_loss, self.PIT_SISNRi_loss, self.PIT_SDRi_loss = criterions
|
| 27 |
+
self.main_optimizer = optimizers[0]
|
| 28 |
+
self.main_scheduler, self.warmup_scheduler = schedulers
|
| 29 |
+
|
| 30 |
+
self.pretrain_weights_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log", "pretrain_weights")
|
| 31 |
+
os.makedirs(self.pretrain_weights_path, exist_ok=True)
|
| 32 |
+
self.scratch_weights_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log", "scratch_weights")
|
| 33 |
+
os.makedirs(self.scratch_weights_path, exist_ok=True)
|
| 34 |
+
|
| 35 |
+
self.checkpoint_path = self.pretrain_weights_path if any(file.endswith(('.pt', '.pt', '.pkl')) for file in os.listdir(self.pretrain_weights_path)) else self.scratch_weights_path
|
| 36 |
+
self.start_epoch = util_engine.load_last_checkpoint_n_get_epoch(self.checkpoint_path, self.model, self.main_optimizer, location=self.device)
|
| 37 |
+
|
| 38 |
+
# Logging
|
| 39 |
+
util_engine.model_params_mac_summary(
|
| 40 |
+
model=self.model,
|
| 41 |
+
input=torch.randn(1, self.config['check_computations']['dummy_len']).to(self.device),
|
| 42 |
+
dummy_input=torch.rand(1, self.config['check_computations']['dummy_len']).to(self.device),
|
| 43 |
+
metrics=['ptflops', 'thop', 'torchinfo']
|
| 44 |
+
# metrics=['ptflops']
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
logger.info(f"Clip gradient by 2-norm {self.config['engine']['clip_norm']}")
|
| 48 |
+
|
| 49 |
+
@logger_wraps()
|
| 50 |
+
def _train(self, dataloader, epoch):
|
| 51 |
+
self.model.train()
|
| 52 |
+
tot_loss_freq = [0 for _ in range(self.model.num_stages)]
|
| 53 |
+
tot_loss_time, num_batch = 0, 0
|
| 54 |
+
pbar = tqdm(total=len(dataloader), unit='batches', bar_format='{l_bar}{bar:25}{r_bar}{bar:-10b}', colour="YELLOW", dynamic_ncols=True)
|
| 55 |
+
for input_sizes, mixture, src, _ in dataloader:
|
| 56 |
+
nnet_input = mixture
|
| 57 |
+
nnet_input = functions.apply_cmvn(nnet_input) if self.config['engine']['mvn'] else nnet_input
|
| 58 |
+
num_batch += 1
|
| 59 |
+
pbar.update(1)
|
| 60 |
+
# Scheduler learning rate for warm-up (Iteration-based update for transformers)
|
| 61 |
+
if epoch == 1: self.warmup_scheduler.step()
|
| 62 |
+
nnet_input = nnet_input.to(self.device)
|
| 63 |
+
self.main_optimizer.zero_grad()
|
| 64 |
+
estim_src, estim_src_bn = torch.nn.parallel.data_parallel(self.model, nnet_input, device_ids=self.gpuid)
|
| 65 |
+
cur_loss_s_bn = 0
|
| 66 |
+
cur_loss_s_bn = []
|
| 67 |
+
for idx, estim_src_value in enumerate(estim_src_bn):
|
| 68 |
+
cur_loss_s_bn.append(self.PIT_SISNR_mag_loss(estims=estim_src_value, idx=idx, input_sizes=input_sizes, target_attr=src))
|
| 69 |
+
tot_loss_freq[idx] += cur_loss_s_bn[idx].item() / (self.config['model']['num_spks'])
|
| 70 |
+
cur_loss_s = self.PIT_SISNR_time_loss(estims=estim_src, input_sizes=input_sizes, target_attr=src)
|
| 71 |
+
tot_loss_time += cur_loss_s.item() / self.config['model']['num_spks']
|
| 72 |
+
alpha = 0.4 * 0.8**(1+(epoch-101)//5) if epoch > 100 else 0.4
|
| 73 |
+
cur_loss = (1-alpha) * cur_loss_s + alpha * sum(cur_loss_s_bn) / len(cur_loss_s_bn)
|
| 74 |
+
cur_loss = cur_loss / self.config['model']['num_spks']
|
| 75 |
+
cur_loss.backward()
|
| 76 |
+
if self.config['engine']['clip_norm']: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config['engine']['clip_norm'])
|
| 77 |
+
self.main_optimizer.step()
|
| 78 |
+
dict_loss = {"T_Loss": tot_loss_time / num_batch}
|
| 79 |
+
dict_loss.update({'F_Loss_' + str(idx): loss / num_batch for idx, loss in enumerate(tot_loss_freq)})
|
| 80 |
+
pbar.set_postfix(dict_loss)
|
| 81 |
+
pbar.close()
|
| 82 |
+
tot_loss_freq = sum(tot_loss_freq) / len(tot_loss_freq)
|
| 83 |
+
return tot_loss_time / num_batch, tot_loss_freq / num_batch, num_batch
|
| 84 |
+
|
| 85 |
+
    @logger_wraps()
    def _validate(self, dataloader):
        # Run one validation pass under inference mode (no gradients).
        # Returns (mean time-domain loss, mean frequency-domain loss averaged
        # over stages, number of batches) -- mirrors _train without the
        # backward/optimizer steps.
        self.model.eval()
        tot_loss_freq = [0 for _ in range(self.model.num_stages)]
        tot_loss_time, num_batch = 0, 0
        pbar = tqdm(total=len(dataloader), unit='batches', bar_format='{l_bar}{bar:5}{r_bar}{bar:-10b}', colour="RED", dynamic_ncols=True)
        with torch.inference_mode():
            for input_sizes, mixture, src, _ in dataloader:
                nnet_input = mixture
                # Optional mean-variance normalization of the input waveform.
                nnet_input = functions.apply_cmvn(nnet_input) if self.config['engine']['mvn'] else nnet_input
                nnet_input = nnet_input.to(self.device)
                num_batch += 1
                pbar.update(1)
                estim_src, estim_src_bn = torch.nn.parallel.data_parallel(self.model, nnet_input, device_ids=self.gpuid)
                # Per-stage frequency-domain PIT losses.
                cur_loss_s_bn = []
                for idx, estim_src_value in enumerate(estim_src_bn):
                    cur_loss_s_bn.append(self.PIT_SISNR_mag_loss(estims=estim_src_value, idx=idx, input_sizes=input_sizes, target_attr=src))
                    tot_loss_freq[idx] += cur_loss_s_bn[idx].item() / (self.config['model']['num_spks'])
                cur_loss_s_SDR = self.PIT_SISNR_time_loss(estims=estim_src, input_sizes=input_sizes, target_attr=src)
                tot_loss_time += cur_loss_s_SDR.item() / self.config['model']['num_spks']
                dict_loss = {"T_Loss":tot_loss_time / num_batch}
                dict_loss.update({'F_Loss_' + str(idx): loss / num_batch for idx, loss in enumerate(tot_loss_freq)})
                pbar.set_postfix(dict_loss)
        pbar.close()
        tot_loss_freq = sum(tot_loss_freq) / len(tot_loss_freq)
        return tot_loss_time / num_batch, tot_loss_freq / num_batch, num_batch
|
| 111 |
+
|
| 112 |
+
@logger_wraps()
|
| 113 |
+
def _test(self, dataloader, wav_dir=None):
|
| 114 |
+
self.model.eval()
|
| 115 |
+
total_loss_SISNRi, total_loss_SDRi, num_batch = 0, 0, 0
|
| 116 |
+
pbar = tqdm(total=len(dataloader), unit='batches', bar_format='{l_bar}{bar:5}{r_bar}{bar:-10b}', colour="grey", dynamic_ncols=True)
|
| 117 |
+
with torch.inference_mode():
|
| 118 |
+
csv_file_name_sisnr = os.path.join(os.path.dirname(__file__),'test_SISNRi_value.csv')
|
| 119 |
+
csv_file_name_sdr = os.path.join(os.path.dirname(__file__),'test_SDRi_value.csv')
|
| 120 |
+
with open(csv_file_name_sisnr, 'w', newline='') as csvfile_sisnr, open(csv_file_name_sdr, 'w', newline='') as csvfile_sdr:
|
| 121 |
+
idx = 0
|
| 122 |
+
writer_sisnr = csv.writer(csvfile_sisnr, quotechar='|', quoting=csv.QUOTE_MINIMAL)
|
| 123 |
+
writer_sdr = csv.writer(csvfile_sdr, quotechar='|', quoting=csv.QUOTE_MINIMAL)
|
| 124 |
+
for input_sizes, mixture, src, key in dataloader:
|
| 125 |
+
if len(key) > 1:
|
| 126 |
+
raise("batch size is not one!!")
|
| 127 |
+
nnet_input = mixture.to(self.device)
|
| 128 |
+
num_batch += 1
|
| 129 |
+
pbar.update(1)
|
| 130 |
+
estim_src, _ = torch.nn.parallel.data_parallel(self.model, nnet_input, device_ids=self.gpuid)
|
| 131 |
+
cur_loss_SISNRi, cur_loss_SISNRi_src = self.PIT_SISNRi_loss(estims=estim_src, mixture=mixture, input_sizes=input_sizes, target_attr=src, eps=1.0e-15)
|
| 132 |
+
total_loss_SISNRi += cur_loss_SISNRi.item() / self.config['model']['num_spks']
|
| 133 |
+
cur_loss_SDRi, cur_loss_SDRi_src = self.PIT_SDRi_loss(estims=estim_src, mixture=mixture, input_sizes=input_sizes, target_attr=src)
|
| 134 |
+
total_loss_SDRi += cur_loss_SDRi.item() / self.config['model']['num_spks']
|
| 135 |
+
writer_sisnr.writerow([key[0][:-4]] + [cur_loss_SISNRi_src[i].item() for i in range(self.config['model']['num_spks'])])
|
| 136 |
+
writer_sdr.writerow([key[0][:-4]] + [cur_loss_SDRi_src[i].item() for i in range(self.config['model']['num_spks'])])
|
| 137 |
+
if self.engine_mode == "test_save":
|
| 138 |
+
if wav_dir == None: wav_dir = os.path.join(os.path.dirname(__file__),"wav_out")
|
| 139 |
+
if wav_dir and not os.path.exists(wav_dir): os.makedirs(wav_dir)
|
| 140 |
+
mixture = torch.squeeze(mixture).cpu().data.numpy()
|
| 141 |
+
sf.write(os.path.join(wav_dir,key[0][:-4]+str(idx)+'_mixture.wav'), 0.5*mixture/max(abs(mixture)), 8000)
|
| 142 |
+
for i in range(self.config['model']['num_spks']):
|
| 143 |
+
src = torch.squeeze(estim_src[i]).cpu().data.numpy()
|
| 144 |
+
sf.write(os.path.join(wav_dir,key[0][:-4]+str(idx)+'_out_'+str(i)+'.wav'), 0.5*src/max(abs(src)), 8000)
|
| 145 |
+
idx += 1
|
| 146 |
+
dict_loss = {"SiSNRi": total_loss_SISNRi/num_batch, "SDRi": total_loss_SDRi/num_batch}
|
| 147 |
+
pbar.set_postfix(dict_loss)
|
| 148 |
+
pbar.close()
|
| 149 |
+
return total_loss_SISNRi/num_batch, total_loss_SDRi/num_batch, num_batch
|
| 150 |
+
|
| 151 |
+
@logger_wraps()
|
| 152 |
+
def run(self):
|
| 153 |
+
with torch.cuda.device(self.device):
|
| 154 |
+
writer_src = SummaryWriter(os.path.join(os.path.dirname(os.path.abspath(__file__)), "log/tensorboard"))
|
| 155 |
+
if "test" in self.engine_mode:
|
| 156 |
+
on_test_start = time.time()
|
| 157 |
+
test_loss_src_time_1, test_loss_src_time_2, test_num_batch = self._test(self.dataloaders['test'], self.out_wav_dir)
|
| 158 |
+
on_test_end = time.time()
|
| 159 |
+
logger.info(f"[TEST] Loss(time/mini-batch) \n - Epoch {self.start_epoch:2d}: SISNRi = {test_loss_src_time_1:.4f} dB | SDRi = {test_loss_src_time_2:.4f} dB | Speed = ({on_test_end - on_test_start:.2f}s/{test_num_batch:d})")
|
| 160 |
+
logger.info(f"Testing done!")
|
| 161 |
+
else:
|
| 162 |
+
start_time = time.time()
|
| 163 |
+
if self.start_epoch > 1:
|
| 164 |
+
init_loss_time, init_loss_freq, valid_num_batch = self._validate(self.dataloaders['valid'])
|
| 165 |
+
else:
|
| 166 |
+
init_loss_time, init_loss_freq = 0, 0
|
| 167 |
+
end_time = time.time()
|
| 168 |
+
logger.info(f"[INIT] Loss(time/mini-batch) \n - Epoch {self.start_epoch:2d}: Loss_t = {init_loss_time:.4f} dB | Loss_f = {init_loss_freq:.4f} dB | Speed = ({end_time-start_time:.2f}s)")
|
| 169 |
+
for epoch in range(self.start_epoch, self.config['engine']['max_epoch']):
|
| 170 |
+
valid_loss_best = init_loss_time
|
| 171 |
+
train_start_time = time.time()
|
| 172 |
+
train_loss_src_time, train_loss_src_freq, train_num_batch = self._train(self.dataloaders['train'], epoch)
|
| 173 |
+
train_end_time = time.time()
|
| 174 |
+
valid_start_time = time.time()
|
| 175 |
+
valid_loss_src_time, valid_loss_src_freq, valid_num_batch = self._validate(self.dataloaders['valid'])
|
| 176 |
+
valid_end_time = time.time()
|
| 177 |
+
if epoch > self.config['engine']['start_scheduling']: self.main_scheduler.step(valid_loss_src_time)
|
| 178 |
+
logger.info(f"[TRAIN] Loss(time/mini-batch) \n - Epoch {epoch:2d}: Loss_t = {train_loss_src_time:.4f} dB | Loss_f = {train_loss_src_freq:.4f} dB | Speed = ({train_end_time - train_start_time:.2f}s/{train_num_batch:d})")
|
| 179 |
+
logger.info(f"[VALID] Loss(time/mini-batch) \n - Epoch {epoch:2d}: Loss_t = {valid_loss_src_time:.4f} dB | Loss_f = {valid_loss_src_freq:.4f} dB | Speed = ({valid_end_time - valid_start_time:.2f}s/{valid_num_batch:d})")
|
| 180 |
+
if epoch in self.config['engine']['test_epochs']:
|
| 181 |
+
on_test_start = time.time()
|
| 182 |
+
test_loss_src_time_1, test_loss_src_time_2, test_num_batch = self._test(self.dataloaders['test'])
|
| 183 |
+
on_test_end = time.time()
|
| 184 |
+
logger.info(f"[TEST] Loss(time/mini-batch) \n - Epoch {epoch:2d}: SISNRi = {test_loss_src_time_1:.4f} dB | SDRi = {test_loss_src_time_2:.4f} dB | Speed = ({on_test_end - on_test_start:.2f}s/{test_num_batch:d})")
|
| 185 |
+
valid_loss_best = util_engine.save_checkpoint_per_best(valid_loss_best, valid_loss_src_time, train_loss_src_time, epoch, self.model, self.main_optimizer, self.checkpoint_path)
|
| 186 |
+
# Logging to monitoring tools (Tensorboard && Wandb)
|
| 187 |
+
writer_src.add_scalars("Metrics", {
|
| 188 |
+
'Loss_train_time': train_loss_src_time,
|
| 189 |
+
'Loss_valid_time': valid_loss_src_time}, epoch)
|
| 190 |
+
writer_src.add_scalar("Learning Rate", self.main_optimizer.param_groups[0]['lr'], epoch)
|
| 191 |
+
writer_src.flush()
|
| 192 |
+
logger.info(f"Training for {self.config['engine']['max_epoch']} epoches done!")
|
models/SepReformer/SepReformer_Large_DM_WHAM/main.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
from loguru import logger
|
| 4 |
+
from .dataset import get_dataloaders
|
| 5 |
+
from .model import Model
|
| 6 |
+
from .engine import Engine
|
| 7 |
+
from utils import util_system, util_implement
|
| 8 |
+
from utils.decorators import *
|
| 9 |
+
|
| 10 |
+
# Setup logger
|
| 11 |
+
log_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log/system_log.log")
|
| 12 |
+
logger.add(log_file_path, level="DEBUG", mode="w")
|
| 13 |
+
|
| 14 |
+
@logger_wraps()
def main(args):
    """Assemble dataloaders, model, and training components from configs.yaml,
    then hand everything to the Engine and run it."""

    # --- Configuration --------------------------------------------------
    # Parse the YAML configuration that sits next to this file.
    config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "configs.yaml")
    parsed_yaml = util_system.parse_yaml(config_path)
    config = parsed_yaml["config"]  # wandb login success or fail

    # --- Data -----------------------------------------------------------
    # DataLoaders keyed by split [train / valid / test / etc...]
    dataloaders = get_dataloaders(args, config["dataset"], config["dataloader"])

    # --- Model ----------------------------------------------------------
    model = Model(**config["model"])

    # --- Devices --------------------------------------------------------
    # GPU ids come as a comma-separated string; the first id hosts the model.
    gpuid = tuple(map(int, config["engine"]["gpuid"].split(',')))
    device = torch.device(f'cuda:{gpuid[0]}')

    # --- Training components [criterion / optimizer / scheduler] ---------
    criterions = util_implement.CriterionFactory(config["criterion"], device).get_criterions()
    optimizers = util_implement.OptimizerFactory(config["optimizer"], model.parameters()).get_optimizers()
    schedulers = util_implement.SchedulerFactory(config["scheduler"], optimizers).get_schedulers()

    # --- Run ------------------------------------------------------------
    engine = Engine(args, config, model, dataloaders, criterions, optimizers, schedulers, gpuid, device)
    engine.run()
|
models/SepReformer/SepReformer_Large_DM_WHAM/model.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
sys.path.append('../')
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import warnings
|
| 6 |
+
warnings.filterwarnings('ignore')
|
| 7 |
+
|
| 8 |
+
from utils.decorators import *
|
| 9 |
+
from .modules.module import *
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@logger_wraps()
class Model(torch.nn.Module):
    """SepReformer separation model.

    Pipeline: audio encoder -> feature projector -> separator ->
    output layer -> audio decoder, plus one auxiliary (masking) output
    layer + decoder per separator stage for the multi-stage aux loss.
    """
    def __init__(self,
                 num_stages: int,
                 num_spks: int,
                 module_audio_enc: dict,
                 module_feature_projector: dict,
                 module_separator: dict,
                 module_output_layer: dict,
                 module_audio_dec: dict):
        super().__init__()
        self.num_stages = num_stages
        self.num_spks = num_spks
        self.audio_encoder = AudioEncoder(**module_audio_enc)
        self.feature_projector = FeatureProjector(**module_feature_projector)
        self.separator = Separator(**module_separator)
        self.out_layer = OutputLayer(**module_output_layer)
        self.audio_decoder = AudioDecoder(**module_audio_dec)

        # Aux_loss: one masking output layer + decoder per separator stage.
        self.out_layer_bn = torch.nn.ModuleList([])
        self.decoder_bn = torch.nn.ModuleList([])
        for _ in range(self.num_stages):
            self.out_layer_bn.append(OutputLayer(**module_output_layer, masking=True))
            self.decoder_bn.append(AudioDecoder(**module_audio_dec))

    def forward(self, x):
        """x: mixture waveform. Returns (audio, audio_aux): the final
        per-speaker waveforms and, per stage, the auxiliary waveforms
        trimmed to the input length."""
        encoder_output = self.audio_encoder(x)
        projected_feature = self.feature_projector(encoder_output)
        last_stage_output, each_stage_outputs = self.separator(projected_feature)
        out_layer_output = self.out_layer(last_stage_output, encoder_output)
        each_spk_output = [out_layer_output[idx] for idx in range(self.num_spks)]
        audio = [self.audio_decoder(each_spk_output[idx]) for idx in range(self.num_spks)]

        # Aux_loss: decode each intermediate stage, upsampled to the encoder frame rate.
        audio_aux = []
        for idx, each_stage_output in enumerate(each_stage_outputs):
            # FIX: torch.nn.functional.upsample is deprecated; interpolate is the
            # drop-in replacement (same default 'nearest' mode).
            each_stage_output = self.out_layer_bn[idx](torch.nn.functional.interpolate(each_stage_output, encoder_output.shape[-1]), encoder_output)
            out_aux = [each_stage_output[jdx] for jdx in range(self.num_spks)]
            audio_aux.append([self.decoder_bn[idx](out_aux[jdx])[...,:x.shape[-1]] for jdx in range(self.num_spks)])

        return audio, audio_aux
|
models/SepReformer/SepReformer_Large_DM_WHAM/modules/module.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
sys.path.append('../')
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import warnings
|
| 6 |
+
warnings.filterwarnings('ignore')
|
| 7 |
+
|
| 8 |
+
from utils.decorators import *
|
| 9 |
+
from .network import *
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class AudioEncoder(torch.nn.Module):
    """Convolutional front-end: raw waveform -> latent feature map with GELU."""

    def __init__(self, in_channels: int, out_channels: int, kernel_size: int, stride: int, groups: int, bias: bool):
        super().__init__()
        # Attribute names (conv1d / gelu) are preserved so checkpoint keys match.
        self.conv1d = torch.nn.Conv1d(
            in_channels=in_channels, out_channels=out_channels,
            kernel_size=kernel_size, stride=stride, groups=groups, bias=bias)
        self.gelu = torch.nn.GELU()

    def forward(self, x: torch.Tensor):
        # Insert the channel axis: [T] -> [1, T] (unbatched) or [B, T] -> [B, 1, T].
        if x.dim() == 1:
            x = x.unsqueeze(0)
        else:
            x = x.unsqueeze(1)
        return self.gelu(self.conv1d(x))
|
| 24 |
+
|
| 25 |
+
class FeatureProjector(torch.nn.Module):
    """Normalize encoder features (global LayerNorm via GroupNorm(1)) and
    project them to the separator's channel dimension with a 1-D conv."""

    def __init__(self, num_channels: int, in_channels: int, out_channels: int, kernel_size: int, bias: bool):
        super().__init__()
        # Attribute names (norm / conv1d) are preserved so checkpoint keys match.
        self.norm = torch.nn.GroupNorm(num_groups=1, num_channels=num_channels, eps=1e-8)
        self.conv1d = torch.nn.Conv1d(
            in_channels=in_channels, out_channels=out_channels,
            kernel_size=kernel_size, bias=bias)

    def forward(self, x: torch.Tensor):
        # [B, C_in, T] -> normalize -> [B, C_out, T]
        return self.conv1d(self.norm(x))
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class Separator(torch.nn.Module):
    """U-Net-style separator: a Temporal Contracting Part (enc_stages plus a
    bottleneck), per-stage speaker splitting, and a Temporal Expanding Part
    (dec_stages) that fuses each skip connection before decoding.
    """
    def __init__(self, num_stages: int, relative_positional_encoding: dict, enc_stage: dict, spk_split_stage: dict, simple_fusion:dict, dec_stage: dict):
        super().__init__()

        # NOTE: these helper classes are deliberately defined inside __init__
        # (original project style); they are used only to build this module's
        # children below.
        class RelativePositionalEncoding(torch.nn.Module):
            # Learned relative-position embeddings shared by the attention blocks.
            def __init__(self, in_channels: int, num_heads: int, maxlen: int, embed_v=False):
                super().__init__()
                self.in_channels = in_channels
                self.num_heads = num_heads
                self.embedding_dim = self.in_channels // self.num_heads  # per-head dim
                self.maxlen = maxlen
                self.pe_k = torch.nn.Embedding(num_embeddings=2*maxlen, embedding_dim=self.embedding_dim)
                self.pe_v = torch.nn.Embedding(num_embeddings=2*maxlen, embedding_dim=self.embedding_dim) if embed_v else None

            def forward(self, pos_seq: torch.Tensor):
                # Clamp offsets to the supported window, then shift into
                # [0, 2*maxlen) for the embedding lookup.
                # NOTE(review): clamp_ and += mutate pos_seq in place — callers
                # must not reuse the original tensor afterwards.
                pos_seq.clamp_(-self.maxlen, self.maxlen - 1)
                pos_seq += self.maxlen
                pe_k_output = self.pe_k(pos_seq)
                pe_v_output = self.pe_v(pos_seq) if self.pe_v is not None else None
                return pe_k_output, pe_v_output

        class SepEncStage(torch.nn.Module):
            # One contracting stage: two (global, local) block pairs followed by
            # an optional stride-2 depthwise down-convolution.
            def __init__(self, global_blocks: dict, local_blocks: dict, down_conv_layer: dict, down_conv=True):
                super().__init__()

                class DownConvLayer(torch.nn.Module):
                    def __init__(self, in_channels: int, samp_kernel_size: int):
                        """Depthwise stride-2 conv + BatchNorm + GELU; halves the time axis."""
                        super().__init__()
                        self.down_conv = torch.nn.Conv1d(
                            in_channels=in_channels, out_channels=in_channels, kernel_size=samp_kernel_size, stride=2, padding=(samp_kernel_size-1)//2, groups=in_channels)
                        self.BN = torch.nn.BatchNorm1d(num_features=in_channels)
                        self.gelu = torch.nn.GELU()

                    def forward(self, x: torch.Tensor):
                        # Input arrives channel-last; Conv1d/BatchNorm1d need [B, C, T].
                        x = x.permute([0, 2, 1])
                        x = self.down_conv(x)
                        x = self.BN(x)
                        x = self.gelu(x)
                        x = x.permute([0, 2, 1])
                        return x

                self.g_block_1 = GlobalBlock(**global_blocks)
                self.l_block_1 = LocalBlock(**local_blocks)

                self.g_block_2 = GlobalBlock(**global_blocks)
                self.l_block_2 = LocalBlock(**local_blocks)

                self.downconv = DownConvLayer(**down_conv_layer) if down_conv == True else None

            def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
                '''
                x: [B, N, T]
                '''
                # Global blocks consume [B, N, T]; local blocks consume [B, T, N].
                x = self.g_block_1(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_1(x)
                x = x.permute(0, 2, 1).contiguous()

                x = self.g_block_2(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_2(x)
                x = x.permute(0, 2, 1).contiguous()

                # Full-resolution features become the skip connection; the
                # downsampled features feed the next stage.
                skip = x
                if self.downconv:
                    x = x.permute(0, 2, 1).contiguous()
                    x = self.downconv(x)
                    x = x.permute(0, 2, 1).contiguous()
                # [BK, S, N]
                return x, skip

        class SpkSplitStage(torch.nn.Module):
            # Expands channels num_spks-fold and folds the speaker axis into the
            # batch dimension: [B, N, T] -> [B*num_spks, N, T].
            def __init__(self, in_channels: int, num_spks: int):
                super().__init__()
                self.linear = torch.nn.Sequential(
                    torch.nn.Conv1d(in_channels, 4*in_channels*num_spks, kernel_size=1),
                    torch.nn.GLU(dim=-2),
                    torch.nn.Conv1d(2*in_channels*num_spks, in_channels*num_spks, kernel_size=1))
                self.norm = torch.nn.GroupNorm(1, in_channels, eps=1e-8)
                self.num_spks = num_spks

            def forward(self, x: torch.Tensor):
                x = self.linear(x)
                B, _, T = x.shape
                # Fold the speaker copies into the batch dimension.
                x = x.view(B*self.num_spks,-1, T).contiguous()
                x = self.norm(x)
                return x

        class SepDecStage(torch.nn.Module):
            # One expanding stage: three (global, local, speaker-attention) triplets.
            def __init__(self, num_spks: int, global_blocks: dict, local_blocks: dict, spk_attention: dict):
                super().__init__()

                self.g_block_1 = GlobalBlock(**global_blocks)
                self.l_block_1 = LocalBlock(**local_blocks)
                self.spk_attn_1 = SpkAttention(**spk_attention)

                self.g_block_2 = GlobalBlock(**global_blocks)
                self.l_block_2 = LocalBlock(**local_blocks)
                self.spk_attn_2 = SpkAttention(**spk_attention)

                self.g_block_3 = GlobalBlock(**global_blocks)
                self.l_block_3 = LocalBlock(**local_blocks)
                self.spk_attn_3 = SpkAttention(**spk_attention)

                self.num_spk = num_spks

            def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
                '''
                x: [B, N, T]
                '''
                # [BS, K, H]
                x = self.g_block_1(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_1(x)
                x = x.permute(0, 2, 1).contiguous()
                x = self.spk_attn_1(x, self.num_spk)

                x = self.g_block_2(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_2(x)
                x = x.permute(0, 2, 1).contiguous()
                x = self.spk_attn_2(x, self.num_spk)

                x = self.g_block_3(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_3(x)
                x = x.permute(0, 2, 1).contiguous()
                x = self.spk_attn_3(x, self.num_spk)

                skip = x

                return x, skip

        self.num_stages = num_stages
        self.pos_emb = RelativePositionalEncoding(**relative_positional_encoding)

        # Temporal Contracting Part
        self.enc_stages = torch.nn.ModuleList([])
        for _ in range(self.num_stages):
            self.enc_stages.append(SepEncStage(**enc_stage, down_conv=True))

        self.bottleneck_G = SepEncStage(**enc_stage, down_conv=False)

        # One speaker-split block per contracting stage plus one for the bottleneck.
        self.spk_split_blocks = torch.nn.ModuleList([])
        for _ in range(self.num_stages+1):
            self.spk_split_blocks.append(SpkSplitStage(**spk_split_stage))

        # Temporal Expanding Part
        self.simple_fusion = torch.nn.ModuleList([])
        self.dec_stages = torch.nn.ModuleList([])
        for _ in range(self.num_stages):
            # 1x1 conv merges the upsampled features with the matching skip.
            self.simple_fusion.append(torch.nn.Conv1d(in_channels=simple_fusion['out_channels']*2,out_channels=simple_fusion['out_channels'], kernel_size=1))
            self.dec_stages.append(SepDecStage(**dec_stage))

    def forward(self, input: torch.Tensor):
        '''input: [B, N, L]'''
        # feature projection
        x, _ = self.pad_signal(input)
        len_x = x.shape[-1]
        # Temporal Contracting Part
        # Relative offsets computed at the bottleneck resolution
        # (len_x / 2**num_stages after the stride-2 down-convs).
        pos_seq = torch.arange(0, len_x//2**self.num_stages).long().to(x.device)
        pos_seq = pos_seq[:, None] - pos_seq[None, :]
        pos_k, _ = self.pos_emb(pos_seq)
        skip = []
        for idx in range(self.num_stages):
            x, skip_ = self.enc_stages[idx](x, pos_k)
            skip_ = self.spk_split_blocks[idx](skip_)  # fold speakers into batch
            skip.append(skip_)
        x, _ = self.bottleneck_G(x, pos_k)
        x = self.spk_split_blocks[-1](x) # B, 2F, T

        each_stage_outputs = []
        # Temporal Expanding Part
        for idx in range(self.num_stages):
            each_stage_outputs.append(x)  # pre-stage features feed the aux losses
            idx_en = self.num_stages - (idx + 1)  # index of the matching contracting stage
            x = torch.nn.functional.upsample(x, skip[idx_en].shape[-1])
            x = torch.cat([x,skip[idx_en]],dim=1)
            x = self.simple_fusion[idx](x)
            x, _ = self.dec_stages[idx](x, pos_k)

        last_stage_output = x
        return last_stage_output, each_stage_outputs

    def pad_signal(self, input: torch.Tensor):
        # Right-pad the time axis with zeros so its length is a multiple of
        # 2**num_stages (required by the stride-2 down-convolutions).
        # Returns (padded_input, pad_amount).
        # (B, T) or (B, 1, T)
        # NOTE(review): a 1-D input is unsqueezed only once (stays 2-D) because
        # the elif chain stops after the first branch — confirm callers never
        # pass 1-D tensors.
        if input.dim() == 1: input = input.unsqueeze(0)
        elif input.dim() not in [2, 3]: raise RuntimeError("Input can only be 2 or 3 dimensional.")
        elif input.dim() == 2: input = input.unsqueeze(1)
        L = 2**self.num_stages
        batch_size = input.size(0)
        ndim = input.size(1)
        nframe = input.size(2)
        padded_len = (nframe//L + 1)*L
        rest = 0 if nframe%L == 0 else padded_len - nframe
        if rest > 0:
            pad = torch.autograd.Variable(torch.zeros(batch_size, ndim, rest)).type(input.type()).to(input.device)
            input = torch.cat([input, pad], dim=-1)
        return input, rest
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
class OutputLayer(torch.nn.Module):
    # Projects separator features back to the encoder dimension, optionally
    # gates the shared encoder output with a ReLU mask, and reshapes the
    # flattened (B*num_spks) batch into per-speaker outputs.
    def __init__(self, in_channels: int, out_channels: int, num_spks: int, masking: bool = False):
        super().__init__()
        # feature expansion back
        self.masking = masking
        # ReLU-gated masking block (no concat path: concat_opt=None).
        self.spe_block = Masking(in_channels, Activation_mask="ReLU", concat_opt=None)
        self.num_spks = num_spks
        # Linear -> GLU -> Linear expands from the separator dim back to the
        # encoder dim (GLU halves the 4x expansion to 2x).
        self.end_conv1x1 = torch.nn.Sequential(
            torch.nn.Linear(out_channels, 4*out_channels),
            torch.nn.GLU(),
            torch.nn.Linear(2*out_channels, in_channels))

    def forward(self, x: torch.Tensor, input: torch.Tensor):
        # x: separator output with speakers folded into the batch dim;
        # input: encoder output the mask is applied to.
        x = x[...,:input.shape[-1]]   # trim padding added inside the separator
        x = x.permute([0, 2, 1])      # channel-last for the Linear layers
        x = self.end_conv1x1(x)
        x = x.permute([0, 2, 1])      # back to [B*num_spks, N, L]
        B, N, L = x.shape
        B = B // self.num_spks        # recover the true batch size

        if self.masking:
            # Replicate the encoder output once per speaker and gate it
            # elementwise with ReLU(x) via spe_block.
            input = input.expand(self.num_spks, B, N, L).transpose(0,1).contiguous()
            input = input.view(B*self.num_spks, N, L)
            x = self.spe_block(x, input)

        # Unfold the speaker axis out of the batch dimension.
        x = x.view(B, self.num_spks, N, L)
        # [spks, B, N, L]
        x = x.transpose(0, 1)
        return x
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
class AudioDecoder(torch.nn.ConvTranspose1d):
    '''
    Decoder of the TasNet
    This module can be seen as the gradient of Conv1d with respect to its input.
    It is also known as a fractionally-strided convolution
    or a deconvolution (although it is not an actual deconvolution operation).
    '''
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, x):
        # x: [B, N, L] (3-D) or [N, L] (2-D; a batch axis is inserted before
        # the transposed convolution).
        if x.dim() not in [2, 3]:
            # FIX: was "self.__name__", which raised AttributeError instead of
            # the intended RuntimeError (nn.Module instances have no __name__).
            raise RuntimeError("{} accept 3/4D tensor as input".format(type(self).__name__))
        x = super().forward(x if x.dim() == 3 else torch.unsqueeze(x, 1))
        # Drop singleton dims; keep a 1-D waveform as [T] via the dim=1 squeeze.
        x = torch.squeeze(x, dim=1) if torch.squeeze(x).dim() == 1 else torch.squeeze(x)
        return x
|
models/SepReformer/SepReformer_Large_DM_WHAM/modules/network.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import math
|
| 3 |
+
import numpy
|
| 4 |
+
from utils.decorators import *
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class LayerScale(torch.nn.Module):
    """Learnable per-channel scaling, initialized to a small constant
    (LayerScale). `dims` selects how many axes the parameter broadcasts over."""

    def __init__(self, dims, input_size, Layer_scale_init=1.0e-5):
        super().__init__()
        # Parameter shape depends on which axis carries the channels.
        shape_by_dims = {1: (input_size,), 2: (1, input_size), 3: (1, 1, input_size)}
        if dims in shape_by_dims:
            self.layer_scale = torch.nn.Parameter(
                torch.ones(*shape_by_dims[dims]) * Layer_scale_init, requires_grad=True)

    def forward(self, x):
        return x * self.layer_scale
|
| 19 |
+
|
| 20 |
+
class Masking(torch.nn.Module):
    """Gated masking: activation(x) * skip.

    options['concat_opt'] (required key): when truthy, x and skip are first
    concatenated along the channel axis and reduced by a pointwise conv.
    Supported gates: 'Sigmoid' (default) and 'ReLU'.
    """

    def __init__(self, input_dim, Activation_mask='Sigmoid', **options):
        super(Masking, self).__init__()
        self.options = options
        # Optional fusion path: concat(x, skip) -> 1x1 conv back to input_dim.
        if self.options['concat_opt']:
            self.pw_conv = torch.nn.Conv1d(input_dim*2, input_dim, 1, stride=1, padding=0)
        # Select the gate nonlinearity.
        if Activation_mask == 'Sigmoid':
            self.gate_act = torch.nn.Sigmoid()
        elif Activation_mask == 'ReLU':
            self.gate_act = torch.nn.ReLU()

    def forward(self, x, skip):
        if self.options['concat_opt']:
            gate_in = torch.cat([x, skip], dim=-2)
            gate_in = self.pw_conv(gate_in)
        else:
            gate_in = x
        return self.gate_act(gate_in) * skip
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class GCFN(torch.nn.Module):
    """Gated Convolutional Feed-forward Network.

    LayerNorm -> 6x linear expansion -> depthwise conv over time -> GLU ->
    projection back to the input width, added residually through a
    LayerScale gate. Input/output shape: [B, T, C].
    """

    def __init__(self, in_channels, dropout_rate, Layer_scale_init=1.0e-5):
        super().__init__()
        # Attribute names are preserved so checkpoint keys match.
        self.net1 = torch.nn.Sequential(
            torch.nn.LayerNorm(in_channels),
            torch.nn.Linear(in_channels, in_channels*6))
        self.depthwise = torch.nn.Conv1d(in_channels*6, in_channels*6, 3, padding=1, groups=in_channels*6)
        self.net2 = torch.nn.Sequential(
            torch.nn.GLU(),
            torch.nn.Dropout(dropout_rate),
            torch.nn.Linear(in_channels*3, in_channels),
            torch.nn.Dropout(dropout_rate))
        self.Layer_scale = LayerScale(dims=3, input_size=in_channels, Layer_scale_init=Layer_scale_init)

    def forward(self, x):
        h = self.net1(x)                        # [B, T, 6C]
        h = h.transpose(1, 2).contiguous()      # [B, 6C, T] for the depthwise conv
        h = self.depthwise(h)
        h = h.transpose(1, 2).contiguous()      # back to [B, T, 6C]
        h = self.net2(h)                        # [B, T, C]
        return x + self.Layer_scale(h)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class MultiHeadAttention(torch.nn.Module):
    """
    Multi-Head Attention layer with optional relative positional bias.
    :param int n_head: the number of heads
    :param int in_channels: the number of features
    :param float dropout_rate: dropout rate
    """
    def __init__(self, n_head: int, in_channels: int, dropout_rate: float, Layer_scale_init=1.0e-5):
        super().__init__()
        assert in_channels % n_head == 0
        self.d_k = in_channels // n_head  # We assume d_v always equals d_k
        self.h = n_head
        self.layer_norm = torch.nn.LayerNorm(in_channels)  # pre-norm
        self.linear_q = torch.nn.Linear(in_channels, in_channels)
        self.linear_k = torch.nn.Linear(in_channels, in_channels)
        self.linear_v = torch.nn.Linear(in_channels, in_channels)
        self.linear_out = torch.nn.Linear(in_channels, in_channels)
        self.attn = None  # last attention map, kept for inspection after forward
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.Layer_scale = LayerScale(dims=3, input_size=in_channels, Layer_scale_init=Layer_scale_init)

    def forward(self, x, pos_k, mask):
        """
        Compute 'Scaled Dot Product Attention' (pre-norm; LayerScaled output).
        :param torch.Tensor x: input features (batch, time1, d_model)
        :param torch.Tensor pos_k: relative-position key embeddings, or None
        :param torch.Tensor mask: (batch, time1, time2), or None
        :return torch.Tensor: attentioned and transformed `value` (batch, time1, d_model)
            weighted by the query dot key attention (batch, head, time1, time2)
        """
        n_batch = x.size(0)
        x = self.layer_norm(x)
        q = self.linear_q(x).view(n_batch, -1, self.h, self.d_k)  # (b, t, h, d_k)
        k = self.linear_k(x).view(n_batch, -1, self.h, self.d_k)  # (b, t, h, d_k)
        v = self.linear_v(x).view(n_batch, -1, self.h, self.d_k)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2) # (batch, head, time2, d_k)
        v = v.transpose(1, 2) # (batch, head, time2, d_k)
        # Content-content attention term.
        A = torch.matmul(q, k.transpose(-2, -1))
        # Content-position term: fold batch and head together so every query row
        # is matmul'ed against the shared relative-position keys.
        reshape_q = q.contiguous().view(n_batch * self.h, -1, self.d_k).transpose(0,1)
        if pos_k is not None:
            B = torch.matmul(reshape_q, pos_k.transpose(-2, -1))
            B = B.transpose(0, 1).view(n_batch, self.h, pos_k.size(0), pos_k.size(1))
            scores = (A + B) / math.sqrt(self.d_k)
        else:
            scores = A / math.sqrt(self.d_k)
        if mask is not None:
            mask = mask.unsqueeze(1).eq(0) # (batch, 1, time1, time2)
            # Most negative representable value of the score dtype blanks out
            # masked positions before the softmax.
            min_value = float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min)
            scores = scores.masked_fill(mask, min_value)
            self.attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) # (batch, head, time1, time2)
        else:
            self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
        p_attn = self.dropout(self.attn)
        x = torch.matmul(p_attn, v) # (batch, head, time1, d_k)
        x = x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) # (batch, time1, d_model)
        return self.Layer_scale(self.dropout(self.linear_out(x))) # (batch, time1, d_model)
|
| 125 |
+
|
| 126 |
+
class EGA(torch.nn.Module):
    """Efficient Global Attention: attend at a downsampled time resolution,
    then gate the upsampled result back into the full-rate features."""
    def __init__(self, in_channels: int, num_mha_heads: int, dropout_rate: float):
        super().__init__()
        self.block = torch.nn.ModuleDict({
            'self_attn': MultiHeadAttention(
                n_head=num_mha_heads, in_channels=in_channels, dropout_rate=dropout_rate),
            'linear': torch.nn.Sequential(
                torch.nn.LayerNorm(normalized_shape=in_channels),
                torch.nn.Linear(in_features=in_channels, out_features=in_channels),
                torch.nn.Sigmoid())
        })

    def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
        """
        Compute encoded features.
        :param torch.Tensor x: source features, channel-first [B, C, T]
        :param torch.Tensor pos_k: relative-position keys; their first dim
            fixes the downsampled attention length
        :return torch.Tensor: gated features, channel-last (batch, T, C)
        """
        down_len = pos_k.shape[0]
        # Pool the time axis down to the attention resolution.
        x_down = torch.nn.functional.adaptive_avg_pool1d(input=x, output_size=down_len)
        x = x.permute([0, 2, 1])
        x_down = x_down.permute([0, 2, 1])
        x_down = self.block['self_attn'](x_down, pos_k, None)
        x_down = x_down.permute([0, 2, 1])
        # FIX: torch.nn.functional.upsample is deprecated; interpolate is the
        # drop-in replacement (same default 'nearest' mode).
        x_downup = torch.nn.functional.interpolate(input=x_down, size=x.shape[1])
        x_downup = x_downup.permute([0, 2, 1])
        # Sigmoid gate decides, per element, how much global context to inject.
        x = x + self.block['linear'](x) * x_downup

        return x
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class CLA(torch.nn.Module):
    """Convolutional Local Attention block.

    LayerNorm -> GLU (from a 2x linear expansion) -> depthwise conv over time
    -> 2x linear + BatchNorm -> GELU/linear projection, added back residually
    through a LayerScale gate. Input/output shape: [B, T, C].
    """

    def __init__(self, in_channels, kernel_size, dropout_rate, Layer_scale_init=1.0e-5):
        super().__init__()
        # Attribute names are preserved so checkpoint keys match.
        self.layer_norm = torch.nn.LayerNorm(in_channels)
        self.linear1 = torch.nn.Linear(in_channels, in_channels*2)
        self.GLU = torch.nn.GLU()
        self.dw_conv_1d = torch.nn.Conv1d(in_channels, in_channels, kernel_size, padding='same', groups=in_channels)
        self.linear2 = torch.nn.Linear(in_channels, 2*in_channels)
        self.BN = torch.nn.BatchNorm1d(2*in_channels)
        self.linear3 = torch.nn.Sequential(
            torch.nn.GELU(),
            torch.nn.Linear(2*in_channels, in_channels),
            torch.nn.Dropout(dropout_rate))
        self.Layer_scale = LayerScale(dims=3, input_size=in_channels, Layer_scale_init=Layer_scale_init)

    def forward(self, x):
        residual = x
        h = self.GLU(self.linear1(self.layer_norm(x)))          # [B, T, C]
        h = self.dw_conv_1d(h.transpose(1, 2)).transpose(1, 2)  # depthwise over time
        h = self.BN(self.linear2(h).transpose(1, 2)).transpose(1, 2)  # [B, T, 2C]
        h = self.linear3(h)                                     # [B, T, C]
        return residual + self.Layer_scale(h)
|
| 188 |
+
|
| 189 |
+
class GlobalBlock(torch.nn.Module):
    """EGA attention followed by a GCFN feed-forward; output is (B, C, T)."""

    def __init__(self, in_channels: int, num_mha_heads: int, dropout_rate: float):
        super().__init__()
        self.block = torch.nn.ModuleDict({
            'ega': EGA(
                num_mha_heads=num_mha_heads, in_channels=in_channels, dropout_rate=dropout_rate),
            'gcfn': GCFN(in_channels=in_channels, dropout_rate=dropout_rate)
        })

    def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
        """
        Compute encoded features.
        :param torch.Tensor x: encoded source features (batch, channels, time)
        :param torch.Tensor pos_k: relative positional encoding for the EGA stage
        :rtype: torch.Tensor
        """
        out = self.block['ega'](x, pos_k)
        out = self.block['gcfn'](out)
        # EGA/GCFN operate in (B, T, C); restore channel-first layout.
        return out.permute([0, 2, 1])
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
class LocalBlock(torch.nn.Module):
    """CLA convolutional block followed by a GCFN feed-forward."""

    def __init__(self, in_channels: int, kernel_size: int, dropout_rate: float):
        super().__init__()
        self.block = torch.nn.ModuleDict({
            'cla': CLA(in_channels, kernel_size, dropout_rate),
            'gcfn': GCFN(in_channels, dropout_rate)
        })

    def forward(self, x: torch.Tensor):
        """Apply CLA then GCFN; both preserve the input shape."""
        out = self.block['cla'](x)
        return self.block['gcfn'](out)
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
class SpkAttention(torch.nn.Module):
    """Attention across the speaker dimension.

    Reshapes the (batch*num_spk, F, T) input so each time frame attends over
    the speaker axis, adds a residual, then applies a GCFN feed-forward.
    """

    def __init__(self, in_channels: int, num_mha_heads: int, dropout_rate: float):
        super().__init__()
        self.self_attn = MultiHeadAttention(n_head=num_mha_heads, in_channels=in_channels, dropout_rate=dropout_rate)
        self.feed_forward = GCFN(in_channels=in_channels, dropout_rate=dropout_rate)

    def forward(self, x: torch.Tensor, num_spk: int):
        """
        Compute encoded features.
        :param torch.Tensor x: features stacked per speaker, shape (B*num_spk, F, T)
        :param int num_spk: number of speakers folded into the batch dimension
        :rtype: torch.Tensor with the same (B*num_spk, F, T) shape
        """
        B, F, T = x.shape
        batch = B // num_spk
        # (B, F, T) -> (batch*T, num_spk, F): speakers become the sequence axis.
        y = x.view(batch, num_spk, F, T).contiguous()
        y = y.permute([0, 3, 1, 2]).contiguous()
        y = y.view(-1, num_spk, F).contiguous()
        y = y + self.self_attn(y, None, None)
        # Invert the reshape back to (B, F, T).
        y = y.view(batch, T, num_spk, F).contiguous()
        y = y.permute([0, 2, 3, 1]).contiguous()
        y = y.view(B, F, T).contiguous()
        # GCFN expects (B, T, F); permute in and back out.
        y = self.feed_forward(y.permute([0, 2, 1]))

        return y.permute([0, 2, 1])
|
models/SepReformer/SepReformer_Large_DM_WHAMR/configs.yaml
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
config:
|
| 2 |
+
dataset:
|
| 3 |
+
max_len : 32000
|
| 4 |
+
sampling_rate: 8000
|
| 5 |
+
scp_dir: "data/scp_ss_8k_whamr"
|
| 6 |
+
train:
|
| 7 |
+
mixture: "tr_mix.scp"
|
| 8 |
+
spk1: "tr_s1.scp"
|
| 9 |
+
spk2: "tr_s2.scp"
|
| 10 |
+
spk1_reverb: "tr_s1_reverb.scp"
|
| 11 |
+
spk2_reverb: "tr_s2_reverb.scp"
|
| 12 |
+
noise: "tr_n.scp"
|
| 13 |
+
dynamic_mixing: true
|
| 14 |
+
valid:
|
| 15 |
+
mixture: "cv_mix.scp"
|
| 16 |
+
spk1: "cv_s1.scp"
|
| 17 |
+
spk2: "cv_s2.scp"
|
| 18 |
+
test:
|
| 19 |
+
mixture: "tt_mix.scp"
|
| 20 |
+
spk1: "tt_s1.scp"
|
| 21 |
+
spk2: "tt_s2.scp"
|
| 22 |
+
dataloader:
|
| 23 |
+
batch_size: 2
|
| 24 |
+
pin_memory: false
|
| 25 |
+
num_workers: 12
|
| 26 |
+
drop_last: false
|
| 27 |
+
model:
|
| 28 |
+
num_stages: &var_model_num_stages 4 # R
|
| 29 |
+
num_spks: &var_model_num_spks 2
|
| 30 |
+
module_audio_enc:
|
| 31 |
+
in_channels: 1
|
| 32 |
+
out_channels: &var_model_audio_enc_out_channels 256
|
| 33 |
+
kernel_size: &var_model_audio_enc_kernel_size 16 # L
|
| 34 |
+
stride: &var_model_audio_enc_stride 4 # S
|
| 35 |
+
groups: 1
|
| 36 |
+
bias: false
|
| 37 |
+
module_feature_projector:
|
| 38 |
+
num_channels: *var_model_audio_enc_out_channels
|
| 39 |
+
in_channels: *var_model_audio_enc_out_channels
|
| 40 |
+
out_channels: &feature_projector_out_channels 256 # F
|
| 41 |
+
kernel_size: 1
|
| 42 |
+
bias: false
|
| 43 |
+
module_separator:
|
| 44 |
+
num_stages: *var_model_num_stages
|
| 45 |
+
relative_positional_encoding:
|
| 46 |
+
in_channels: *feature_projector_out_channels
|
| 47 |
+
num_heads: 8
|
| 48 |
+
maxlen: 2000
|
| 49 |
+
embed_v: false
|
| 50 |
+
enc_stage:
|
| 51 |
+
global_blocks:
|
| 52 |
+
in_channels: *feature_projector_out_channels
|
| 53 |
+
num_mha_heads: 8
|
| 54 |
+
dropout_rate: 0.1
|
| 55 |
+
local_blocks:
|
| 56 |
+
in_channels: *feature_projector_out_channels
|
| 57 |
+
kernel_size: 65
|
| 58 |
+
dropout_rate: 0.1
|
| 59 |
+
down_conv_layer:
|
| 60 |
+
in_channels: *feature_projector_out_channels
|
| 61 |
+
samp_kernel_size: &var_model_samp_kernel_size 5
|
| 62 |
+
spk_split_stage:
|
| 63 |
+
in_channels: *feature_projector_out_channels
|
| 64 |
+
num_spks: *var_model_num_spks
|
| 65 |
+
simple_fusion:
|
| 66 |
+
out_channels: *feature_projector_out_channels
|
| 67 |
+
dec_stage:
|
| 68 |
+
num_spks: *var_model_num_spks
|
| 69 |
+
global_blocks:
|
| 70 |
+
in_channels: *feature_projector_out_channels
|
| 71 |
+
num_mha_heads: 8
|
| 72 |
+
dropout_rate: 0.1
|
| 73 |
+
local_blocks:
|
| 74 |
+
in_channels: *feature_projector_out_channels
|
| 75 |
+
kernel_size: 65
|
| 76 |
+
dropout_rate: 0.1
|
| 77 |
+
spk_attention:
|
| 78 |
+
in_channels: *feature_projector_out_channels
|
| 79 |
+
num_mha_heads: 8
|
| 80 |
+
dropout_rate: 0.1
|
| 81 |
+
module_output_layer:
|
| 82 |
+
in_channels: *var_model_audio_enc_out_channels
|
| 83 |
+
out_channels: *feature_projector_out_channels
|
| 84 |
+
num_spks: *var_model_num_spks
|
| 85 |
+
module_audio_dec:
|
| 86 |
+
in_channels: *var_model_audio_enc_out_channels
|
| 87 |
+
out_channels: 1
|
| 88 |
+
kernel_size: *var_model_audio_enc_kernel_size
|
| 89 |
+
stride: *var_model_audio_enc_stride
|
| 90 |
+
bias: false
|
| 91 |
+
criterion: ### Ref: https://pytorch.org/docs/stable/nn.html#loss-functions
|
| 92 |
+
name: ["PIT_SISNR_mag", "PIT_SISNR_time", "PIT_SISNRi", "PIT_SDRi"] ### Choose a torch.nn's loss function class(=attribute) e.g. ["L1Loss", "MSELoss", "CrossEntropyLoss", ...] / You can also build your optimizer :)
|
| 93 |
+
PIT_SISNR_mag:
|
| 94 |
+
frame_length: 512
|
| 95 |
+
frame_shift: 128
|
| 96 |
+
window: 'hann'
|
| 97 |
+
num_stages: *var_model_num_stages
|
| 98 |
+
num_spks: *var_model_num_spks
|
| 99 |
+
scale_inv: true
|
| 100 |
+
mel_opt: false
|
| 101 |
+
PIT_SISNR_time:
|
| 102 |
+
num_spks: *var_model_num_spks
|
| 103 |
+
scale_inv: true
|
| 104 |
+
PIT_SISNRi:
|
| 105 |
+
num_spks: *var_model_num_spks
|
| 106 |
+
scale_inv: true
|
| 107 |
+
PIT_SDRi:
|
| 108 |
+
dump: 0
|
| 109 |
+
optimizer: ### Ref: https://pytorch.org/docs/stable/optim.html#algorithms
|
| 110 |
+
name: ["AdamW"] ### Choose a torch.optim's class(=attribute) e.g. ["Adam", "AdamW", "SGD", ...] / You can also build your optimizer :)
|
| 111 |
+
AdamW:
|
| 112 |
+
lr: 2.0e-4
|
| 113 |
+
weight_decay: 1.0e-2
|
| 114 |
+
scheduler: ### Ref(+ find "How to adjust learning rate"): https://pytorch.org/docs/stable/optim.html#algorithms
|
| 115 |
+
name: ["ReduceLROnPlateau", "WarmupConstantSchedule"] ### Choose a torch.optim.lr_scheduler's class(=attribute) e.g. ["StepLR", "ReduceLROnPlateau", "Custom"] / You can also build your scheduler :)
|
| 116 |
+
ReduceLROnPlateau:
|
| 117 |
+
mode: "min"
|
| 118 |
+
min_lr: 1.0e-10
|
| 119 |
+
factor: 0.8
|
| 120 |
+
patience: 2
|
| 121 |
+
WarmupConstantSchedule:
|
| 122 |
+
warmup_steps: 1000
|
| 123 |
+
check_computations:
|
| 124 |
+
dummy_len: 16000
|
| 125 |
+
engine:
|
| 126 |
+
max_epoch: 200
|
| 127 |
+
gpuid: "0" ### "0"(single-gpu) or "0, 1" (multi-gpu)
|
| 128 |
+
mvn: false
|
| 129 |
+
clip_norm: 5
|
| 130 |
+
start_scheduling: 50
|
| 131 |
+
test_epochs: [50, 80, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 199]
|
models/SepReformer/SepReformer_Large_DM_WHAMR/dataset.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import random
|
| 4 |
+
import librosa as audio_lib
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
from utils import util_dataset
|
| 8 |
+
from utils.decorators import *
|
| 9 |
+
from loguru import logger
|
| 10 |
+
from torch.utils.data import Dataset, DataLoader
|
| 11 |
+
|
| 12 |
+
@logger_wraps()
def get_dataloaders(args, dataset_config, loader_config):
    """Build one DataLoader per partition.

    In test engine modes only the 'test' partition is constructed;
    otherwise 'train', 'valid' and 'test' are all built. Dynamic mixing
    is honored only for the training partition.
    """
    partitions = ["test"] if "test" in args.engine_mode else ["train", "valid", "test"]
    loaders = {}
    scp_dir = dataset_config["scp_dir"]
    for part in partitions:
        part_cfg = dataset_config[part]
        mix_path = os.path.join(scp_dir, part_cfg['mixture'])
        # Every key starting with 'spk' (dry and reverberant sources alike).
        src_paths = [os.path.join(scp_dir, part_cfg[k]) for k in part_cfg if k.startswith('spk')]
        noise_path = os.path.join(scp_dir, part_cfg['noise']) if 'noise' in part_cfg else None
        use_dm = part_cfg["dynamic_mixing"] if part == 'train' else False
        dataset = MyDataset(
            max_len=dataset_config['max_len'],
            fs=dataset_config['sampling_rate'],
            partition=part,
            wave_scp_srcs=src_paths,
            wave_scp_mix=mix_path,
            wave_scp_noise=noise_path,
            dynamic_mixing=use_dm)
        loaders[part] = DataLoader(
            dataset=dataset,
            batch_size=1 if part == 'test' else loader_config["batch_size"],
            shuffle=True,  # only train: (partition == 'train') / all: True
            pin_memory=loader_config["pin_memory"],
            num_workers=loader_config["num_workers"],
            drop_last=loader_config["drop_last"],
            collate_fn=_collate)
    return loaders
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _collate(egs):
|
| 43 |
+
"""
|
| 44 |
+
Transform utterance index into a minbatch
|
| 45 |
+
|
| 46 |
+
Arguments:
|
| 47 |
+
index: a list type [{},{},{}]
|
| 48 |
+
|
| 49 |
+
Returns:
|
| 50 |
+
input_sizes: a tensor correspond to utterance length
|
| 51 |
+
input_feats: packed sequence to feed networks
|
| 52 |
+
source_attr/target_attr: dictionary contains spectrogram/phase needed in loss computation
|
| 53 |
+
"""
|
| 54 |
+
def __prepare_target_rir(dict_lsit, index):
|
| 55 |
+
return torch.nn.utils.rnn.pad_sequence([torch.tensor(d["src"][index], dtype=torch.float32) for d in dict_lsit], batch_first=True)
|
| 56 |
+
if type(egs) is not list: raise ValueError("Unsupported index type({})".format(type(egs)))
|
| 57 |
+
num_spks = 2 # you need to set this paramater by yourself
|
| 58 |
+
dict_list = sorted([eg for eg in egs], key=lambda x: x['num_sample'], reverse=True)
|
| 59 |
+
mixture = torch.nn.utils.rnn.pad_sequence([torch.tensor(d['mix'], dtype=torch.float32) for d in dict_list], batch_first=True)
|
| 60 |
+
src = [__prepare_target_rir(dict_list, index) for index in range(num_spks)]
|
| 61 |
+
input_sizes = torch.tensor([d['num_sample'] for d in dict_list], dtype=torch.float32)
|
| 62 |
+
key = [d['key'] for d in dict_list]
|
| 63 |
+
return input_sizes, mixture, src, key
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@logger_wraps()
class MyDataset(Dataset):
    """Speech-separation dataset for WHAMR-style data.

    Loads mixture/source (and optional noise) waveforms listed in .scp files.
    When ``dynamic_mixing`` is enabled (training only), mixtures are
    synthesized on the fly from randomly paired sources plus noise;
    otherwise the pre-mixed waveform is loaded directly from disk.
    """

    def __init__(self, max_len, fs, partition, wave_scp_srcs, wave_scp_mix, wave_scp_noise, dynamic_mixing, speed_list=None):
        # partition: "train" / "valid" / "test" — controls cropping in _direct_load.
        self.partition = partition
        for wave_scp_src in wave_scp_srcs:
            if not os.path.exists(wave_scp_src): raise FileNotFoundError(f"Could not find file {wave_scp_src}")
        self.max_len = max_len  # crop length in samples (non-test partitions)
        self.fs = fs  # target sampling rate passed to librosa.load
        # key -> wav-path maps, one dict per scp file.
        self.wave_dict_srcs = [util_dataset.parse_scps(wave_scp_src) for wave_scp_src in wave_scp_srcs]
        self.wave_dict_mix = util_dataset.parse_scps(wave_scp_mix)
        self.wave_dict_noise = util_dataset.parse_scps(wave_scp_noise) if wave_scp_noise else None
        self.wave_keys = list(self.wave_dict_mix.keys())
        logger.info(f"Create MyDataset for {wave_scp_mix} with {len(self.wave_dict_mix)} utterances")
        self.dynamic_mixing = dynamic_mixing

    def __len__(self):
        # Number of utterances equals the number of mixture entries.
        return len(self.wave_dict_mix)

    def __contains__(self, key):
        return key in self.wave_dict_mix

    def _dynamic_mixing(self, key):
        """Synthesize a training mixture on the fly.

        Pairs the keyed source with a randomly chosen second source,
        RMS-normalizes all sources to the first one, applies random gains,
        adds RMS-normalized noise, and random-crops everything to a common
        length. Returns (samps_mix, samps_src) as numpy arrays.
        """
        def __match_length(wav, len_data):
            # Random crop of wav down to len_data samples.
            leftover = len(wav) - len_data
            idx = random.randint(0,leftover)
            wav = wav[idx:idx+len_data]
            return wav

        samps_src_reverb = []
        samps_src = []
        # max_len caps the final crop length even if all files are longer.
        src_len = [self.max_len]
        # dyanmic source choice
        # checking whether it is the same speaker
        key_random = random.choice(list(self.wave_dict_srcs[0].keys()))
        # Randomly swap which speaker slot each utterance occupies.
        idx1, idx2 = (0, 1) if random.random() > 0.5 else (1, 0)
        files = [self.wave_dict_srcs[idx1][key], self.wave_dict_srcs[idx2][key_random]]
        # NOTE(review): assumes the scp list is ordered [s1, s2, s1_reverb, s2_reverb]
        # so idx+2 addresses the reverberant copy — confirm against configs.yaml keys.
        files_reverb = [self.wave_dict_srcs[idx1+2][key], self.wave_dict_srcs[idx2+2][key_random]]

        # load
        for idx, file in enumerate(files_reverb):
            if not os.path.exists(file):
                raise FileNotFoundError("Input file {} do not exists!".format(file))
            samps_tmp_reverb, _ = audio_lib.load(file, sr=self.fs)
            samps_tmp, _ = audio_lib.load(files[idx], sr=self.fs)
            # mixing with random gains

            # Normalize every source to the first (dry) source's RMS level.
            if idx == 0: ref_rms = np.sqrt(np.mean(np.square(samps_tmp)))
            curr_rms = np.sqrt(np.mean(np.square(samps_tmp)))

            norm_factor = ref_rms / curr_rms
            samps_tmp *= norm_factor
            samps_tmp_reverb *= norm_factor

            # Random relative gain drawn from a +/-3 dB range.
            gain = pow(10,-random.uniform(-3,3)/20)
            # Speed Augmentation
            samps_src_reverb.append(gain*samps_tmp_reverb)
            samps_src.append(gain*samps_tmp)
            src_len.append(len(samps_tmp))


        # matching the audio length
        min_len = min(src_len)

        # add noise source
        file_noise = self.wave_dict_noise[key]
        samps_noise, _ = audio_lib.load(file_noise, sr=self.fs)
        curr_rms = np.sqrt(np.mean(np.square(samps_noise)))
        norm_factor = ref_rms / curr_rms
        samps_noise *= norm_factor
        # Noise gain drawn from a -3..+6 dB range relative to the reference.
        gain_noise = pow(10,-random.uniform(-6,3)/20)
        samps_noise = samps_noise*gain_noise
        src_len.append(len(samps_noise))

        # truncate
        min_len = min(src_len)
        # Stack each dry/reverb pair so one random crop applies to both copies.
        samps_src_stack = [np.stack([samps_src_reverb[idx], samps_src[idx]],axis=-1) for idx in range(len(samps_src_reverb))]
        samps_src_stack = [__match_length(s, min_len) for s in samps_src_stack]
        samps_src_reverb = [s[...,0] for s in samps_src_stack]
        samps_src = [s[...,1] for s in samps_src_stack]
        samps_noise = __match_length(samps_noise, min_len)
        # Mixture uses the reverberant sources; targets are the dry ones.
        samps_mix = sum(samps_src_reverb) + samps_noise


        # Trim so the length is divisible by 4 (matches the stride-4 encoder).
        if len(samps_mix)%4 != 0:
            remains = len(samps_mix)%4
            samps_mix = samps_mix[:-remains]
            samps_src = [s[:-remains] for s in samps_src]

        return samps_mix, samps_src

    def _direct_load(self, key):
        """Load the pre-mixed waveform and its source targets from disk.

        Returns (samps_mix, samps_src); non-test partitions are randomly
        cropped to at most max_len samples.
        """
        samps_src = []
        samps_src = []
        files = [self.wave_dict_srcs[0][key], self.wave_dict_srcs[1][key]]
        # files = [wave_dict_src[key] for wave_dict_src in self.wave_dict_srcs]
        for file in files:
            if not os.path.exists(file): raise FileNotFoundError(f"Input file {file} do not exists!")
            samps_tmp, _ = audio_lib.load(file, sr=self.fs)
            samps_src.append(samps_tmp)

        file = self.wave_dict_mix[key]
        if not os.path.exists(file): raise FileNotFoundError(f"Input file {file} do not exists!")
        samps_mix, _ = audio_lib.load(file, sr=self.fs)

        # Truncate samples as needed
        if len(samps_mix) % 4 != 0:
            remains = len(samps_mix) % 4
            samps_mix = samps_mix[:-remains]
            samps_src = [s[:-remains] for s in samps_src]

        # Random fixed-length crop for train/valid; test keeps full length.
        if self.partition != "test":
            if len(samps_mix) > self.max_len:
                start = random.randint(0,len(samps_mix)-self.max_len)
                samps_mix = samps_mix[start:start+self.max_len]
                samps_src = [s[start:start+self.max_len] for s in samps_src]

        return samps_mix, samps_src

    def __getitem__(self, index):
        # Returns a dict consumed by _collate.
        key = self.wave_keys[index]
        if any(key not in self.wave_dict_srcs[i] for i in range(len(self.wave_dict_srcs)-2)) or key not in self.wave_dict_mix: raise KeyError(f"Could not find utterance {key}")
        samps_mix, samps_src = self._dynamic_mixing(key) if self.dynamic_mixing else self._direct_load(key)
        return {"num_sample": samps_mix.shape[0], "mix": samps_mix, "src": samps_src, "key": key}
|
models/SepReformer/SepReformer_Large_DM_WHAMR/engine.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import csv
|
| 4 |
+
import time
|
| 5 |
+
import soundfile as sf
|
| 6 |
+
|
| 7 |
+
from loguru import logger
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from utils import util_engine, functions
|
| 10 |
+
from utils.decorators import *
|
| 11 |
+
from torch.utils.tensorboard import SummaryWriter
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@logger_wraps()
|
| 15 |
+
class Engine(object):
|
| 16 |
+
def __init__(self, args, config, model, dataloaders, criterions, optimizers, schedulers, gpuid, device):
    """Set up model, data, losses, optimizers and checkpoint-resume state.

    :param args: CLI namespace (reads engine_mode, out_wav_dir)
    :param config: parsed configs.yaml dict
    :param model: separation network
    :param dataloaders: dict with 'train'/'valid'/'test' loaders
    :param criterions: (PIT_SISNR_mag, PIT_SISNR_time, PIT_SISNRi, PIT_SDRi)
    :param optimizers: sequence; only the first optimizer is used
    :param schedulers: (main_scheduler, warmup_scheduler)
    :param gpuid: device ids passed to data_parallel
    :param device: primary torch device
    """
    ''' Default setting '''
    self.engine_mode = args.engine_mode
    self.out_wav_dir = args.out_wav_dir
    self.config = config
    self.gpuid = gpuid
    self.device = device
    self.model = model.to(self.device)
    self.dataloaders = dataloaders  # self.dataloaders['train'] or ['valid'] or ['test']
    self.PIT_SISNR_mag_loss, self.PIT_SISNR_time_loss, self.PIT_SISNRi_loss, self.PIT_SDRi_loss = criterions
    self.main_optimizer = optimizers[0]
    self.main_scheduler, self.warmup_scheduler = schedulers

    self.pretrain_weights_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log", "pretrain_weights")
    os.makedirs(self.pretrain_weights_path, exist_ok=True)
    self.scratch_weights_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log", "scratch_weights")
    os.makedirs(self.scratch_weights_path, exist_ok=True)

    # Resume from the pretrained directory if it contains any checkpoint,
    # otherwise fall back to scratch weights.
    # BUGFIX: the original tuple was ('.pt', '.pt', '.pkl') — '.pt' listed
    # twice and '.pth' (the extension the checkpoints are actually saved
    # with, e.g. epoch.0180.pth) never matched.
    checkpoint_exts = ('.pt', '.pth', '.pkl')
    self.checkpoint_path = self.pretrain_weights_path if any(file.endswith(checkpoint_exts) for file in os.listdir(self.pretrain_weights_path)) else self.scratch_weights_path
    self.start_epoch = util_engine.load_last_checkpoint_n_get_epoch(self.checkpoint_path, self.model, self.main_optimizer, location=self.device)

    # Logging
    util_engine.model_params_mac_summary(
        model=self.model,
        input=torch.randn(1, self.config['check_computations']['dummy_len']).to(self.device),
        dummy_input=torch.rand(1, self.config['check_computations']['dummy_len']).to(self.device),
        metrics=['ptflops', 'thop', 'torchinfo']
        # metrics=['ptflops']
    )

    logger.info(f"Clip gradient by 2-norm {self.config['engine']['clip_norm']}")
|
| 48 |
+
|
| 49 |
+
@logger_wraps()
def _train(self, dataloader, epoch):
    """Run one training epoch.

    :param dataloader: training DataLoader yielding (input_sizes, mixture, src, key)
    :param epoch: 1-based epoch index (epoch 1 drives warm-up scheduling)
    :return: (avg time-domain loss, avg freq-domain loss, num_batches)
    """
    self.model.train()
    tot_loss_freq = [0 for _ in range(self.model.num_stages)]
    tot_loss_time, num_batch = 0, 0
    pbar = tqdm(total=len(dataloader), unit='batches', bar_format='{l_bar}{bar:25}{r_bar}{bar:-10b}', colour="YELLOW", dynamic_ncols=True)
    for input_sizes, mixture, src, _ in dataloader:
        nnet_input = mixture
        nnet_input = functions.apply_cmvn(nnet_input) if self.config['engine']['mvn'] else nnet_input
        num_batch += 1
        pbar.update(1)
        # Scheduler learning rate for warm-up (Iteration-based update for transformers)
        if epoch == 1: self.warmup_scheduler.step()
        nnet_input = nnet_input.to(self.device)
        self.main_optimizer.zero_grad()
        estim_src, estim_src_bn = torch.nn.parallel.data_parallel(self.model, nnet_input, device_ids=self.gpuid)
        # Per-stage frequency-domain auxiliary losses on intermediate outputs.
        # (BUGFIX: removed the dead `cur_loss_s_bn = 0` that was immediately
        # overwritten by the list assignment below.)
        cur_loss_s_bn = []
        for idx, estim_src_value in enumerate(estim_src_bn):
            cur_loss_s_bn.append(self.PIT_SISNR_mag_loss(estims=estim_src_value, idx=idx, input_sizes=input_sizes, target_attr=src))
            tot_loss_freq[idx] += cur_loss_s_bn[idx].item() / (self.config['model']['num_spks'])
        cur_loss_s = self.PIT_SISNR_time_loss(estims=estim_src, input_sizes=input_sizes, target_attr=src)
        tot_loss_time += cur_loss_s.item() / self.config['model']['num_spks']
        # Auxiliary-loss weight: constant 0.4 until epoch 100, then decays
        # by 0.8 every 5 epochs.
        alpha = 0.4 * 0.8**(1+(epoch-101)//5) if epoch > 100 else 0.4
        cur_loss = (1-alpha) * cur_loss_s + alpha * sum(cur_loss_s_bn) / len(cur_loss_s_bn)
        cur_loss = cur_loss / self.config['model']['num_spks']
        cur_loss.backward()
        if self.config['engine']['clip_norm']: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config['engine']['clip_norm'])
        self.main_optimizer.step()
        dict_loss = {"T_Loss": tot_loss_time / num_batch}
        dict_loss.update({'F_Loss_' + str(idx): loss / num_batch for idx, loss in enumerate(tot_loss_freq)})
        pbar.set_postfix(dict_loss)
    pbar.close()
    tot_loss_freq = sum(tot_loss_freq) / len(tot_loss_freq)
    return tot_loss_time / num_batch, tot_loss_freq / num_batch, num_batch
|
| 84 |
+
|
| 85 |
+
@logger_wraps()
def _validate(self, dataloader):
    """Run one validation pass (no gradients).

    Mirrors _train's loss accumulation but performs no optimizer step.

    :param dataloader: validation DataLoader yielding (input_sizes, mixture, src, key)
    :return: (avg time-domain loss, avg freq-domain loss, num_batches)
    """
    self.model.eval()
    # One frequency-loss accumulator per separator stage.
    tot_loss_freq = [0 for _ in range(self.model.num_stages)]
    tot_loss_time, num_batch = 0, 0
    pbar = tqdm(total=len(dataloader), unit='batches', bar_format='{l_bar}{bar:5}{r_bar}{bar:-10b}', colour="RED", dynamic_ncols=True)
    with torch.inference_mode():
        for input_sizes, mixture, src, _ in dataloader:
            nnet_input = mixture
            # Optional cepstral mean-variance normalization of the input.
            nnet_input = functions.apply_cmvn(nnet_input) if self.config['engine']['mvn'] else nnet_input
            nnet_input = nnet_input.to(self.device)
            num_batch += 1
            pbar.update(1)
            estim_src, estim_src_bn = torch.nn.parallel.data_parallel(self.model, nnet_input, device_ids=self.gpuid)
            # Per-stage frequency-domain losses on intermediate outputs.
            cur_loss_s_bn = []
            for idx, estim_src_value in enumerate(estim_src_bn):
                cur_loss_s_bn.append(self.PIT_SISNR_mag_loss(estims=estim_src_value, idx=idx, input_sizes=input_sizes, target_attr=src))
                tot_loss_freq[idx] += cur_loss_s_bn[idx].item() / (self.config['model']['num_spks'])
            cur_loss_s_SDR = self.PIT_SISNR_time_loss(estims=estim_src, input_sizes=input_sizes, target_attr=src)
            tot_loss_time += cur_loss_s_SDR.item() / self.config['model']['num_spks']
            dict_loss = {"T_Loss":tot_loss_time / num_batch}
            dict_loss.update({'F_Loss_' + str(idx): loss / num_batch for idx, loss in enumerate(tot_loss_freq)})
            pbar.set_postfix(dict_loss)
    pbar.close()
    # Average the per-stage accumulators into a single scalar.
    tot_loss_freq = sum(tot_loss_freq) / len(tot_loss_freq)
    return tot_loss_time / num_batch, tot_loss_freq / num_batch, num_batch
|
| 111 |
+
|
| 112 |
+
@logger_wraps()
def _test(self, dataloader, wav_dir=None):
    """Evaluate SI-SNRi and SDRi on the test set (batch size must be 1).

    Writes per-utterance metrics to CSV next to this file; when engine_mode
    is "test_save", also writes peak-normalized mixture/output wavs.

    :param dataloader: test DataLoader (batch_size=1)
    :param wav_dir: output directory for wavs; defaults to ./wav_out
    :return: (avg SI-SNRi, avg SDRi, num_batches)
    """
    self.model.eval()
    total_loss_SISNRi, total_loss_SDRi, num_batch = 0, 0, 0
    pbar = tqdm(total=len(dataloader), unit='batches', bar_format='{l_bar}{bar:5}{r_bar}{bar:-10b}', colour="grey", dynamic_ncols=True)
    with torch.inference_mode():
        csv_file_name_sisnr = os.path.join(os.path.dirname(__file__),'test_SISNRi_value.csv')
        csv_file_name_sdr = os.path.join(os.path.dirname(__file__),'test_SDRi_value.csv')
        with open(csv_file_name_sisnr, 'w', newline='') as csvfile_sisnr, open(csv_file_name_sdr, 'w', newline='') as csvfile_sdr:
            idx = 0
            writer_sisnr = csv.writer(csvfile_sisnr, quotechar='|', quoting=csv.QUOTE_MINIMAL)
            writer_sdr = csv.writer(csvfile_sdr, quotechar='|', quoting=csv.QUOTE_MINIMAL)
            for input_sizes, mixture, src, key in dataloader:
                # BUGFIX: the original `raise("batch size is not one!!")`
                # raised a str, which is itself a TypeError at runtime;
                # raise a proper exception instead.
                if len(key) > 1:
                    raise ValueError("batch size is not one!!")
                nnet_input = mixture.to(self.device)
                num_batch += 1
                pbar.update(1)
                estim_src, _ = torch.nn.parallel.data_parallel(self.model, nnet_input, device_ids=self.gpuid)
                cur_loss_SISNRi, cur_loss_SISNRi_src = self.PIT_SISNRi_loss(estims=estim_src, mixture=mixture, input_sizes=input_sizes, target_attr=src, eps=1.0e-15)
                total_loss_SISNRi += cur_loss_SISNRi.item() / self.config['model']['num_spks']
                cur_loss_SDRi, cur_loss_SDRi_src = self.PIT_SDRi_loss(estims=estim_src, mixture=mixture, input_sizes=input_sizes, target_attr=src)
                total_loss_SDRi += cur_loss_SDRi.item() / self.config['model']['num_spks']
                # Per-utterance, per-speaker metric rows keyed by utterance id.
                writer_sisnr.writerow([key[0][:-4]] + [cur_loss_SISNRi_src[i].item() for i in range(self.config['model']['num_spks'])])
                writer_sdr.writerow([key[0][:-4]] + [cur_loss_SDRi_src[i].item() for i in range(self.config['model']['num_spks'])])
                if self.engine_mode == "test_save":
                    if wav_dir is None: wav_dir = os.path.join(os.path.dirname(__file__),"wav_out")
                    if wav_dir and not os.path.exists(wav_dir): os.makedirs(wav_dir)
                    # Peak-normalize to 0.5 before writing 8 kHz wavs.
                    mixture = torch.squeeze(mixture).cpu().data.numpy()
                    sf.write(os.path.join(wav_dir,key[0][:-4]+str(idx)+'_mixture.wav'), 0.5*mixture/max(abs(mixture)), 8000)
                    for i in range(self.config['model']['num_spks']):
                        src = torch.squeeze(estim_src[i]).cpu().data.numpy()
                        sf.write(os.path.join(wav_dir,key[0][:-4]+str(idx)+'_out_'+str(i)+'.wav'), 0.5*src/max(abs(src)), 8000)
                idx += 1
                dict_loss = {"SiSNRi": total_loss_SISNRi/num_batch, "SDRi": total_loss_SDRi/num_batch}
                pbar.set_postfix(dict_loss)
    pbar.close()
    return total_loss_SISNRi/num_batch, total_loss_SDRi/num_batch, num_batch
|
| 150 |
+
|
| 151 |
+
@logger_wraps()
def run(self):
    """Top-level entry point for the engine.

    In any "test" engine_mode, runs a single evaluation pass over the test
    set and logs SI-SNRi / SDRi. Otherwise runs the train/validate loop up
    to config['engine']['max_epoch'], stepping the LR scheduler on the
    validation loss, periodically evaluating on the test set, saving a
    checkpoint whenever validation improves, and logging scalars to
    TensorBoard.
    """
    with torch.cuda.device(self.device):
        writer_src = SummaryWriter(os.path.join(os.path.dirname(os.path.abspath(__file__)), "log/tensorboard"))
        if "test" in self.engine_mode:
            on_test_start = time.time()
            test_loss_src_time_1, test_loss_src_time_2, test_num_batch = self._test(self.dataloaders['test'], self.out_wav_dir)
            on_test_end = time.time()
            logger.info(f"[TEST] Loss(time/mini-batch) \n - Epoch {self.start_epoch:2d}: SISNRi = {test_loss_src_time_1:.4f} dB | SDRi = {test_loss_src_time_2:.4f} dB | Speed = ({on_test_end - on_test_start:.2f}s/{test_num_batch:d})")
            logger.info(f"Testing done!")
        else:
            start_time = time.time()
            # Establish a baseline validation loss when resuming from a checkpoint.
            if self.start_epoch > 1:
                init_loss_time, init_loss_freq, valid_num_batch = self._validate(self.dataloaders['valid'])
            else:
                init_loss_time, init_loss_freq = 0, 0
            end_time = time.time()
            logger.info(f"[INIT] Loss(time/mini-batch) \n - Epoch {self.start_epoch:2d}: Loss_t = {init_loss_time:.4f} dB | Loss_f = {init_loss_freq:.4f} dB | Speed = ({end_time-start_time:.2f}s)")
            # BUGFIX: initialize the best-loss tracker ONCE, before the epoch
            # loop. The original reset it to init_loss_time at the top of every
            # epoch, so save_checkpoint_per_best compared each epoch against the
            # initial loss instead of the running best.
            valid_loss_best = init_loss_time
            for epoch in range(self.start_epoch, self.config['engine']['max_epoch']):
                train_start_time = time.time()
                train_loss_src_time, train_loss_src_freq, train_num_batch = self._train(self.dataloaders['train'], epoch)
                train_end_time = time.time()
                valid_start_time = time.time()
                valid_loss_src_time, valid_loss_src_freq, valid_num_batch = self._validate(self.dataloaders['valid'])
                valid_end_time = time.time()
                # Plateau-style scheduler: step on the validation time-domain loss.
                if epoch > self.config['engine']['start_scheduling']: self.main_scheduler.step(valid_loss_src_time)
                logger.info(f"[TRAIN] Loss(time/mini-batch) \n - Epoch {epoch:2d}: Loss_t = {train_loss_src_time:.4f} dB | Loss_f = {train_loss_src_freq:.4f} dB | Speed = ({train_end_time - train_start_time:.2f}s/{train_num_batch:d})")
                logger.info(f"[VALID] Loss(time/mini-batch) \n - Epoch {epoch:2d}: Loss_t = {valid_loss_src_time:.4f} dB | Loss_f = {valid_loss_src_freq:.4f} dB | Speed = ({valid_end_time - valid_start_time:.2f}s/{valid_num_batch:d})")
                # Optional mid-training evaluation at configured epochs.
                if epoch in self.config['engine']['test_epochs']:
                    on_test_start = time.time()
                    test_loss_src_time_1, test_loss_src_time_2, test_num_batch = self._test(self.dataloaders['test'])
                    on_test_end = time.time()
                    logger.info(f"[TEST] Loss(time/mini-batch) \n - Epoch {epoch:2d}: SISNRi = {test_loss_src_time_1:.4f} dB | SDRi = {test_loss_src_time_2:.4f} dB | Speed = ({on_test_end - on_test_start:.2f}s/{test_num_batch:d})")
                # Persist a checkpoint only when validation improves; returns the new best.
                valid_loss_best = util_engine.save_checkpoint_per_best(valid_loss_best, valid_loss_src_time, train_loss_src_time, epoch, self.model, self.main_optimizer, self.checkpoint_path)
                # Logging to monitoring tools (Tensorboard && Wandb)
                writer_src.add_scalars("Metrics", {
                    'Loss_train_time': train_loss_src_time,
                    'Loss_valid_time': valid_loss_src_time}, epoch)
                writer_src.add_scalar("Learning Rate", self.main_optimizer.param_groups[0]['lr'], epoch)
                writer_src.flush()
            logger.info(f"Training for {self.config['engine']['max_epoch']} epoches done!")
|
models/SepReformer/SepReformer_Large_DM_WHAMR/main.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
from loguru import logger
|
| 4 |
+
from .dataset import get_dataloaders
|
| 5 |
+
from .model import Model
|
| 6 |
+
from .engine import Engine
|
| 7 |
+
from utils import util_system, util_implement
|
| 8 |
+
from utils.decorators import *
|
| 9 |
+
|
| 10 |
+
# Setup logger
# Writes a DEBUG-level log file next to this module; mode="w" truncates the
# previous run's log each time the module is imported.
log_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log/system_log.log")
logger.add(log_file_path, level="DEBUG", mode="w")
|
| 13 |
+
|
| 14 |
+
@logger_wraps()
def main(args):
    """Assemble the data, model, and training utilities, then run the engine.

    Reads configs.yaml next to this file, builds the dataloaders and network
    from it, resolves the GPU devices, instantiates the criterion/optimizer/
    scheduler factories, and finally hands everything to Engine.run().
    """
    # --- configuration (configs.yaml lives beside this module) ---
    yaml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "configs.yaml")
    yaml_dict = util_system.parse_yaml(yaml_path)
    config = yaml_dict["config"]  # wandb login success or fail

    # --- dataloaders [train / valid / test / etc...] ---
    dataloaders = get_dataloaders(args, config["dataset"], config["dataloader"])

    # --- network model ---
    model = Model(**config["model"])

    # --- gpu ids & primary device ---
    gpuid = tuple(map(int, config["engine"]["gpuid"].split(',')))
    device = torch.device(f'cuda:{gpuid[0]}')

    # --- criterion / optimizer / scheduler factories ---
    criterions = util_implement.CriterionFactory(config["criterion"], device).get_criterions()
    optimizers = util_implement.OptimizerFactory(config["optimizer"], model.parameters()).get_optimizers()
    schedulers = util_implement.SchedulerFactory(config["scheduler"], optimizers).get_schedulers()

    # --- build and run the engine ---
    engine = Engine(args, config, model, dataloaders, criterions, optimizers, schedulers, gpuid, device)
    engine.run()
|
models/SepReformer/SepReformer_Large_DM_WHAMR/model.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
sys.path.append('../')
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import warnings
|
| 6 |
+
warnings.filterwarnings('ignore')
|
| 7 |
+
|
| 8 |
+
from utils.decorators import *
|
| 9 |
+
from .modules.module import *
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@logger_wraps()
class Model(torch.nn.Module):
    """SepReformer separation network.

    Pipeline: AudioEncoder -> FeatureProjector -> Separator -> OutputLayer
    -> AudioDecoder (one decode per speaker). Each intermediate separator
    stage also gets its own output layer + decoder ("*_bn" lists) to produce
    auxiliary waveforms for deep-supervision losses.
    """
    def __init__(self,
                 num_stages: int,
                 num_spks: int,
                 module_audio_enc: dict,
                 module_feature_projector: dict,
                 module_separator: dict,
                 module_output_layer: dict,
                 module_audio_dec: dict):
        super().__init__()
        self.num_stages = num_stages
        self.num_spks = num_spks
        self.audio_encoder = AudioEncoder(**module_audio_enc)
        self.feature_projector = FeatureProjector(**module_feature_projector)
        self.separator = Separator(**module_separator)
        self.out_layer = OutputLayer(**module_output_layer)
        self.audio_decoder = AudioDecoder(**module_audio_dec)

        # Aux_loss: one masking output layer + decoder per separator stage.
        self.out_layer_bn = torch.nn.ModuleList([])
        self.decoder_bn = torch.nn.ModuleList([])
        for _ in range(self.num_stages):
            self.out_layer_bn.append(OutputLayer(**module_output_layer, masking=True))
            self.decoder_bn.append(AudioDecoder(**module_audio_dec))

    def forward(self, x):
        """Separate mixture waveform(s) x into per-speaker waveforms.

        Returns:
            audio: list of num_spks decoded waveforms from the final stage.
            audio_aux: per-stage lists of num_spks auxiliary waveforms,
                each trimmed to the input length, for auxiliary losses.
        """
        encoder_output = self.audio_encoder(x)
        projected_feature = self.feature_projector(encoder_output)
        last_stage_output, each_stage_outputs = self.separator(projected_feature)
        out_layer_output = self.out_layer(last_stage_output, encoder_output)
        each_spk_output = [out_layer_output[idx] for idx in range(self.num_spks)]
        audio = [self.audio_decoder(each_spk_output[idx]) for idx in range(self.num_spks)]

        # Aux_loss: upsample each stage output to the encoder resolution, then decode.
        # MODERNIZED: F.upsample is deprecated; F.interpolate with the default
        # mode='nearest' is the drop-in replacement.
        audio_aux = []
        for idx, each_stage_output in enumerate(each_stage_outputs):
            each_stage_output = self.out_layer_bn[idx](torch.nn.functional.interpolate(each_stage_output, size=encoder_output.shape[-1]), encoder_output)
            out_aux = [each_stage_output[jdx] for jdx in range(self.num_spks)]
            # Trim each decoded aux waveform back to the original input length.
            audio_aux.append([self.decoder_bn[idx](out_aux[jdx])[...,:x.shape[-1]] for jdx in range(self.num_spks)])

        return audio, audio_aux
|
models/SepReformer/SepReformer_Large_DM_WHAMR/modules/__pycache__/module.cpython-310.pyc
ADDED
|
Binary file (11.1 kB). View file
|
|
|
models/SepReformer/SepReformer_Large_DM_WHAMR/modules/__pycache__/module.cpython-38.pyc
ADDED
|
Binary file (11 kB). View file
|
|
|
models/SepReformer/SepReformer_Large_DM_WHAMR/modules/__pycache__/network.cpython-310.pyc
ADDED
|
Binary file (8.98 kB). View file
|
|
|
models/SepReformer/SepReformer_Large_DM_WHAMR/modules/__pycache__/network.cpython-38.pyc
ADDED
|
Binary file (9.09 kB). View file
|
|
|
models/SepReformer/SepReformer_Large_DM_WHAMR/modules/module.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
sys.path.append('../')
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import warnings
|
| 6 |
+
warnings.filterwarnings('ignore')
|
| 7 |
+
|
| 8 |
+
from utils.decorators import *
|
| 9 |
+
from .network import *
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class AudioEncoder(torch.nn.Module):
    """Strided 1-D convolutional front-end: waveform -> feature frames, with GELU."""

    def __init__(self, in_channels: int, out_channels: int, kernel_size: int, stride: int, groups: int, bias: bool):
        super().__init__()
        self.conv1d = torch.nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            groups=groups,
            bias=bias,
        )
        self.gelu = torch.nn.GELU()

    def forward(self, x: torch.Tensor):
        # Insert the channel axis: [T] -> [1, T] (unbatched), [B, T] -> [B, 1, T].
        with_channel = x.unsqueeze(0) if x.dim() == 1 else x.unsqueeze(1)
        return self.gelu(self.conv1d(with_channel))
|
| 24 |
+
|
| 25 |
+
class FeatureProjector(torch.nn.Module):
    """Normalize encoder features, then project channels with a 1x1-style conv."""

    def __init__(self, num_channels: int, in_channels: int, out_channels: int, kernel_size: int, bias: bool):
        super().__init__()
        # num_groups=1 makes this a global layer norm over all channels.
        self.norm = torch.nn.GroupNorm(num_groups=1, num_channels=num_channels, eps=1e-8)
        self.conv1d = torch.nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            bias=bias,
        )

    def forward(self, x: torch.Tensor):
        return self.conv1d(self.norm(x))
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class Separator(torch.nn.Module):
    """U-Net-style separator: a temporal contracting path, a bottleneck that
    splits features per speaker, and a temporal expanding path that fuses
    encoder skip connections. The stage submodules are declared as classes
    inside __init__, keeping them private to this module.
    """
    def __init__(self, num_stages: int, relative_positional_encoding: dict, enc_stage: dict, spk_split_stage: dict, simple_fusion:dict, dec_stage: dict):
        super().__init__()
        
        class RelativePositionalEncoding(torch.nn.Module):
            # Learned relative-position embeddings shared by every attention stage.
            def __init__(self, in_channels: int, num_heads: int, maxlen: int, embed_v=False):
                super().__init__()
                self.in_channels = in_channels
                self.num_heads = num_heads
                self.embedding_dim = self.in_channels // self.num_heads  # per-head dim
                self.maxlen = maxlen
                # Relative offsets in [-maxlen, maxlen) are shifted to [0, 2*maxlen) for lookup.
                self.pe_k = torch.nn.Embedding(num_embeddings=2*maxlen, embedding_dim=self.embedding_dim)
                self.pe_v = torch.nn.Embedding(num_embeddings=2*maxlen, embedding_dim=self.embedding_dim) if embed_v else None
            
            def forward(self, pos_seq: torch.Tensor):
                # NOTE(review): clamp_ and += mutate pos_seq in place — the
                # caller's tensor is modified as a side effect.
                pos_seq.clamp_(-self.maxlen, self.maxlen - 1)
                pos_seq += self.maxlen
                pe_k_output = self.pe_k(pos_seq)
                pe_v_output = self.pe_v(pos_seq) if self.pe_v is not None else None
                return pe_k_output, pe_v_output
        
        class SepEncStage(torch.nn.Module):
            """One contracting stage: two (global, local) block pairs, then an
            optional depthwise strided conv that halves the time resolution."""
            def __init__(self, global_blocks: dict, local_blocks: dict, down_conv_layer: dict, down_conv=True):
                super().__init__()
                
                class DownConvLayer(torch.nn.Module):
                    """Depthwise strided conv (stride 2) + BatchNorm + GELU."""
                    def __init__(self, in_channels: int, samp_kernel_size: int):
                        super().__init__()
                        self.down_conv = torch.nn.Conv1d(
                            in_channels=in_channels, out_channels=in_channels, kernel_size=samp_kernel_size, stride=2, padding=(samp_kernel_size-1)//2, groups=in_channels)
                        self.BN = torch.nn.BatchNorm1d(num_features=in_channels)
                        self.gelu = torch.nn.GELU()
                    
                    def forward(self, x: torch.Tensor):
                        # Permute so the conv runs over the last axis, then restore.
                        x = x.permute([0, 2, 1])
                        x = self.down_conv(x)
                        x = self.BN(x)
                        x = self.gelu(x)
                        x = x.permute([0, 2, 1])
                        return x
                
                self.g_block_1 = GlobalBlock(**global_blocks)
                self.l_block_1 = LocalBlock(**local_blocks)
                
                self.g_block_2 = GlobalBlock(**global_blocks)
                self.l_block_2 = LocalBlock(**local_blocks)
                
                self.downconv = DownConvLayer(**down_conv_layer) if down_conv == True else None
            
            def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
                '''
                x: [B, N, T]
                Returns (downsampled x, pre-downsampling skip connection).
                '''
                # Global blocks take [B, N, T]; local blocks take [B, T, N],
                # hence the permute around each local block.
                x = self.g_block_1(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_1(x)
                x = x.permute(0, 2, 1).contiguous()
                
                x = self.g_block_2(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_2(x)
                x = x.permute(0, 2, 1).contiguous()
                
                skip = x
                if self.downconv:
                    x = x.permute(0, 2, 1).contiguous()
                    x = self.downconv(x)
                    x = x.permute(0, 2, 1).contiguous()
                # [BK, S, N]
                return x, skip
        
        class SpkSplitStage(torch.nn.Module):
            """Expand features to num_spks copies: [B, N, T] -> [B*num_spks, N, T]."""
            def __init__(self, in_channels: int, num_spks: int):
                super().__init__()
                self.linear = torch.nn.Sequential(
                    torch.nn.Conv1d(in_channels, 4*in_channels*num_spks, kernel_size=1),
                    torch.nn.GLU(dim=-2),
                    torch.nn.Conv1d(2*in_channels*num_spks, in_channels*num_spks, kernel_size=1))
                self.norm = torch.nn.GroupNorm(1, in_channels, eps=1e-8)
                self.num_spks = num_spks
            
            def forward(self, x: torch.Tensor):
                x = self.linear(x)
                B, _, T = x.shape
                # Fold the speaker axis into the batch dimension.
                x = x.view(B*self.num_spks,-1, T).contiguous()
                x = self.norm(x)
                return x
        
        class SepDecStage(torch.nn.Module):
            """One expanding stage: three (global, local, speaker-attention) triplets."""
            def __init__(self, num_spks: int, global_blocks: dict, local_blocks: dict, spk_attention: dict):
                super().__init__()
                
                self.g_block_1 = GlobalBlock(**global_blocks)
                self.l_block_1 = LocalBlock(**local_blocks)
                self.spk_attn_1 = SpkAttention(**spk_attention)
                
                self.g_block_2 = GlobalBlock(**global_blocks)
                self.l_block_2 = LocalBlock(**local_blocks)
                self.spk_attn_2 = SpkAttention(**spk_attention)
                
                self.g_block_3 = GlobalBlock(**global_blocks)
                self.l_block_3 = LocalBlock(**local_blocks)
                self.spk_attn_3 = SpkAttention(**spk_attention)
                
                self.num_spk = num_spks
            
            def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
                '''
                x: [B, N, T] (speaker axis folded into B)
                '''
                # [BS, K, H]
                x = self.g_block_1(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_1(x)
                x = x.permute(0, 2, 1).contiguous()
                x = self.spk_attn_1(x, self.num_spk)
                
                x = self.g_block_2(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_2(x)
                x = x.permute(0, 2, 1).contiguous()
                x = self.spk_attn_2(x, self.num_spk)
                
                x = self.g_block_3(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_3(x)
                x = x.permute(0, 2, 1).contiguous()
                x = self.spk_attn_3(x, self.num_spk)
                
                skip = x
                
                return x, skip
        
        self.num_stages = num_stages
        self.pos_emb = RelativePositionalEncoding(**relative_positional_encoding)
        
        # Temporal Contracting Part
        self.enc_stages = torch.nn.ModuleList([])
        for _ in range(self.num_stages):
            self.enc_stages.append(SepEncStage(**enc_stage, down_conv=True))
        
        self.bottleneck_G = SepEncStage(**enc_stage, down_conv=False)
        self.spk_split_block = SpkSplitStage(**spk_split_stage)
        
        # Temporal Expanding Part
        self.simple_fusion = torch.nn.ModuleList([])
        self.dec_stages = torch.nn.ModuleList([])
        for _ in range(self.num_stages):
            # 1x1 conv fusing [decoder path ; encoder skip] back to one width.
            self.simple_fusion.append(torch.nn.Conv1d(in_channels=simple_fusion['out_channels']*2,out_channels=simple_fusion['out_channels'], kernel_size=1))
            self.dec_stages.append(SepDecStage(**dec_stage))
    
    def forward(self, input: torch.Tensor):
        '''input: [B, N, L] projected encoder features.
        Returns (last_stage_output, each_stage_outputs) where the list holds
        the pre-upsample tensor entering each expanding stage (for aux losses).
        '''
        # feature projection
        x, _ = self.pad_signal(input)
        len_x = x.shape[-1]
        # Temporal Contracting Part
        # Relative-position index matrix at the bottleneck resolution.
        pos_seq = torch.arange(0, len_x//2**self.num_stages).long().to(x.device)
        pos_seq = pos_seq[:, None] - pos_seq[None, :]
        pos_k, _ = self.pos_emb(pos_seq)
        skip = []
        for idx in range(self.num_stages):
            x, skip_ = self.enc_stages[idx](x, pos_k)
            skip_ = self.spk_split_block(skip_)
            skip.append(skip_)
        x, _ = self.bottleneck_G(x, pos_k)
        x = self.spk_split_block(x) # B, 2F, T
        
        each_stage_outputs = []
        # Temporal Expanding Part
        for idx in range(self.num_stages):
            each_stage_outputs.append(x)
            idx_en = self.num_stages - (idx + 1)
            # NOTE(review): F.upsample is deprecated in favor of F.interpolate.
            x = torch.nn.functional.upsample(x, skip[idx_en].shape[-1])
            x = torch.cat([x,skip[idx_en]],dim=1)
            x = self.simple_fusion[idx](x)
            x, _ = self.dec_stages[idx](x, pos_k)
        
        last_stage_output = x
        return last_stage_output, each_stage_outputs
    
    def pad_signal(self, input: torch.Tensor):
        """Zero-pad the time axis to a multiple of 2**num_stages.

        Returns (padded input, number of padded frames)."""
        # (B, T) or (B, 1, T)
        if input.dim() == 1: input = input.unsqueeze(0)
        elif input.dim() not in [2, 3]: raise RuntimeError("Input can only be 2 or 3 dimensional.")
        elif input.dim() == 2: input = input.unsqueeze(1)
        L = 2**self.num_stages
        batch_size = input.size(0)
        ndim = input.size(1)
        nframe = input.size(2)
        padded_len = (nframe//L + 1)*L
        rest = 0 if nframe%L == 0 else padded_len - nframe
        if rest > 0:
            pad = torch.autograd.Variable(torch.zeros(batch_size, ndim, rest)).type(input.type()).to(input.device)
            input = torch.cat([input, pad], dim=-1)
        return input, rest
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
class OutputLayer(torch.nn.Module):
    """Map separator features back to encoder width and regroup per speaker.

    Expects x with the speaker axis folded into the batch dimension
    (B*num_spks, ., .); returns [num_spks, B, N, L]. With masking=True the
    features gate the (replicated) encoder output instead of being used
    directly.
    """
    def __init__(self, in_channels: int, out_channels: int, num_spks: int, masking: bool = False):
        super().__init__()
        # feature expansion back
        self.masking = masking
        self.spe_block = Masking(in_channels, Activation_mask="ReLU", concat_opt=None)
        self.num_spks = num_spks
        # Linear -> GLU -> Linear over the channel axis (applied time-major).
        self.end_conv1x1 = torch.nn.Sequential(
            torch.nn.Linear(out_channels, 4*out_channels),
            torch.nn.GLU(),
            torch.nn.Linear(2*out_channels, in_channels))
    
    def forward(self, x: torch.Tensor, input: torch.Tensor):
        # Trim the (possibly padded) separator output to the encoder length.
        x = x[...,:input.shape[-1]]
        x = x.permute([0, 2, 1])
        x = self.end_conv1x1(x)
        x = x.permute([0, 2, 1])
        B, N, L = x.shape
        # Recover the true batch size; x's batch axis is B*num_spks here.
        B = B // self.num_spks
        
        if self.masking:
            # Replicate the encoder output once per speaker so each speaker's
            # features can gate its own copy: [B, N, L] -> [B*num_spks, N, L].
            input = input.expand(self.num_spks, B, N, L).transpose(0,1).contiguous()
            input = input.view(B*self.num_spks, N, L)
            x = self.spe_block(x, input)
        
        x = x.view(B, self.num_spks, N, L)
        # [spks, B, N, L]
        x = x.transpose(0, 1)
        return x
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
class AudioDecoder(torch.nn.ConvTranspose1d):
    '''
    Decoder of the TasNet
    This module can be seen as the gradient of Conv1d with respect to its input.
    It is also known as a fractionally-strided convolution
    or a deconvolution (although it is not an actual deconvolution operation).
    '''
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, x):
        """Transpose-convolve features back to a waveform.

        Accepts [B, N, L] or [N, L]; squeezes the singleton channel axis
        from the result. Raises RuntimeError for any other rank.
        """
        # BUGFIX: the original raised via 'self.__name__', which does not exist
        # on nn.Module instances (the raise itself crashed with AttributeError),
        # and the message claimed "3/4D" while the check accepts 2/3-D input.
        if x.dim() not in [2, 3]: raise RuntimeError("{} accept 2/3D tensor as input".format(type(self).__name__))
        # Add a batch axis for 2-D input before the transposed convolution.
        x = super().forward(x if x.dim() == 3 else torch.unsqueeze(x, 1))
        # Drop singleton dims; keep at least 1-D for a fully-squeezed result.
        x = torch.squeeze(x, dim=1) if torch.squeeze(x).dim() == 1 else torch.squeeze(x)
        return x
|
models/SepReformer/SepReformer_Large_DM_WHAMR/modules/network.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import math
|
| 3 |
+
import numpy
|
| 4 |
+
from utils.decorators import *
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class LayerScale(torch.nn.Module):
    """Learnable per-channel scaling of residual branches.

    `dims` selects the broadcast shape of the scale parameter:
    1 -> (C,), 2 -> (1, C), 3 -> (1, 1, C).
    """
    def __init__(self, dims, input_size, Layer_scale_init=1.0e-5):
        super().__init__()
        if dims == 1:
            self.layer_scale = torch.nn.Parameter(torch.full((input_size,), Layer_scale_init), requires_grad=True)
        elif dims == 2:
            self.layer_scale = torch.nn.Parameter(torch.full((1, input_size), Layer_scale_init), requires_grad=True)
        elif dims == 3:
            self.layer_scale = torch.nn.Parameter(torch.full((1, 1, input_size), Layer_scale_init), requires_grad=True)

    def forward(self, x):
        return x * self.layer_scale
|
| 19 |
+
|
| 20 |
+
class Masking(torch.nn.Module):
    """Gated masking block: an activation of x (optionally after fusing x with
    the skip via a pointwise conv) multiplies the skip connection.

    Args:
        input_dim: channel width of x and skip.
        Activation_mask: 'Sigmoid' or 'ReLU' gate non-linearity.
        **options: recognized key 'concat_opt' — truthy enables the
            concat + pointwise-conv fusion path.
    """
    def __init__(self, input_dim, Activation_mask='Sigmoid', **options):
        super(Masking, self).__init__()

        self.options = options
        # ROBUSTNESS: .get() tolerates a missing 'concat_opt' key; the original
        # indexed self.options['concat_opt'] and raised KeyError when callers
        # omitted the kwarg entirely.
        if self.options.get('concat_opt'):
            self.pw_conv = torch.nn.Conv1d(input_dim*2, input_dim, 1, stride=1, padding=0)

        if Activation_mask == 'Sigmoid':
            self.gate_act = torch.nn.Sigmoid()
        elif Activation_mask == 'ReLU':
            self.gate_act = torch.nn.ReLU()

    def forward(self, x, skip):
        """Return gate(fuse(x[, skip])) * skip."""
        if self.options.get('concat_opt'):
            # Fuse along the channel axis, then project back to input_dim.
            y = torch.cat([x, skip], dim=-2)
            y = self.pw_conv(y)
        else:
            y = x
        y = self.gate_act(y) * skip

        return y
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class GCFN(torch.nn.Module):
    """Gated convolutional feed-forward network with a LayerScale residual.

    LayerNorm -> Linear (x6 width) -> depthwise conv over time -> GLU ->
    Linear back to C, wrapped as x + scale * f(x).
    """
    def __init__(self, in_channels, dropout_rate, Layer_scale_init=1.0e-5):
        super().__init__()
        self.net1 = torch.nn.Sequential(
            torch.nn.LayerNorm(in_channels),
            torch.nn.Linear(in_channels, in_channels*6))
        self.depthwise = torch.nn.Conv1d(in_channels*6, in_channels*6, 3, padding=1, groups=in_channels*6)
        self.net2 = torch.nn.Sequential(
            torch.nn.GLU(),
            torch.nn.Dropout(dropout_rate),
            torch.nn.Linear(in_channels*3, in_channels),
            torch.nn.Dropout(dropout_rate))
        self.Layer_scale = LayerScale(dims=3, input_size=in_channels, Layer_scale_init=Layer_scale_init)

    def forward(self, x):
        hidden = self.net1(x)                                   # [B, T, 6C]
        # Conv1d wants channels first; transpose in, transpose back out.
        hidden = self.depthwise(hidden.permute(0, 2, 1).contiguous())
        hidden = self.net2(hidden.permute(0, 2, 1).contiguous())  # [B, T, C]
        return x + self.Layer_scale(hidden)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class MultiHeadAttention(torch.nn.Module):
    """
    Multi-Head Attention layer with optional relative positional bias.

    :param int n_head: the number of heads
    :param int in_channels: the number of features
    :param float dropout_rate: dropout rate
    :param float Layer_scale_init: initial value of the LayerScale residual scale
    """
    def __init__(self, n_head: int, in_channels: int, dropout_rate: float, Layer_scale_init=1.0e-5):
        super().__init__()
        assert in_channels % n_head == 0
        self.d_k = in_channels // n_head # We assume d_v always equals d_k
        self.h = n_head
        self.layer_norm = torch.nn.LayerNorm(in_channels)
        self.linear_q = torch.nn.Linear(in_channels, in_channels)
        self.linear_k = torch.nn.Linear(in_channels, in_channels)
        self.linear_v = torch.nn.Linear(in_channels, in_channels)
        self.linear_out = torch.nn.Linear(in_channels, in_channels)
        # Last-computed attention map is kept for inspection/debugging.
        self.attn = None
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.Layer_scale = LayerScale(dims=3, input_size=in_channels, Layer_scale_init=Layer_scale_init)
    
    def forward(self, x, pos_k, mask):
        """
        Compute 'Scaled Dot Product Attention'.
        :param torch.Tensor x: input features (batch, time1, d_model)
        :param torch.Tensor pos_k: relative-position key embeddings, or None
        :param torch.Tensor mask: (batch, time1, time2), or None
        :return torch.Tensor: attentioned and transformed `value` (batch, time1, d_model)
             weighted by the query dot key attention (batch, head, time1, time2)
        """
        n_batch = x.size(0)
        # Pre-norm before the projections.
        x = self.layer_norm(x)
        q = self.linear_q(x).view(n_batch, -1, self.h, self.d_k) #(b, t, d)
        k = self.linear_k(x).view(n_batch, -1, self.h, self.d_k) #(b, t, d)
        v = self.linear_v(x).view(n_batch, -1, self.h, self.d_k)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2) # (batch, head, time2, d_k)
        v = v.transpose(1, 2) # (batch, head, time2, d_k)
        # Content-content term.
        A = torch.matmul(q, k.transpose(-2, -1))
        reshape_q = q.contiguous().view(n_batch * self.h, -1, self.d_k).transpose(0,1)
        if pos_k is not None:
            # Content-position term from the relative positional embeddings.
            B = torch.matmul(reshape_q, pos_k.transpose(-2, -1))
            B = B.transpose(0, 1).view(n_batch, self.h, pos_k.size(0), pos_k.size(1))
            scores = (A + B) / math.sqrt(self.d_k)
        else:
            scores = A / math.sqrt(self.d_k)
        if mask is not None:
            mask = mask.unsqueeze(1).eq(0) # (batch, 1, time1, time2)
            # Fill masked positions with the dtype's minimum before softmax,
            # then zero them again afterwards so they contribute nothing.
            min_value = float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min)
            scores = scores.masked_fill(mask, min_value)
            self.attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) # (batch, head, time1, time2)
        else:
            self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
        p_attn = self.dropout(self.attn)
        x = torch.matmul(p_attn, v) # (batch, head, time1, d_k)
        x = x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) # (batch, time1, d_model)
        return self.Layer_scale(self.dropout(self.linear_out(x))) # (batch, time1, d_model)
|
| 125 |
+
|
| 126 |
+
class EGA(torch.nn.Module):
    """Efficient global attention: self-attend at a downsampled rate, upsample
    the result back, and add it gated by a learned sigmoid of the input.
    """
    def __init__(self, in_channels: int, num_mha_heads: int, dropout_rate: float):
        super().__init__()
        self.block = torch.nn.ModuleDict({
            'self_attn': MultiHeadAttention(
                n_head=num_mha_heads, in_channels=in_channels, dropout_rate=dropout_rate),
            'linear': torch.nn.Sequential(
                torch.nn.LayerNorm(normalized_shape=in_channels),
                torch.nn.Linear(in_features=in_channels, out_features=in_channels),
                torch.nn.Sigmoid())
        })

    def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
        """
        Compute encoded features.
        :param torch.Tensor x: encoded source features (batch, size, max_time_in)
        :param torch.Tensor pos_k: relative-position embeddings; pos_k.shape[0]
            sets the downsampled attention length
        :rtype: torch.Tensor (batch, max_time_in, size)
        """
        # Pool the time axis down to the attention length implied by pos_k.
        down_len = pos_k.shape[0]
        x_down = torch.nn.functional.adaptive_avg_pool1d(input=x, output_size=down_len)
        x = x.permute([0, 2, 1])
        x_down = x_down.permute([0, 2, 1])
        x_down = self.block['self_attn'](x_down, pos_k, None)
        x_down = x_down.permute([0, 2, 1])
        # MODERNIZED: F.upsample is deprecated; F.interpolate with the default
        # mode='nearest' is the drop-in replacement.
        x_downup = torch.nn.functional.interpolate(input=x_down, size=x.shape[1])
        x_downup = x_downup.permute([0, 2, 1])
        # Residual add, gated per position/channel by a sigmoid of x.
        x = x + self.block['linear'](x) * x_downup

        return x
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class CLA(torch.nn.Module):
    """Convolutional Local Attention block.

    LayerNorm -> Linear (x2 width) -> GLU -> depthwise Conv1d -> Linear
    (x2 width) -> BatchNorm -> GELU/Linear/Dropout projection, added back
    to the input through a learnable LayerScale residual.
    """

    def __init__(self, in_channels, kernel_size, dropout_rate, Layer_scale_init=1.0e-5):
        super().__init__()
        hidden = 2 * in_channels
        self.layer_norm = torch.nn.LayerNorm(in_channels)
        self.linear1 = torch.nn.Linear(in_channels, hidden)
        self.GLU = torch.nn.GLU()
        self.dw_conv_1d = torch.nn.Conv1d(in_channels, in_channels, kernel_size, padding='same', groups=in_channels)
        self.linear2 = torch.nn.Linear(in_channels, hidden)
        self.BN = torch.nn.BatchNorm1d(hidden)
        self.linear3 = torch.nn.Sequential(
            torch.nn.GELU(),
            torch.nn.Linear(hidden, in_channels),
            torch.nn.Dropout(dropout_rate))
        self.Layer_scale = LayerScale(dims=3, input_size=in_channels, Layer_scale_init=Layer_scale_init)

    def forward(self, x):
        """Apply the local block; input/output shape is (B, T, F)."""
        out = self.GLU(self.linear1(self.layer_norm(x)))
        # Depthwise conv operates channel-first: (B, F, T).
        out = self.dw_conv_1d(out.transpose(1, 2)).transpose(1, 2)
        out = self.linear2(out)
        # BatchNorm1d also expects channel-first layout.
        out = self.BN(out.transpose(1, 2)).transpose(1, 2)
        out = self.linear3(out)
        # Scaled residual connection.
        return x + self.Layer_scale(out)
|
| 188 |
+
|
| 189 |
+
class GlobalBlock(torch.nn.Module):
    """Global modelling block: EGA attention followed by a GCFN feed-forward."""

    def __init__(self, in_channels: int, num_mha_heads: int, dropout_rate: float):
        super().__init__()
        submodules = {
            'ega': EGA(
                num_mha_heads=num_mha_heads, in_channels=in_channels, dropout_rate=dropout_rate),
            'gcfn': GCFN(in_channels=in_channels, dropout_rate=dropout_rate),
        }
        self.block = torch.nn.ModuleDict(submodules)

    def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
        """
        Compute encoded features.

        :param torch.Tensor x: input features, channel-first (batch, channels, time)
        :param torch.Tensor pos_k: relative positional encoding consumed by EGA
        :rtype: torch.Tensor, channel-first (batch, channels, time)
        """
        # EGA returns (B, T, C); GCFN keeps that layout.
        out = self.block['ega'](x, pos_k)
        out = self.block['gcfn'](out)
        # Restore channel-first layout for the caller.
        return out.permute([0, 2, 1])
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
class LocalBlock(torch.nn.Module):
    """Local modelling block: CLA convolution module followed by a GCFN feed-forward."""

    def __init__(self, in_channels: int, kernel_size: int, dropout_rate: float):
        super().__init__()
        submodules = {
            'cla': CLA(in_channels, kernel_size, dropout_rate),
            'gcfn': GCFN(in_channels, dropout_rate),
        }
        self.block = torch.nn.ModuleDict(submodules)

    def forward(self, x: torch.Tensor):
        """Run CLA then GCFN on *x* and return the result."""
        out = self.block['cla'](x)
        out = self.block['gcfn'](out)
        return out
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
class SpkAttention(torch.nn.Module):
    """Inter-speaker attention.

    Reshapes a batch that stacks the speakers along the batch dimension so
    that self-attention runs ACROSS the speaker axis at every time step,
    then applies a GCFN feed-forward on the time axis.
    """

    def __init__(self, in_channels: int, num_mha_heads: int, dropout_rate: float):
        super().__init__()
        # Attention over the (tiny) speaker axis; no positional encoding needed.
        self.self_attn = MultiHeadAttention(n_head=num_mha_heads, in_channels=in_channels, dropout_rate=dropout_rate)
        self.feed_forward = GCFN(in_channels=in_channels, dropout_rate=dropout_rate)

    def forward(self, x: torch.Tensor, num_spk: int):
        """
        Compute speaker-attended features.

        :param torch.Tensor x: features (batch*num_spk, channels, time);
            B below must be divisible by num_spk
        :param int num_spk: number of speakers stacked in the batch dimension
        :rtype: torch.Tensor, same shape as the input
        """
        B, F, T = x.shape
        # (B, F, T) -> (B/num_spk, num_spk, F, T): recover the speaker axis.
        x = x.view(B//num_spk, num_spk, F, T).contiguous()
        # -> (B/num_spk, T, num_spk, F): move time next to batch.
        x = x.permute([0, 3, 1, 2]).contiguous()
        # -> (B/num_spk * T, num_spk, F): each time step is a "sequence"
        # of length num_spk, so attention mixes information across speakers.
        x = x.view(-1, num_spk, F).contiguous()
        x = x + self.self_attn(x, None, None)
        # Invert the reshape/permute chain back to (B, F, T).
        x = x.view(B//num_spk, T, num_spk, F).contiguous()
        x = x.permute([0, 2, 3, 1]).contiguous()
        x = x.view(B, F, T).contiguous()
        # GCFN expects (B, T, F); transpose, apply, transpose back.
        x = x.permute([0, 2, 1])
        x = self.feed_forward(x)
        x = x.permute([0, 2, 1])
        return x
|
models/SepReformer/SepReformer_Large_DM_WSJ0/configs.yaml
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
config:
|
| 2 |
+
dataset:
|
| 3 |
+
max_len : 32000
|
| 4 |
+
sampling_rate: 8000
|
| 5 |
+
scp_dir: "data/scp_ss_8k"
|
| 6 |
+
train:
|
| 7 |
+
mixture: "tr_mix.scp"
|
| 8 |
+
spk1: "tr_s1.scp"
|
| 9 |
+
spk2: "tr_s2.scp"
|
| 10 |
+
dynamic_mixing: true
|
| 11 |
+
valid:
|
| 12 |
+
mixture: "cv_mix.scp"
|
| 13 |
+
spk1: "cv_s1.scp"
|
| 14 |
+
spk2: "cv_s2.scp"
|
| 15 |
+
test:
|
| 16 |
+
mixture: "tt_mix.scp"
|
| 17 |
+
spk1: "tt_s1.scp"
|
| 18 |
+
spk2: "tt_s2.scp"
|
| 19 |
+
dataloader:
|
| 20 |
+
batch_size: 2
|
| 21 |
+
pin_memory: false
|
| 22 |
+
num_workers: 12
|
| 23 |
+
drop_last: false
|
| 24 |
+
model:
|
| 25 |
+
num_stages: &var_model_num_stages 4 # R
|
| 26 |
+
num_spks: &var_model_num_spks 2
|
| 27 |
+
module_audio_enc:
|
| 28 |
+
in_channels: 1
|
| 29 |
+
out_channels: &var_model_audio_enc_out_channels 256
|
| 30 |
+
kernel_size: &var_model_audio_enc_kernel_size 16 # L
|
| 31 |
+
stride: &var_model_audio_enc_stride 4 # S
|
| 32 |
+
groups: 1
|
| 33 |
+
bias: false
|
| 34 |
+
module_feature_projector:
|
| 35 |
+
num_channels: *var_model_audio_enc_out_channels
|
| 36 |
+
in_channels: *var_model_audio_enc_out_channels
|
| 37 |
+
out_channels: &feature_projector_out_channels 256 # F
|
| 38 |
+
kernel_size: 1
|
| 39 |
+
bias: false
|
| 40 |
+
module_separator:
|
| 41 |
+
num_stages: *var_model_num_stages
|
| 42 |
+
relative_positional_encoding:
|
| 43 |
+
in_channels: *feature_projector_out_channels
|
| 44 |
+
num_heads: 8
|
| 45 |
+
maxlen: 2000
|
| 46 |
+
embed_v: false
|
| 47 |
+
enc_stage:
|
| 48 |
+
global_blocks:
|
| 49 |
+
in_channels: *feature_projector_out_channels
|
| 50 |
+
num_mha_heads: 8
|
| 51 |
+
dropout_rate: 0.1
|
| 52 |
+
local_blocks:
|
| 53 |
+
in_channels: *feature_projector_out_channels
|
| 54 |
+
kernel_size: 65
|
| 55 |
+
dropout_rate: 0.1
|
| 56 |
+
down_conv_layer:
|
| 57 |
+
in_channels: *feature_projector_out_channels
|
| 58 |
+
samp_kernel_size: &var_model_samp_kernel_size 5
|
| 59 |
+
spk_split_stage:
|
| 60 |
+
in_channels: *feature_projector_out_channels
|
| 61 |
+
num_spks: *var_model_num_spks
|
| 62 |
+
simple_fusion:
|
| 63 |
+
out_channels: *feature_projector_out_channels
|
| 64 |
+
dec_stage:
|
| 65 |
+
num_spks: *var_model_num_spks
|
| 66 |
+
global_blocks:
|
| 67 |
+
in_channels: *feature_projector_out_channels
|
| 68 |
+
num_mha_heads: 8
|
| 69 |
+
dropout_rate: 0.1
|
| 70 |
+
local_blocks:
|
| 71 |
+
in_channels: *feature_projector_out_channels
|
| 72 |
+
kernel_size: 65
|
| 73 |
+
dropout_rate: 0.1
|
| 74 |
+
spk_attention:
|
| 75 |
+
in_channels: *feature_projector_out_channels
|
| 76 |
+
num_mha_heads: 8
|
| 77 |
+
dropout_rate: 0.1
|
| 78 |
+
module_output_layer:
|
| 79 |
+
in_channels: *var_model_audio_enc_out_channels
|
| 80 |
+
out_channels: *feature_projector_out_channels
|
| 81 |
+
num_spks: *var_model_num_spks
|
| 82 |
+
module_audio_dec:
|
| 83 |
+
in_channels: *var_model_audio_enc_out_channels
|
| 84 |
+
out_channels: 1
|
| 85 |
+
kernel_size: *var_model_audio_enc_kernel_size
|
| 86 |
+
stride: *var_model_audio_enc_stride
|
| 87 |
+
bias: false
|
| 88 |
+
criterion: ### Ref: https://pytorch.org/docs/stable/nn.html#loss-functions
|
| 89 |
+
name: ["PIT_SISNR_mag", "PIT_SISNR_time", "PIT_SISNRi", "PIT_SDRi"] ### Choose a torch.nn's loss function class(=attribute) e.g. ["L1Loss", "MSELoss", "CrossEntropyLoss", ...] / You can also build your optimizer :)
|
| 90 |
+
PIT_SISNR_mag:
|
| 91 |
+
frame_length: 512
|
| 92 |
+
frame_shift: 128
|
| 93 |
+
window: 'hann'
|
| 94 |
+
num_stages: *var_model_num_stages
|
| 95 |
+
num_spks: *var_model_num_spks
|
| 96 |
+
scale_inv: true
|
| 97 |
+
mel_opt: false
|
| 98 |
+
PIT_SISNR_time:
|
| 99 |
+
num_spks: *var_model_num_spks
|
| 100 |
+
scale_inv: true
|
| 101 |
+
PIT_SISNRi:
|
| 102 |
+
num_spks: *var_model_num_spks
|
| 103 |
+
scale_inv: true
|
| 104 |
+
PIT_SDRi:
|
| 105 |
+
dump: 0
|
| 106 |
+
optimizer: ### Ref: https://pytorch.org/docs/stable/optim.html#algorithms
|
| 107 |
+
name: ["AdamW"] ### Choose a torch.optim's class(=attribute) e.g. ["Adam", "AdamW", "SGD", ...] / You can also build your optimizer :)
|
| 108 |
+
AdamW:
|
| 109 |
+
lr: 2.0e-4
|
| 110 |
+
weight_decay: 1.0e-2
|
| 111 |
+
scheduler: ### Ref(+ find "How to adjust learning rate"): https://pytorch.org/docs/stable/optim.html#algorithms
|
| 112 |
+
name: ["ReduceLROnPlateau", "WarmupConstantSchedule"] ### Choose a torch.optim.lr_scheduler's class(=attribute) e.g. ["StepLR", "ReduceLROnPlateau", "Custom"] / You can also build your scheduler :)
|
| 113 |
+
ReduceLROnPlateau:
|
| 114 |
+
mode: "min"
|
| 115 |
+
min_lr: 1.0e-10
|
| 116 |
+
factor: 0.8
|
| 117 |
+
patience: 2
|
| 118 |
+
WarmupConstantSchedule:
|
| 119 |
+
warmup_steps: 1000
|
| 120 |
+
check_computations:
|
| 121 |
+
dummy_len: 16000
|
| 122 |
+
engine:
|
| 123 |
+
max_epoch: 200
|
| 124 |
+
gpuid: "0" ### "0"(single-gpu) or "0, 1" (multi-gpu)
|
| 125 |
+
mvn: false
|
| 126 |
+
clip_norm: 5
|
| 127 |
+
start_scheduling: 50
|
| 128 |
+
test_epochs: [50, 80, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 199]
|
models/SepReformer/SepReformer_Large_DM_WSJ0/dataset.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import random
|
| 4 |
+
import librosa as audio_lib
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
from utils import util_dataset
|
| 8 |
+
from utils.decorators import *
|
| 9 |
+
from loguru import logger
|
| 10 |
+
from torch.utils.data import Dataset, DataLoader
|
| 11 |
+
|
| 12 |
+
@logger_wraps()
def get_dataloaders(args, dataset_config, loader_config):
    """Build a DataLoader per partition.

    Only the "test" partition is built when the engine mode contains "test";
    otherwise train/valid/test are all created. Dynamic mixing is enabled
    only for the training partition (and only if the config requests it).
    """
    partitions = ["test"] if "test" in args.engine_mode else ["train", "valid", "test"]

    def _build(partition):
        part_cfg = dataset_config[partition]
        mix_path = os.path.join(dataset_config["scp_dir"], part_cfg['mixture'])
        # Every key starting with 'spk' points at one source scp file.
        src_paths = [os.path.join(dataset_config["scp_dir"], part_cfg[spk_key])
                     for spk_key in part_cfg if spk_key.startswith('spk')]
        use_dm = part_cfg["dynamic_mixing"] if partition == 'train' else False
        dataset = MyDataset(
            max_len=dataset_config['max_len'],
            fs=dataset_config['sampling_rate'],
            partition=partition,
            wave_scp_srcs=src_paths,
            wave_scp_mix=mix_path,
            dynamic_mixing=use_dm)
        return DataLoader(
            dataset=dataset,
            # Test evaluates one utterance at a time.
            batch_size=1 if partition == 'test' else loader_config["batch_size"],
            shuffle=True,  # only train: (partition == 'train') / all: True
            pin_memory=loader_config["pin_memory"],
            num_workers=loader_config["num_workers"],
            drop_last=loader_config["drop_last"],
            collate_fn=_collate)

    return {partition: _build(partition) for partition in partitions}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _collate(egs):
|
| 41 |
+
"""
|
| 42 |
+
Transform utterance index into a minbatch
|
| 43 |
+
|
| 44 |
+
Arguments:
|
| 45 |
+
index: a list type [{},{},{}]
|
| 46 |
+
|
| 47 |
+
Returns:
|
| 48 |
+
input_sizes: a tensor correspond to utterance length
|
| 49 |
+
input_feats: packed sequence to feed networks
|
| 50 |
+
source_attr/target_attr: dictionary contains spectrogram/phase needed in loss computation
|
| 51 |
+
"""
|
| 52 |
+
def __prepare_target_rir(dict_lsit, index):
|
| 53 |
+
return torch.nn.utils.rnn.pad_sequence([torch.tensor(d["src"][index], dtype=torch.float32) for d in dict_lsit], batch_first=True)
|
| 54 |
+
if type(egs) is not list: raise ValueError("Unsupported index type({})".format(type(egs)))
|
| 55 |
+
num_spks = 2 # you need to set this paramater by yourself
|
| 56 |
+
dict_list = sorted([eg for eg in egs], key=lambda x: x['num_sample'], reverse=True)
|
| 57 |
+
mixture = torch.nn.utils.rnn.pad_sequence([torch.tensor(d['mix'], dtype=torch.float32) for d in dict_list], batch_first=True)
|
| 58 |
+
src = [__prepare_target_rir(dict_list, index) for index in range(num_spks)]
|
| 59 |
+
input_sizes = torch.tensor([d['num_sample'] for d in dict_list], dtype=torch.float32)
|
| 60 |
+
key = [d['key'] for d in dict_list]
|
| 61 |
+
return input_sizes, mixture, src, key
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@logger_wraps()
class MyDataset(Dataset):
    """Speech-separation dataset over scp-indexed wav files.

    Each item is a dict with the mixture waveform, the per-speaker source
    waveforms, the sample count, and the utterance key. With
    dynamic_mixing=True the mixture is synthesized on the fly from two
    randomly paired, RMS-normalized, randomly gained sources; otherwise the
    pre-mixed wav file is loaded directly.
    """

    def __init__(self, max_len, fs, partition, wave_scp_srcs, wave_scp_mix, dynamic_mixing, speed_list=None):
        # NOTE(review): Dataset.__init__ is never called (no super().__init__());
        # harmless for torch.utils.data.Dataset today, but worth confirming.
        # speed_list is accepted but unused here — presumably reserved for
        # speed-perturbation augmentation; verify against other variants.
        self.partition = partition
        for wave_scp_src in wave_scp_srcs:
            if not os.path.exists(wave_scp_src): raise FileNotFoundError(f"Could not find file {wave_scp_src}")
        self.max_len = max_len          # max samples per training/valid chunk
        self.fs = fs                    # target sampling rate for librosa.load
        # One key->path dict per speaker, plus one for the mixtures.
        self.wave_dict_srcs = [util_dataset.parse_scps(wave_scp_src) for wave_scp_src in wave_scp_srcs]
        self.wave_dict_mix = util_dataset.parse_scps(wave_scp_mix)
        self.wave_keys = list(self.wave_dict_mix.keys())
        logger.info(f"Create MyDataset for {wave_scp_mix} with {len(self.wave_dict_mix)} utterances")
        self.dynamic_mixing = dynamic_mixing

    def __len__(self):
        # Number of utterances, one per mixture key.
        return len(self.wave_dict_mix)

    def __contains__(self, key):
        return key in self.wave_dict_mix

    def _dynamic_mixing(self, key):
        """Synthesize a mixture from the source of `key` and a random partner.

        Returns (samps_mix, samps_src): the summed mixture and the list of
        individual (gained, length-matched) source signals.
        """
        def __match_length(wav, len_data) :
            # Random crop of wav down to len_data samples.
            # NOTE: requires len(wav) >= len_data (randint upper bound).
            leftover = len(wav) - len_data
            idx = random.randint(0,leftover)
            wav = wav[idx:idx+len_data]
            return wav

        samps_src = []
        src_len = []
        # dyanmic source choice
        # checking whether it is the same speaker
        # NOTE(review): the key.split('_')[1]/[3] slices assume WSJ0-style
        # mixture keys of the form "<id1>_<snr1>_<id2>_<snr2>" where the
        # first three chars of a field identify the speaker — confirm the
        # scp key format before reusing on another corpus.
        while True:
            key_random = random.choice(list(self.wave_dict_srcs[0].keys()))
            tmp1 = key.split('_')[1][:3] != key_random.split('_')[3][:3]
            tmp2 = key.split('_')[3][:3] != key_random.split('_')[1][:3]
            if tmp1 and tmp2: break

        # Randomly pick which scp list supplies each of the two sources.
        idx1, idx2 = (0, 1) if random.random() > 0.5 else (1, 0)
        files = [self.wave_dict_srcs[idx1][key], self.wave_dict_srcs[idx2][key_random]]

        # load
        for idx, file in enumerate(files):
            if not os.path.exists(file): raise FileNotFoundError("Input file {} do not exists!".format(file))
            samps_tmp, _ = audio_lib.load(file, sr=self.fs)

            # RMS-normalize every source to the first source's level.
            if idx == 0: ref_rms = np.sqrt(np.mean(np.square(samps_tmp)))
            curr_rms = np.sqrt(np.mean(np.square(samps_tmp)))

            norm_factor = ref_rms / curr_rms
            samps_tmp *= norm_factor

            # mixing with random gains
            # Uniform gain in [-5, +5] dB (10^(-x/20)).
            gain = pow(10,-random.uniform(-5,5)/20)
            samps_tmp = np.array(torch.tensor(samps_tmp))
            samps_src.append(gain*samps_tmp)
            src_len.append(len(samps_tmp))

        # matching the audio length
        min_len = min(src_len)

        # add noise source dynamically if needed
        samps_src = [__match_length(s, min_len) for s in samps_src]
        samps_mix = sum(samps_src)

        # ! truncated along to the sample Length "L"
        # Trim so the length is a multiple of 4 (presumably the encoder
        # stride S=4 — confirm against the model config).
        if len(samps_mix)%4 != 0:
            remains = len(samps_mix)%4
            samps_mix = samps_mix[:-remains]
            samps_src = [s[:-remains] for s in samps_src]

        # For train/valid, crop a random max_len-sample window.
        if self.partition != "test":
            if len(samps_mix) > self.max_len:
                start = random.randint(0, len(samps_mix)-self.max_len)
                samps_mix = samps_mix[start:start+self.max_len]
                samps_src = [s[start:start+self.max_len] for s in samps_src]
        return samps_mix, samps_src

    def _direct_load(self, key):
        """Load the pre-mixed mixture and its sources for `key` from disk."""
        samps_src = []
        files = [wave_dict_src[key] for wave_dict_src in self.wave_dict_srcs]
        for file in files:
            if not os.path.exists(file): raise FileNotFoundError(f"Input file {file} do not exists!")
            samps_tmp, _ = audio_lib.load(file, sr=self.fs)
            samps_src.append(samps_tmp)

        file = self.wave_dict_mix[key]
        if not os.path.exists(file): raise FileNotFoundError(f"Input file {file} do not exists!")
        samps_mix, _ = audio_lib.load(file, sr=self.fs)

        # Truncate samples as needed
        # Same multiple-of-4 trimming as in _dynamic_mixing.
        if len(samps_mix) % 4 != 0:
            remains = len(samps_mix) % 4
            samps_mix = samps_mix[:-remains]
            samps_src = [s[:-remains] for s in samps_src]

        # For train/valid, crop a random max_len-sample window.
        if self.partition != "test":
            if len(samps_mix) > self.max_len:
                start = random.randint(0,len(samps_mix)-self.max_len)
                samps_mix = samps_mix[start:start+self.max_len]
                samps_src = [s[start:start+self.max_len] for s in samps_src]

        return samps_mix, samps_src

    def __getitem__(self, index):
        # Resolve the utterance key, validate it, and dispatch to either
        # dynamic mixing or direct loading.
        key = self.wave_keys[index]
        if any(key not in self.wave_dict_srcs[i] for i in range(len(self.wave_dict_srcs))) or key not in self.wave_dict_mix: raise KeyError(f"Could not find utterance {key}")
        samps_mix, samps_src = self._dynamic_mixing(key) if self.dynamic_mixing else self._direct_load(key)
        return {"num_sample": samps_mix.shape[0], "mix": samps_mix, "src": samps_src, "key": key}
|
models/SepReformer/SepReformer_Large_DM_WSJ0/engine.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import csv
|
| 4 |
+
import time
|
| 5 |
+
import soundfile as sf
|
| 6 |
+
|
| 7 |
+
from loguru import logger
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from utils import util_engine, functions
|
| 10 |
+
from utils.decorators import *
|
| 11 |
+
from torch.utils.tensorboard import SummaryWriter
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@logger_wraps()
class Engine(object):
    """Training/validation/test driver for the separation model.

    Owns the model, data loaders, criterions, optimizer, and schedulers;
    `run()` dispatches to test-only mode or the full training loop based on
    `engine_mode`.
    """

    def __init__(self, args, config, model, dataloaders, criterions, optimizers, schedulers, gpuid, device):
        ''' Default setting '''
        self.engine_mode = args.engine_mode
        self.out_wav_dir = args.out_wav_dir
        self.config = config
        self.gpuid = gpuid
        self.device = device
        self.model = model.to(self.device)
        self.dataloaders = dataloaders # self.dataloaders['train'] or ['valid'] or ['test']
        self.PIT_SISNR_mag_loss, self.PIT_SISNR_time_loss, self.PIT_SISNRi_loss, self.PIT_SDRi_loss = criterions
        self.main_optimizer = optimizers[0]
        self.main_scheduler, self.warmup_scheduler = schedulers

        self.pretrain_weights_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log", "pretrain_weights")
        os.makedirs(self.pretrain_weights_path, exist_ok=True)
        self.scratch_weights_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log", "scratch_weights")
        os.makedirs(self.scratch_weights_path, exist_ok=True)

        # Prefer pretrained weights when any checkpoint file is present.
        # FIX: the tuple was ('.pt', '.pt', '.pkl') — '.pt' duplicated where
        # '.pth' was intended, so .pth checkpoints were never detected.
        self.checkpoint_path = self.pretrain_weights_path if any(file.endswith(('.pt', '.pth', '.pkl')) for file in os.listdir(self.pretrain_weights_path)) else self.scratch_weights_path
        self.start_epoch = util_engine.load_last_checkpoint_n_get_epoch(self.checkpoint_path, self.model, self.main_optimizer, location=self.device)

        # Logging
        util_engine.model_params_mac_summary(
            model=self.model,
            input=torch.randn(1, self.config['check_computations']['dummy_len']).to(self.device),
            dummy_input=torch.rand(1, self.config['check_computations']['dummy_len']).to(self.device),
            metrics=['ptflops', 'thop', 'torchinfo']
            # metrics=['ptflops']
        )

        logger.info(f"Clip gradient by 2-norm {self.config['engine']['clip_norm']}")

    @logger_wraps()
    def _train(self, dataloader, epoch):
        """One training epoch; returns (time loss, freq loss, num batches)."""
        self.model.train()
        tot_loss_freq = [0 for _ in range(self.model.num_stages)]
        tot_loss_time, num_batch = 0, 0
        pbar = tqdm(total=len(dataloader), unit='batches', bar_format='{l_bar}{bar:25}{r_bar}{bar:-10b}', colour="YELLOW", dynamic_ncols=True)
        for input_sizes, mixture, src, _ in dataloader:
            nnet_input = mixture
            nnet_input = functions.apply_cmvn(nnet_input) if self.config['engine']['mvn'] else nnet_input
            num_batch += 1
            pbar.update(1)
            # Scheduler learning rate for warm-up (Iteration-based update for transformers)
            if epoch == 1: self.warmup_scheduler.step()
            nnet_input = nnet_input.to(self.device)
            self.main_optimizer.zero_grad()
            estim_src, estim_src_bn = torch.nn.parallel.data_parallel(self.model, nnet_input, device_ids=self.gpuid)
            # Per-stage frequency-domain losses (multi-stage supervision).
            cur_loss_s_bn = []
            for idx, estim_src_value in enumerate(estim_src_bn):
                cur_loss_s_bn.append(self.PIT_SISNR_mag_loss(estims=estim_src_value, idx=idx, input_sizes=input_sizes, target_attr=src))
                tot_loss_freq[idx] += cur_loss_s_bn[idx].item() / (self.config['model']['num_spks'])
            cur_loss_s = self.PIT_SISNR_time_loss(estims=estim_src, input_sizes=input_sizes, target_attr=src)
            tot_loss_time += cur_loss_s.item() / self.config['model']['num_spks']
            # Auxiliary-loss weight: constant 0.4 until epoch 100, then
            # decayed by 0.8 every 5 epochs.
            alpha = 0.4 * 0.8**(1+(epoch-101)//5) if epoch > 100 else 0.4
            cur_loss = (1-alpha) * cur_loss_s + alpha * sum(cur_loss_s_bn) / len(cur_loss_s_bn)
            cur_loss = cur_loss / self.config['model']['num_spks']
            cur_loss.backward()
            if self.config['engine']['clip_norm']: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config['engine']['clip_norm'])
            self.main_optimizer.step()
            dict_loss = {"T_Loss": tot_loss_time / num_batch}
            dict_loss.update({'F_Loss_' + str(idx): loss / num_batch for idx, loss in enumerate(tot_loss_freq)})
            pbar.set_postfix(dict_loss)
        pbar.close()
        tot_loss_freq = sum(tot_loss_freq) / len(tot_loss_freq)
        return tot_loss_time / num_batch, tot_loss_freq / num_batch, num_batch

    @logger_wraps()
    def _validate(self, dataloader):
        """One validation pass; returns (time loss, freq loss, num batches)."""
        self.model.eval()
        tot_loss_freq = [0 for _ in range(self.model.num_stages)]
        tot_loss_time, num_batch = 0, 0
        pbar = tqdm(total=len(dataloader), unit='batches', bar_format='{l_bar}{bar:5}{r_bar}{bar:-10b}', colour="RED", dynamic_ncols=True)
        with torch.inference_mode():
            for input_sizes, mixture, src, _ in dataloader:
                nnet_input = mixture
                nnet_input = functions.apply_cmvn(nnet_input) if self.config['engine']['mvn'] else nnet_input
                nnet_input = nnet_input.to(self.device)
                num_batch += 1
                pbar.update(1)
                estim_src, estim_src_bn = torch.nn.parallel.data_parallel(self.model, nnet_input, device_ids=self.gpuid)
                cur_loss_s_bn = []
                for idx, estim_src_value in enumerate(estim_src_bn):
                    cur_loss_s_bn.append(self.PIT_SISNR_mag_loss(estims=estim_src_value, idx=idx, input_sizes=input_sizes, target_attr=src))
                    tot_loss_freq[idx] += cur_loss_s_bn[idx].item() / (self.config['model']['num_spks'])
                cur_loss_s_SDR = self.PIT_SISNR_time_loss(estims=estim_src, input_sizes=input_sizes, target_attr=src)
                tot_loss_time += cur_loss_s_SDR.item() / self.config['model']['num_spks']
                dict_loss = {"T_Loss":tot_loss_time / num_batch}
                dict_loss.update({'F_Loss_' + str(idx): loss / num_batch for idx, loss in enumerate(tot_loss_freq)})
                pbar.set_postfix(dict_loss)
        pbar.close()
        tot_loss_freq = sum(tot_loss_freq) / len(tot_loss_freq)
        return tot_loss_time / num_batch, tot_loss_freq / num_batch, num_batch

    @logger_wraps()
    def _test(self, dataloader, wav_dir=None):
        """Evaluate SISNRi/SDRi per utterance, write per-key CSVs, and
        optionally dump separated wavs; returns (SISNRi, SDRi, num batches)."""
        self.model.eval()
        total_loss_SISNRi, total_loss_SDRi, num_batch = 0, 0, 0
        pbar = tqdm(total=len(dataloader), unit='batches', bar_format='{l_bar}{bar:5}{r_bar}{bar:-10b}', colour="grey", dynamic_ncols=True)
        with torch.inference_mode():
            csv_file_name_sisnr = os.path.join(os.path.dirname(__file__),'test_SISNRi_value.csv')
            csv_file_name_sdr = os.path.join(os.path.dirname(__file__),'test_SDRi_value.csv')
            with open(csv_file_name_sisnr, 'w', newline='') as csvfile_sisnr, open(csv_file_name_sdr, 'w', newline='') as csvfile_sdr:
                idx = 0
                writer_sisnr = csv.writer(csvfile_sisnr, quotechar='|', quoting=csv.QUOTE_MINIMAL)
                writer_sdr = csv.writer(csvfile_sdr, quotechar='|', quoting=csv.QUOTE_MINIMAL)
                for input_sizes, mixture, src, key in dataloader:
                    if len(key) > 1:
                        # FIX: `raise("...")` raises a str, which is itself a
                        # TypeError at runtime; raise a proper exception.
                        raise ValueError("batch size is not one!!")
                    nnet_input = mixture.to(self.device)
                    num_batch += 1
                    pbar.update(1)
                    estim_src, _ = torch.nn.parallel.data_parallel(self.model, nnet_input, device_ids=self.gpuid)
                    cur_loss_SISNRi, cur_loss_SISNRi_src = self.PIT_SISNRi_loss(estims=estim_src, mixture=mixture, input_sizes=input_sizes, target_attr=src, eps=1.0e-15)
                    total_loss_SISNRi += cur_loss_SISNRi.item() / self.config['model']['num_spks']
                    cur_loss_SDRi, cur_loss_SDRi_src = self.PIT_SDRi_loss(estims=estim_src, mixture=mixture, input_sizes=input_sizes, target_attr=src)
                    total_loss_SDRi += cur_loss_SDRi.item() / self.config['model']['num_spks']
                    writer_sisnr.writerow([key[0][:-4]] + [cur_loss_SISNRi_src[i].item() for i in range(self.config['model']['num_spks'])])
                    writer_sdr.writerow([key[0][:-4]] + [cur_loss_SDRi_src[i].item() for i in range(self.config['model']['num_spks'])])
                    if self.engine_mode == "test_save":
                        # `is None` is the correct identity check.
                        if wav_dir is None: wav_dir = os.path.join(os.path.dirname(__file__),"wav_out")
                        if wav_dir and not os.path.exists(wav_dir): os.makedirs(wav_dir)
                        mixture = torch.squeeze(mixture).cpu().data.numpy()
                        # Peak-normalize to 0.5 before writing 8 kHz wavs.
                        sf.write(os.path.join(wav_dir,key[0][:-4]+str(idx)+'_mixture.wav'), 0.5*mixture/max(abs(mixture)), 8000)
                        for i in range(self.config['model']['num_spks']):
                            src = torch.squeeze(estim_src[i]).cpu().data.numpy()
                            sf.write(os.path.join(wav_dir,key[0][:-4]+str(idx)+'_out_'+str(i)+'.wav'), 0.5*src/max(abs(src)), 8000)
                        idx += 1
                    dict_loss = {"SiSNRi": total_loss_SISNRi/num_batch, "SDRi": total_loss_SDRi/num_batch}
                    pbar.set_postfix(dict_loss)
        pbar.close()
        return total_loss_SISNRi/num_batch, total_loss_SDRi/num_batch, num_batch

    @logger_wraps()
    def run(self):
        """Entry point: test-only mode, or full train/validate/test loop."""
        with torch.cuda.device(self.device):
            writer_src = SummaryWriter(os.path.join(os.path.dirname(os.path.abspath(__file__)), "log/tensorboard"))
            if "test" in self.engine_mode:
                on_test_start = time.time()
                test_loss_src_time_1, test_loss_src_time_2, test_num_batch = self._test(self.dataloaders['test'], self.out_wav_dir)
                on_test_end = time.time()
                logger.info(f"[TEST] Loss(time/mini-batch) \n - Epoch {self.start_epoch:2d}: SISNRi = {test_loss_src_time_1:.4f} dB | SDRi = {test_loss_src_time_2:.4f} dB | Speed = ({on_test_end - on_test_start:.2f}s/{test_num_batch:d})")
                logger.info(f"Testing done!")
            else:
                start_time = time.time()
                # Seed the "best" validation loss from the resumed model.
                if self.start_epoch > 1:
                    init_loss_time, init_loss_freq, valid_num_batch = self._validate(self.dataloaders['valid'])
                else:
                    init_loss_time, init_loss_freq = 0, 0
                end_time = time.time()
                logger.info(f"[INIT] Loss(time/mini-batch) \n - Epoch {self.start_epoch:2d}: Loss_t = {init_loss_time:.4f} dB | Loss_f = {init_loss_freq:.4f} dB | Speed = ({end_time-start_time:.2f}s)")
                # FIX: this was reset inside the epoch loop, which discarded
                # the running best every epoch and made best-checkpoint
                # tracking meaningless; initialize it once before the loop.
                valid_loss_best = init_loss_time
                for epoch in range(self.start_epoch, self.config['engine']['max_epoch']):
                    train_start_time = time.time()
                    train_loss_src_time, train_loss_src_freq, train_num_batch = self._train(self.dataloaders['train'], epoch)
                    train_end_time = time.time()
                    valid_start_time = time.time()
                    valid_loss_src_time, valid_loss_src_freq, valid_num_batch = self._validate(self.dataloaders['valid'])
                    valid_end_time = time.time()
                    if epoch > self.config['engine']['start_scheduling']: self.main_scheduler.step(valid_loss_src_time)
                    logger.info(f"[TRAIN] Loss(time/mini-batch) \n - Epoch {epoch:2d}: Loss_t = {train_loss_src_time:.4f} dB | Loss_f = {train_loss_src_freq:.4f} dB | Speed = ({train_end_time - train_start_time:.2f}s/{train_num_batch:d})")
                    logger.info(f"[VALID] Loss(time/mini-batch) \n - Epoch {epoch:2d}: Loss_t = {valid_loss_src_time:.4f} dB | Loss_f = {valid_loss_src_freq:.4f} dB | Speed = ({valid_end_time - valid_start_time:.2f}s/{valid_num_batch:d})")
                    if epoch in self.config['engine']['test_epochs']:
                        on_test_start = time.time()
                        test_loss_src_time_1, test_loss_src_time_2, test_num_batch = self._test(self.dataloaders['test'])
                        on_test_end = time.time()
                        logger.info(f"[TEST] Loss(time/mini-batch) \n - Epoch {epoch:2d}: SISNRi = {test_loss_src_time_1:.4f} dB | SDRi = {test_loss_src_time_2:.4f} dB | Speed = ({on_test_end - on_test_start:.2f}s/{test_num_batch:d})")
                    valid_loss_best = util_engine.save_checkpoint_per_best(valid_loss_best, valid_loss_src_time, train_loss_src_time, epoch, self.model, self.main_optimizer, self.checkpoint_path)
                    # Logging to monitoring tools (Tensorboard && Wandb)
                    writer_src.add_scalars("Metrics", {
                        'Loss_train_time': train_loss_src_time,
                        'Loss_valid_time': valid_loss_src_time}, epoch)
                    writer_src.add_scalar("Learning Rate", self.main_optimizer.param_groups[0]['lr'], epoch)
                    writer_src.flush()
                logger.info(f"Training for {self.config['engine']['max_epoch']} epoches done!")
|
models/SepReformer/SepReformer_Large_DM_WSJ0/main.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
from loguru import logger
|
| 4 |
+
from .dataset import get_dataloaders
|
| 5 |
+
from .model import Model
|
| 6 |
+
from .engine import Engine
|
| 7 |
+
from utils import util_system, util_implement
|
| 8 |
+
from utils.decorators import *
|
| 9 |
+
|
| 10 |
+
# Setup logger
|
| 11 |
+
log_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log/system_log.log")
|
| 12 |
+
logger.add(log_file_path, level="DEBUG", mode="w")
|
| 13 |
+
|
| 14 |
+
@logger_wraps()
def main(args):
    """Entry point: load configs.yaml, build data/model/engine, and start training."""
    # --- Build Setting ---
    # Locate and parse the configuration file that sits next to this script.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    yaml_dict = util_system.parse_yaml(os.path.join(base_dir, "configs.yaml"))

    # Run wandb and get configuration
    config = yaml_dict["config"]  # wandb login success or fail

    # DataLoaders for each split [train / valid / test / ...]
    dataloaders = get_dataloaders(args, config["dataset"], config["dataloader"])

    # --- Build Model ---
    model = Model(**config["model"])

    # --- Build Engine ---
    # GPU ids come as a comma-separated string, e.g. "0,1"; first id hosts the model.
    gpuid = tuple(int(gid) for gid in config["engine"]["gpuid"].split(','))
    device = torch.device(f'cuda:{gpuid[0]}')

    # Criterion / optimizer / scheduler factories from the shared utils package.
    criterions = util_implement.CriterionFactory(config["criterion"], device).get_criterions()
    optimizers = util_implement.OptimizerFactory(config["optimizer"], model.parameters()).get_optimizers()
    schedulers = util_implement.SchedulerFactory(config["scheduler"], optimizers).get_schedulers()

    # Build and run the training engine.
    engine = Engine(args, config, model, dataloaders, criterions, optimizers, schedulers, gpuid, device)
    engine.run()
|
models/SepReformer/SepReformer_Large_DM_WSJ0/model.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
sys.path.append('../')
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import warnings
|
| 6 |
+
warnings.filterwarnings('ignore')
|
| 7 |
+
|
| 8 |
+
from utils.decorators import *
|
| 9 |
+
from .modules.module import *
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@logger_wraps()
class Model(torch.nn.Module):
    """SepReformer separation model: encoder -> projector -> separator -> output layer -> decoder.

    forward() returns the per-speaker waveforms plus a list of auxiliary
    (per-stage) waveform lists used for multi-stage supervision during training.
    """
    def __init__(self,
                 num_stages: int,
                 num_spks: int,
                 module_audio_enc: dict,
                 module_feature_projector: dict,
                 module_separator: dict,
                 module_output_layer: dict,
                 module_audio_dec: dict):
        """Build all sub-modules from their config dicts.

        Args:
            num_stages: number of separator stages (one aux output head per stage).
            num_spks: number of speakers to separate.
            module_*: keyword-argument dicts forwarded to each sub-module constructor.
        """
        super().__init__()
        self.num_stages = num_stages
        self.num_spks = num_spks
        self.audio_encoder = AudioEncoder(**module_audio_enc)
        self.feature_projector = FeatureProjector(**module_feature_projector)
        self.separator = Separator(**module_separator)
        self.out_layer = OutputLayer(**module_output_layer)
        self.audio_decoder = AudioDecoder(**module_audio_dec)

        # Aux_loss: one masking output layer + one decoder per separator stage.
        self.out_layer_bn = torch.nn.ModuleList([])
        self.decoder_bn = torch.nn.ModuleList([])
        for _ in range(self.num_stages):
            self.out_layer_bn.append(OutputLayer(**module_output_layer, masking=True))
            self.decoder_bn.append(AudioDecoder(**module_audio_dec))

    def forward(self, x):
        """Separate mixture waveform `x` into `num_spks` sources.

        Returns:
            audio: list of num_spks decoded waveforms from the final stage.
            audio_aux: list (per stage) of lists (per speaker) of aux waveforms,
                each trimmed to the input length.
        """
        encoder_output = self.audio_encoder(x)
        projected_feature = self.feature_projector(encoder_output)
        last_stage_output, each_stage_outputs = self.separator(projected_feature)
        out_layer_output = self.out_layer(last_stage_output, encoder_output)
        each_spk_output = [out_layer_output[idx] for idx in range(self.num_spks)]
        audio = [self.audio_decoder(each_spk_output[idx]) for idx in range(self.num_spks)]

        # Aux_loss: decode every intermediate stage output for deep supervision.
        # NOTE: F.upsample is deprecated; F.interpolate is the supported equivalent
        # (same default 'nearest' mode), so behavior is unchanged.
        audio_aux = []
        for idx, each_stage_output in enumerate(each_stage_outputs):
            upsampled = torch.nn.functional.interpolate(each_stage_output, size=encoder_output.shape[-1])
            each_stage_output = self.out_layer_bn[idx](upsampled, encoder_output)
            out_aux = [each_stage_output[jdx] for jdx in range(self.num_spks)]
            # Trim each decoded aux waveform back to the original input length.
            audio_aux.append([self.decoder_bn[idx](out_aux[jdx])[..., :x.shape[-1]] for jdx in range(self.num_spks)])

        return audio, audio_aux
|
models/SepReformer/SepReformer_Large_DM_WSJ0/modules/module.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
sys.path.append('../')
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import warnings
|
| 6 |
+
warnings.filterwarnings('ignore')
|
| 7 |
+
|
| 8 |
+
from utils.decorators import *
|
| 9 |
+
from .network import *
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class AudioEncoder(torch.nn.Module):
    """Convolutional front-end: raw waveform -> latent feature map with a GELU."""

    def __init__(self, in_channels: int, out_channels: int, kernel_size: int, stride: int, groups: int, bias: bool):
        super().__init__()
        self.conv1d = torch.nn.Conv1d(
            in_channels=in_channels, out_channels=out_channels,
            kernel_size=kernel_size, stride=stride, groups=groups, bias=bias)
        self.gelu = torch.nn.GELU()

    def forward(self, x: torch.Tensor):
        """Insert a channel axis, then convolve + activate.

        [T] -> [1, T] (unbatched) or [B, T] -> [B, 1, T].
        """
        channel_dim = 0 if x.dim() == 1 else 1
        features = self.conv1d(x.unsqueeze(channel_dim))
        return self.gelu(features)
|
| 24 |
+
|
| 25 |
+
class FeatureProjector(torch.nn.Module):
    """Normalize encoder features (global LayerNorm via 1-group GroupNorm) and
    project them to the separator's channel dimension with a 1-D convolution."""

    def __init__(self, num_channels: int, in_channels: int, out_channels: int, kernel_size: int, bias: bool):
        super().__init__()
        self.norm = torch.nn.GroupNorm(num_groups=1, num_channels=num_channels, eps=1e-8)
        self.conv1d = torch.nn.Conv1d(
            in_channels=in_channels, out_channels=out_channels,
            kernel_size=kernel_size, bias=bias)

    def forward(self, x: torch.Tensor):
        """Normalize then project: [B, C_in, T] -> [B, C_out, T']."""
        return self.conv1d(self.norm(x))
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class Separator(torch.nn.Module):
    """U-Net-style separator: a temporal contracting path (encoder stages with
    downsampling), a bottleneck that splits features per speaker, and a temporal
    expanding path (decoder stages with upsampling + skip fusion).

    The stage sub-modules are declared as classes nested inside __init__ so they
    are private to this Separator; they close over GlobalBlock/LocalBlock/
    SpkAttention imported from .network.
    """
    def __init__(self, num_stages: int, relative_positional_encoding: dict, enc_stage: dict, spk_split_stage: dict, simple_fusion:dict, dec_stage: dict):
        super().__init__()

        class RelativePositionalEncoding(torch.nn.Module):
            # Learned relative position embeddings shared by all attention blocks.
            def __init__(self, in_channels: int, num_heads: int, maxlen: int, embed_v=False):
                super().__init__()
                self.in_channels = in_channels
                self.num_heads = num_heads
                # Per-head embedding dimension.
                self.embedding_dim = self.in_channels // self.num_heads
                self.maxlen = maxlen
                # 2*maxlen entries cover relative offsets in [-maxlen, maxlen).
                self.pe_k = torch.nn.Embedding(num_embeddings=2*maxlen, embedding_dim=self.embedding_dim)
                self.pe_v = torch.nn.Embedding(num_embeddings=2*maxlen, embedding_dim=self.embedding_dim) if embed_v else None

            def forward(self, pos_seq: torch.Tensor):
                # NOTE: clamp_/ += mutate pos_seq in place — callers should not
                # reuse the tensor afterwards expecting the raw offsets.
                pos_seq.clamp_(-self.maxlen, self.maxlen - 1)
                pos_seq += self.maxlen  # shift offsets into valid embedding indices [0, 2*maxlen)
                pe_k_output = self.pe_k(pos_seq)
                pe_v_output = self.pe_v(pos_seq) if self.pe_v is not None else None
                return pe_k_output, pe_v_output

        class SepEncStage(torch.nn.Module):
            # One contracting stage: 2 x (global block + local block), then an
            # optional stride-2 depthwise downsampling conv.
            def __init__(self, global_blocks: dict, local_blocks: dict, down_conv_layer: dict, down_conv=True):
                super().__init__()

                class DownConvLayer(torch.nn.Module):
                    # Depthwise stride-2 conv + BatchNorm + GELU: halves the time axis.
                    def __init__(self, in_channels: int, samp_kernel_size: int):
                        """Depthwise temporal downsampler (stride 2, 'same'-style padding)."""
                        super().__init__()
                        self.down_conv = torch.nn.Conv1d(
                            in_channels=in_channels, out_channels=in_channels, kernel_size=samp_kernel_size, stride=2, padding=(samp_kernel_size-1)//2, groups=in_channels)
                        self.BN = torch.nn.BatchNorm1d(num_features=in_channels)
                        self.gelu = torch.nn.GELU()

                    def forward(self, x: torch.Tensor):
                        # [B, T, C] -> [B, C, T] for the conv, then back.
                        x = x.permute([0, 2, 1])
                        x = self.down_conv(x)
                        x = self.BN(x)
                        x = self.gelu(x)
                        x = x.permute([0, 2, 1])
                        return x

                self.g_block_1 = GlobalBlock(**global_blocks)
                self.l_block_1 = LocalBlock(**local_blocks)

                self.g_block_2 = GlobalBlock(**global_blocks)
                self.l_block_2 = LocalBlock(**local_blocks)

                # Final stage (bottleneck) is built with down_conv=False.
                self.downconv = DownConvLayer(**down_conv_layer) if down_conv == True else None

            def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
                '''
                x: [B, N, T]
                '''
                # Global/local blocks alternate; permutes swap between channel-first
                # (global blocks) and time-first (local blocks) layouts.
                x = self.g_block_1(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_1(x)
                x = x.permute(0, 2, 1).contiguous()

                x = self.g_block_2(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_2(x)
                x = x.permute(0, 2, 1).contiguous()

                # Skip connection is taken BEFORE downsampling, at full resolution.
                skip = x
                if self.downconv:
                    x = x.permute(0, 2, 1).contiguous()
                    x = self.downconv(x)
                    x = x.permute(0, 2, 1).contiguous()
                # [BK, S, N]
                return x, skip

        class SpkSplitStage(torch.nn.Module):
            # Expands features into num_spks per-speaker streams folded into the
            # batch dimension: [B, C, T] -> [B*num_spks, C, T].
            def __init__(self, in_channels: int, num_spks: int):
                super().__init__()
                self.linear = torch.nn.Sequential(
                    torch.nn.Conv1d(in_channels, 4*in_channels*num_spks, kernel_size=1),
                    torch.nn.GLU(dim=-2),
                    torch.nn.Conv1d(2*in_channels*num_spks, in_channels*num_spks, kernel_size=1))
                self.norm = torch.nn.GroupNorm(1, in_channels, eps=1e-8)
                self.num_spks = num_spks

            def forward(self, x: torch.Tensor):
                x = self.linear(x)
                B, _, T = x.shape
                # Fold the speaker axis into the batch axis.
                x = x.view(B*self.num_spks,-1, T).contiguous()
                x = self.norm(x)
                return x

        class SepDecStage(torch.nn.Module):
            # One expanding stage: 3 x (global + local + cross-speaker attention).
            def __init__(self, num_spks: int, global_blocks: dict, local_blocks: dict, spk_attention: dict):
                super().__init__()

                self.g_block_1 = GlobalBlock(**global_blocks)
                self.l_block_1 = LocalBlock(**local_blocks)
                self.spk_attn_1 = SpkAttention(**spk_attention)

                self.g_block_2 = GlobalBlock(**global_blocks)
                self.l_block_2 = LocalBlock(**local_blocks)
                self.spk_attn_2 = SpkAttention(**spk_attention)

                self.g_block_3 = GlobalBlock(**global_blocks)
                self.l_block_3 = LocalBlock(**local_blocks)
                self.spk_attn_3 = SpkAttention(**spk_attention)

                self.num_spk = num_spks

            def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
                '''
                x: [B, N, T]
                '''
                # [BS, K, H]
                x = self.g_block_1(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_1(x)
                x = x.permute(0, 2, 1).contiguous()
                x = self.spk_attn_1(x, self.num_spk)

                x = self.g_block_2(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_2(x)
                x = x.permute(0, 2, 1).contiguous()
                x = self.spk_attn_2(x, self.num_spk)

                x = self.g_block_3(x, pos_k)
                x = x.permute(0, 2, 1).contiguous()
                x = self.l_block_3(x)
                x = x.permute(0, 2, 1).contiguous()
                x = self.spk_attn_3(x, self.num_spk)

                skip = x

                return x, skip

        self.num_stages = num_stages
        self.pos_emb = RelativePositionalEncoding(**relative_positional_encoding)

        # Temporal Contracting Part
        self.enc_stages = torch.nn.ModuleList([])
        for _ in range(self.num_stages):
            self.enc_stages.append(SepEncStage(**enc_stage, down_conv=True))

        # Bottleneck has no further downsampling; then per-speaker split.
        self.bottleneck_G = SepEncStage(**enc_stage, down_conv=False)
        self.spk_split_block = SpkSplitStage(**spk_split_stage)

        # Temporal Expanding Part
        self.simple_fusion = torch.nn.ModuleList([])
        self.dec_stages = torch.nn.ModuleList([])
        for _ in range(self.num_stages):
            # 1x1 conv fuses [upsampled decoder features ; encoder skip] -> channels.
            self.simple_fusion.append(torch.nn.Conv1d(in_channels=simple_fusion['out_channels']*2,out_channels=simple_fusion['out_channels'], kernel_size=1))
            self.dec_stages.append(SepDecStage(**dec_stage))

    def forward(self, input: torch.Tensor):
        '''input: [B, N, L]'''
        # feature projection
        x, _ = self.pad_signal(input)
        len_x = x.shape[-1]
        # Temporal Contracting Part
        # Relative-offset matrix for the bottleneck resolution (len_x / 2^num_stages).
        pos_seq = torch.arange(0, len_x//2**self.num_stages).long().to(x.device)
        pos_seq = pos_seq[:, None] - pos_seq[None, :]
        pos_k, _ = self.pos_emb(pos_seq)
        skip = []
        for idx in range(self.num_stages):
            x, skip_ = self.enc_stages[idx](x, pos_k)
            # Skips are speaker-split so they match the decoder's folded batch.
            skip_ = self.spk_split_block(skip_)
            skip.append(skip_)
        x, _ = self.bottleneck_G(x, pos_k)
        x = self.spk_split_block(x) # B, 2F, T

        each_stage_outputs = []
        # Temporal Expanding Part
        for idx in range(self.num_stages):
            # Record the pre-upsampling features for auxiliary (deep) supervision.
            each_stage_outputs.append(x)
            idx_en = self.num_stages - (idx + 1)  # matching encoder skip (reverse order)
            x = torch.nn.functional.upsample(x, skip[idx_en].shape[-1])
            x = torch.cat([x,skip[idx_en]],dim=1)
            x = self.simple_fusion[idx](x)
            x, _ = self.dec_stages[idx](x, pos_k)

        last_stage_output = x
        return last_stage_output, each_stage_outputs

    def pad_signal(self, input: torch.Tensor):
        """Right-pad the time axis so its length is divisible by 2**num_stages.

        Returns the padded tensor and the number of padding frames added.
        """
        # (B, T) or (B, 1, T)
        if input.dim() == 1: input = input.unsqueeze(0)
        elif input.dim() not in [2, 3]: raise RuntimeError("Input can only be 2 or 3 dimensional.")
        elif input.dim() == 2: input = input.unsqueeze(1)
        L = 2**self.num_stages
        batch_size = input.size(0)
        ndim = input.size(1)
        nframe = input.size(2)
        padded_len = (nframe//L + 1)*L
        rest = 0 if nframe%L == 0 else padded_len - nframe
        if rest > 0:
            pad = torch.autograd.Variable(torch.zeros(batch_size, ndim, rest)).type(input.type()).to(input.device)
            input = torch.cat([input, pad], dim=-1)
        return input, rest
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
class OutputLayer(torch.nn.Module):
    """Expand separator features back to the encoder dimension and reshape to a
    per-speaker layout; optionally gate the encoder mixture features (masking)."""

    def __init__(self, in_channels: int, out_channels: int, num_spks: int, masking: bool = False):
        super().__init__()
        # feature expansion back to the encoder channel count
        self.masking = masking
        self.spe_block = Masking(in_channels, Activation_mask="ReLU", concat_opt=None)
        self.num_spks = num_spks
        self.end_conv1x1 = torch.nn.Sequential(
            torch.nn.Linear(out_channels, 4*out_channels),
            torch.nn.GLU(),
            torch.nn.Linear(2*out_channels, in_channels))

    def forward(self, x: torch.Tensor, input: torch.Tensor):
        """x: [B*num_spks, C, T] separator output; input: [B, N, L] encoder features.

        Returns a [num_spks, B, N, L] tensor (index by speaker first).
        """
        # Trim to the reference length, then expand channels via the GLU stack
        # (Linear layers operate on the last axis, hence the permutes).
        x = x[..., :input.shape[-1]].permute([0, 2, 1])
        x = self.end_conv1x1(x).permute([0, 2, 1])
        total, chans, frames = x.shape
        batch = total // self.num_spks

        if self.masking:
            # Replicate the encoder features once per speaker (speaker folded
            # into the batch axis) and apply the gated mask against them.
            ref = input.expand(self.num_spks, batch, chans, frames).transpose(0, 1).contiguous()
            ref = ref.view(batch * self.num_spks, chans, frames)
            x = self.spe_block(x, ref)

        # Unfold speakers out of the batch axis -> [spks, B, N, L].
        return x.view(batch, self.num_spks, chans, frames).transpose(0, 1)
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
class AudioDecoder(torch.nn.ConvTranspose1d):
    '''
    Decoder of the TasNet
    This module can be seen as the gradient of Conv1d with respect to its input.
    It is also known as a fractionally-strided convolution
    or a deconvolution (although it is not an actual deconvolution operation).
    '''
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, x):
        # x: [B, N, L] (3D) or [N, L] (2D, channel axis is inserted)
        if x.dim() not in [2, 3]:
            # BUGFIX: `self.__name__` does not exist on nn.Module instances and
            # raised AttributeError instead of the intended RuntimeError; the
            # message also claimed 3/4D while the check accepts 2/3D.
            raise RuntimeError("{} accepts 2/3D tensor as input".format(type(self).__name__))
        x = super().forward(x if x.dim() == 3 else torch.unsqueeze(x, 1))
        # Drop singleton dims; keep the batch axis when full squeeze would be 1-D.
        squeezed = torch.squeeze(x)
        x = torch.squeeze(x, dim=1) if squeezed.dim() == 1 else squeezed
        return x
|
models/SepReformer/SepReformer_Large_DM_WSJ0/modules/network.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import math
|
| 3 |
+
import numpy
|
| 4 |
+
from utils.decorators import *
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class LayerScale(torch.nn.Module):
    """Learnable per-channel scaling of a residual branch (LayerScale).

    `dims` selects the parameter's broadcast rank: 1 -> (C,), 2 -> (1, C),
    3 -> (1, 1, C), with every entry initialized to `Layer_scale_init`.
    """

    def __init__(self, dims, input_size, Layer_scale_init=1.0e-5):
        super().__init__()
        shapes = {1: (input_size,), 2: (1, input_size), 3: (1, 1, input_size)}
        shape = shapes.get(dims)
        # As in the original: an unsupported `dims` simply creates no parameter.
        if shape is not None:
            init = torch.ones(*shape) * Layer_scale_init
            self.layer_scale = torch.nn.Parameter(init, requires_grad=True)

    def forward(self, x):
        """Scale `x` elementwise by the learned (broadcast) factors."""
        return x * self.layer_scale
|
| 19 |
+
|
| 20 |
+
class Masking(torch.nn.Module):
    """Gated masking block: activation(x) * skip, optionally fusing x with skip
    through a pointwise conv first (when `concat_opt` is truthy).

    Args:
        input_dim: channel count of x / skip.
        Activation_mask: 'Sigmoid' or 'ReLU' gate nonlinearity.
        **options: 'concat_opt' enables the concat + 1x1-conv fusion path.
    """
    def __init__(self, input_dim, Activation_mask='Sigmoid', **options):
        super().__init__()

        self.options = options
        # BUGFIX: use .get() — the original indexed options['concat_opt'] and
        # raised KeyError whenever the kwarg was simply not passed.
        if self.options.get('concat_opt'):
            self.pw_conv = torch.nn.Conv1d(input_dim*2, input_dim, 1, stride=1, padding=0)

        if Activation_mask == 'Sigmoid':
            self.gate_act = torch.nn.Sigmoid()
        elif Activation_mask == 'ReLU':
            self.gate_act = torch.nn.ReLU()
        else:
            # Fail fast: the original silently left gate_act undefined and only
            # crashed (AttributeError) at forward time.
            raise ValueError("Unsupported Activation_mask: {}".format(Activation_mask))

    def forward(self, x, skip):
        """Return gate(x') * skip, where x' is x or pw_conv([x; skip])."""
        if self.options.get('concat_opt'):
            y = torch.cat([x, skip], dim=-2)  # concat along the channel axis
            y = self.pw_conv(y)
        else:
            y = x
        y = self.gate_act(y) * skip

        return y
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class GCFN(torch.nn.Module):
    """Gated convolutional feed-forward network with a LayerScale residual.

    LayerNorm -> Linear (x6) -> depthwise Conv1d over time -> GLU (-> x3)
    -> Linear (-> C), wrapped as x + LayerScale(branch). Input is [B, T, C].
    """

    def __init__(self, in_channels, dropout_rate, Layer_scale_init=1.0e-5):
        super().__init__()
        self.net1 = torch.nn.Sequential(
            torch.nn.LayerNorm(in_channels),
            torch.nn.Linear(in_channels, in_channels*6))
        self.depthwise = torch.nn.Conv1d(in_channels*6, in_channels*6, 3, padding=1, groups=in_channels*6)
        self.net2 = torch.nn.Sequential(
            torch.nn.GLU(),
            torch.nn.Dropout(dropout_rate),
            torch.nn.Linear(in_channels*3, in_channels),
            torch.nn.Dropout(dropout_rate))
        self.Layer_scale = LayerScale(dims=3, input_size=in_channels, Layer_scale_init=Layer_scale_init)

    def forward(self, x):
        """Residual gated feed-forward; preserves the input shape."""
        branch = self.net1(x)
        # Depthwise conv runs channel-first, so hop to [B, C, T] and back.
        branch = self.depthwise(branch.permute(0, 2, 1).contiguous())
        branch = self.net2(branch.permute(0, 2, 1).contiguous())
        return x + self.Layer_scale(branch)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class MultiHeadAttention(torch.nn.Module):
    """
    Multi-Head Attention layer with optional relative positional bias.
    :param int n_head: the number of heads
    :param int in_channels: the number of features (model dimension)
    :param float dropout_rate: dropout rate applied to attention weights and output
    """
    def __init__(self, n_head: int, in_channels: int, dropout_rate: float, Layer_scale_init=1.0e-5):
        super().__init__()
        assert in_channels % n_head == 0
        self.d_k = in_channels // n_head # We assume d_v always equals d_k
        self.h = n_head
        # Pre-norm: LayerNorm is applied to the input before the projections.
        self.layer_norm = torch.nn.LayerNorm(in_channels)
        self.linear_q = torch.nn.Linear(in_channels, in_channels)
        self.linear_k = torch.nn.Linear(in_channels, in_channels)
        self.linear_v = torch.nn.Linear(in_channels, in_channels)
        self.linear_out = torch.nn.Linear(in_channels, in_channels)
        # Last-computed attention map, kept for inspection/debugging.
        self.attn = None
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.Layer_scale = LayerScale(dims=3, input_size=in_channels, Layer_scale_init=Layer_scale_init)

    def forward(self, x, pos_k, mask):
        """
        Compute 'Scaled Dot Product Attention' (self-attention: q, k, v all from x).
        :param torch.Tensor x: input features (batch, time, in_channels)
        :param torch.Tensor pos_k: relative positional key embeddings (time, time, d_k) or None
        :param torch.Tensor mask: (batch, time1, time2) attention mask or None
        :return torch.Tensor: attended and transformed `value` (batch, time1, d_model)
            weighted by the query dot key attention (batch, head, time1, time2)
        """
        n_batch = x.size(0)
        x = self.layer_norm(x)
        q = self.linear_q(x).view(n_batch, -1, self.h, self.d_k) #(b, t, d)
        k = self.linear_k(x).view(n_batch, -1, self.h, self.d_k) #(b, t, d)
        v = self.linear_v(x).view(n_batch, -1, self.h, self.d_k)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2) # (batch, head, time2, d_k)
        v = v.transpose(1, 2) # (batch, head, time2, d_k)
        # Content-content term.
        A = torch.matmul(q, k.transpose(-2, -1))
        reshape_q = q.contiguous().view(n_batch * self.h, -1, self.d_k).transpose(0,1)
        if pos_k is not None:
            # Content-position term: query against relative position embeddings.
            B = torch.matmul(reshape_q, pos_k.transpose(-2, -1))
            B = B.transpose(0, 1).view(n_batch, self.h, pos_k.size(0), pos_k.size(1))
            scores = (A + B) / math.sqrt(self.d_k)
        else:
            scores = A / math.sqrt(self.d_k)
        if mask is not None:
            mask = mask.unsqueeze(1).eq(0) # (batch, 1, time1, time2)
            # Most negative representable value for the score dtype.
            min_value = float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min)
            scores = scores.masked_fill(mask, min_value)
            # Zero out masked weights again after softmax (fully-masked rows
            # would otherwise hold uniform garbage).
            self.attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) # (batch, head, time1, time2)
        else:
            self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
        p_attn = self.dropout(self.attn)
        x = torch.matmul(p_attn, v) # (batch, head, time1, d_k)
        x = x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) # (batch, time1, d_model)
        return self.Layer_scale(self.dropout(self.linear_out(x))) # (batch, time1, d_model)
|
| 125 |
+
|
| 126 |
+
class EGA(torch.nn.Module):
    """Efficient Global Attention: runs self-attention on a pooled (shorter)
    copy of the sequence, then re-injects the result through a learned gate.

    Args:
        in_channels: feature/channel dimension.
        num_mha_heads: number of attention heads.
        dropout_rate: dropout inside the attention block.
    """
    def __init__(self, in_channels: int, num_mha_heads: int, dropout_rate: float):
        super().__init__()
        self.block = torch.nn.ModuleDict({
            'self_attn': MultiHeadAttention(
                n_head=num_mha_heads, in_channels=in_channels, dropout_rate=dropout_rate),
            'linear': torch.nn.Sequential(
                torch.nn.LayerNorm(normalized_shape=in_channels),
                torch.nn.Linear(in_features=in_channels, out_features=in_channels),
                torch.nn.Sigmoid())
        })

    def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
        """
        Compute encoded features.
        :param torch.Tensor x: encoded source features (batch, channels, time)
        :param torch.Tensor pos_k: relative positional embeddings; its first
            dimension fixes the pooled attention length
        :return: gated-residual features, permuted to (batch, time, channels)
        """
        # Pool the time axis down to the attention resolution.
        down_len = pos_k.shape[0]
        x_down = torch.nn.functional.adaptive_avg_pool1d(input=x, output_size=down_len)
        x = x.permute([0, 2, 1])
        x_down = x_down.permute([0, 2, 1])
        x_down = self.block['self_attn'](x_down, pos_k, None)
        x_down = x_down.permute([0, 2, 1])
        # BUGFIX(deprecation): F.upsample is deprecated; F.interpolate is the
        # supported equivalent with the same default 'nearest' mode.
        x_downup = torch.nn.functional.interpolate(input=x_down, size=x.shape[1])
        x_downup = x_downup.permute([0, 2, 1])
        # Gated residual injection of the globally-attended features.
        x = x + self.block['linear'](x) * x_downup

        return x
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class CLA(torch.nn.Module):
    """Convolutional local aggregation block with a LayerScale residual.

    LayerNorm -> Linear(2C) -> GLU -> depthwise Conv1d over time -> Linear(2C)
    -> BatchNorm -> GELU/Linear(C)/Dropout, wrapped as x + LayerScale(branch).
    Input layout is [B, T, C].
    """

    def __init__(self, in_channels, kernel_size, dropout_rate, Layer_scale_init=1.0e-5):
        super().__init__()
        self.layer_norm = torch.nn.LayerNorm(in_channels)
        self.linear1 = torch.nn.Linear(in_channels, in_channels*2)
        self.GLU = torch.nn.GLU()
        self.dw_conv_1d = torch.nn.Conv1d(in_channels, in_channels, kernel_size, padding='same', groups=in_channels)
        self.linear2 = torch.nn.Linear(in_channels, 2*in_channels)
        self.BN = torch.nn.BatchNorm1d(2*in_channels)
        self.linear3 = torch.nn.Sequential(
            torch.nn.GELU(),
            torch.nn.Linear(2*in_channels, in_channels),
            torch.nn.Dropout(dropout_rate))
        self.Layer_scale = LayerScale(dims=3, input_size=in_channels, Layer_scale_init=Layer_scale_init)

    def forward(self, x):
        """Residual local-conv branch; preserves the input shape [B, T, C]."""
        branch = self.GLU(self.linear1(self.layer_norm(x)))
        # Depthwise conv and BatchNorm both expect channel-first layout,
        # hence the transpose round-trips.
        branch = self.dw_conv_1d(branch.permute([0, 2, 1])).permute(0, 2, 1)
        branch = self.linear2(branch)
        branch = self.BN(branch.permute(0, 2, 1)).permute(0, 2, 1)
        branch = self.linear3(branch)
        return x + self.Layer_scale(branch)
|
| 188 |
+
|
| 189 |
+
class GlobalBlock(torch.nn.Module):
    """Global modeling unit: EGA attention followed by a GCFN feed-forward,
    with a final axis permute back to channel-first layout."""

    def __init__(self, in_channels: int, num_mha_heads: int, dropout_rate: float):
        super().__init__()
        self.block = torch.nn.ModuleDict({
            'ega': EGA(
                num_mha_heads=num_mha_heads, in_channels=in_channels, dropout_rate=dropout_rate),
            'gcfn': GCFN(in_channels=in_channels, dropout_rate=dropout_rate)
        })

    def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
        """
        Compute encoded features.
        :param torch.Tensor x: encoded source features (batch, channels, time)
        :param torch.Tensor pos_k: relative positional embeddings for the EGA block
        :return: features permuted back to channel-first layout
        """
        attended = self.block['ega'](x, pos_k)
        refined = self.block['gcfn'](attended)
        return refined.permute([0, 2, 1])
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
class LocalBlock(torch.nn.Module):
    """Local modeling unit: convolutional aggregation (CLA) followed by a
    GCFN feed-forward. Operates on [B, T, C] tensors."""

    def __init__(self, in_channels: int, kernel_size: int, dropout_rate: float):
        super().__init__()
        self.block = torch.nn.ModuleDict({
            'cla': CLA(in_channels, kernel_size, dropout_rate),
            'gcfn': GCFN(in_channels, dropout_rate)
        })

    def forward(self, x: torch.Tensor):
        """Apply CLA then GCFN; the input shape is preserved."""
        return self.block['gcfn'](self.block['cla'](x))
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
class SpkAttention(torch.nn.Module):
    # Cross-speaker attention: attends across the speaker axis at each time
    # step (speakers are folded into the batch axis on input), then refines
    # with a GCFN feed-forward.
    def __init__(self, in_channels: int, num_mha_heads: int, dropout_rate: float):
        super().__init__()
        self.self_attn = MultiHeadAttention(n_head=num_mha_heads, in_channels=in_channels, dropout_rate=dropout_rate)
        self.feed_forward = GCFN(in_channels=in_channels, dropout_rate=dropout_rate)

    def forward(self, x: torch.Tensor, num_spk: int):
        """
        Compute encoded features.
        :param torch.Tensor x: features (B, F, T) where B = true_batch * num_spk
        :param int num_spk: number of speakers folded into the batch dimension
        :return: tensor of the same shape (B, F, T)
        """
        B, F, T = x.shape
        # Unfold speakers out of the batch: (B, F, T) -> (B/num_spk, num_spk, F, T).
        x = x.view(B//num_spk, num_spk, F, T).contiguous()
        # Bring time forward so each (batch, time) pair becomes one attention
        # "sequence" over the num_spk speaker tokens.
        x = x.permute([0, 3, 1, 2]).contiguous()
        x = x.view(-1, num_spk, F).contiguous()
        # Residual self-attention across speakers (no positional bias, no mask).
        x = x + self.self_attn(x, None, None)
        # Invert the reshapes: back to (B, F, T) with speakers folded again.
        x = x.view(B//num_spk, T, num_spk, F).contiguous()
        x = x.permute([0, 2, 3, 1]).contiguous()
        x = x.view(B, F, T).contiguous()
        # GCFN expects time-first layout (B, T, F).
        x = x.permute([0, 2, 1])
        x = self.feed_forward(x)
        x = x.permute([0, 2, 1])

        return x
|
models/SepReformer/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://github.com/dmlguq456/SepReformer
|