| import os |
| import os.path as osp |
| import sys |
| import time |
| from collections import defaultdict |
|
|
| import matplotlib |
| import numpy as np |
| import soundfile as sf |
| import torch |
| from torch import nn |
| import jiwer |
|
|
| import matplotlib.pylab as plt |
| import functools |
| import os |
| import random |
| import traceback |
| from pathlib import Path |
| from typing import Any, Dict, List, Optional, Tuple |
|
|
| import librosa |
| import numpy as np |
| import torch |
| from einops import rearrange |
| from scipy import ndimage |
| from torch.special import gammaln |
|
|
|
|
def calc_wer(target, pred, ignore_indexes=(0,)):
    """Compute the word error rate between two symbol-index sequences.

    Symbols listed in ``ignore_indexes`` are dropped, consecutive duplicates
    are collapsed (CTC-style), and the remaining tokens are space-joined and
    scored with ``jiwer.wer``.

    Args:
        target: iterable of reference symbol indexes.
        pred: iterable of predicted symbol indexes.
        ignore_indexes: symbols to drop before scoring (e.g. blank/pad 0).

    Returns:
        float: the word error rate reported by jiwer.
    """
    # Compare against the *stringified* ignore set. The original compared
    # str tokens against the int default [0], so nothing was ever filtered.
    # Also use an immutable tuple default instead of a shared mutable list.
    ignore = {str(i) for i in ignore_indexes}
    target_chars = drop_duplicated([c for c in map(str, target) if c not in ignore])
    pred_chars = drop_duplicated([c for c in map(str, pred) if c not in ignore])
    target_str = ' '.join(target_chars)
    pred_str = ' '.join(pred_chars)
    error = jiwer.wer(target_str, pred_str)
    return error
|
|
def drop_duplicated(chars):
    """Collapse runs of equal adjacent items to a single item.

    E.g. ``['a', 'a', 'b', 'a']`` -> ``['a', 'b', 'a']``.

    Args:
        chars: sequence of comparable items.

    Returns:
        list with consecutive duplicates removed. An empty input yields an
        empty list (the original raised IndexError on ``chars[0]``).
    """
    if not chars:
        return []
    deduped = [chars[0]]
    for prev, curr in zip(chars, chars[1:]):
        if curr != prev:
            deduped.append(curr)
    return deduped
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
def build_criterion(critic_params=None):
    """Build the dictionary of losses used for training.

    Args:
        critic_params: optional dict of per-criterion kwargs. Only the
            ``'ctc'`` key is consulted; its value is forwarded to
            ``torch.nn.CTCLoss``.

    Returns:
        dict with keys ``'ce'`` (CrossEntropyLoss, ignore_index=-1) and
        ``'ctc'`` (CTCLoss).
    """
    # Avoid the shared-mutable-default-argument pitfall ({} as default).
    if critic_params is None:
        critic_params = {}
    criterion = {
        "ce": nn.CrossEntropyLoss(ignore_index=-1),
        "ctc": torch.nn.CTCLoss(**critic_params.get('ctc', {})),
    }
    return criterion
|
|
|
|
|
|
def get_data_path_list(train_path=None, val_path=None):
    """Read the training and validation file lists.

    Args:
        train_path: path to the train list; defaults to "Data/train_list.txt".
        val_path: path to the val list; defaults to "Data/val_list.txt".

    Returns:
        (train_list, val_list): the raw lines (newlines preserved) of each file.

    Raises:
        OSError: if either file cannot be opened.
    """
    if train_path is None:
        train_path = "Data/train_list.txt"
    if val_path is None:
        val_path = "Data/val_list.txt"

    def _read_lines(path):
        # Explicit encoding so reads do not depend on the platform locale.
        with open(path, 'r', encoding='utf-8') as f:
            return f.readlines()

    return _read_lines(train_path), _read_lines(val_path)
|
|
|
|
def plot_image(image):
    """Render a 2-D array (e.g. a mel-spectrogram) as a matplotlib figure.

    The canvas is drawn and the pyplot-managed figure is closed immediately,
    so the returned figure can be handed to a logger (e.g. tensorboard)
    without leaking open figures.

    Args:
        image: 2-D array-like to display.

    Returns:
        matplotlib.figure.Figure: the rendered figure.
    """
    figure, axis = plt.subplots(figsize=(10, 2))
    axis.imshow(image, aspect="auto", origin="lower", interpolation='none')
    # Force rendering now so the canvas is populated even after close().
    figure.canvas.draw()
    plt.close()
    return figure
|
|
|
|
|
|
class PartialConv1d(torch.nn.Conv1d):
    """Conv1d with partial padding.

    Zero padding creates a unique identifier for where the edge of the data is,
    such that the model can almost always identify exactly where it is relative
    to either edge given a sufficient receptive field. Partial padding goes to
    some lengths to remove this effect: each output position is re-normalized
    by the fraction of valid (unmasked) samples that fell under its window.
    """

    __constants__ = ['slide_winsize']
    slide_winsize: float

    def __init__(self, *args, **kwargs):
        super(PartialConv1d, self).__init__(*args, **kwargs)
        # All-ones kernel used to count valid input samples per output step.
        weight_maskUpdater = torch.ones(1, 1, self.kernel_size[0])
        self.register_buffer("weight_maskUpdater", weight_maskUpdater, persistent=False)
        # Number of taps in a full window — denominator of the mask ratio.
        self.slide_winsize = self.weight_maskUpdater.shape[1] * self.weight_maskUpdater.shape[2]

    def forward(self, input, mask_in):
        """Apply the convolution, re-weighting outputs by the valid-sample ratio.

        Args:
            input: (B, C, T) input signal.
            mask_in: binary validity mask broadcastable to (B, 1, T),
                or None to treat every sample as valid.

        Returns:
            Convolution output with partially-padded windows re-normalized
            and fully-invalid positions zeroed.
        """
        if mask_in is None:
            mask = torch.ones(1, 1, input.shape[2], dtype=input.dtype, device=input.device)
        else:
            mask = mask_in
        input = torch.mul(input, mask)
        with torch.no_grad():
            # Count of valid samples contributing to each output position.
            # BUGFIX: the original called `F.conv1d`, but `F` was never
            # imported in this module; use the fully-qualified name.
            update_mask = torch.nn.functional.conv1d(
                mask,
                self.weight_maskUpdater,
                bias=None,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                groups=1,
            )
            # Guard against division by zero for windows with no valid samples.
            update_mask_filled = torch.masked_fill(update_mask, update_mask == 0, self.slide_winsize)
            mask_ratio = self.slide_winsize / update_mask_filled
            update_mask = torch.clamp(update_mask, 0, 1)
            mask_ratio = torch.mul(mask_ratio, update_mask)

        raw_out = self._conv_forward(input, self.weight, self.bias)

        if self.bias is not None:
            # Re-normalize only the linear part of the output, then re-add
            # the bias and zero fully-invalid positions.
            bias_view = self.bias.view(1, self.out_channels, 1)
            output = torch.mul(raw_out - bias_view, mask_ratio) + bias_view
            output = torch.mul(output, update_mask)
        else:
            output = torch.mul(raw_out, mask_ratio)

        return output
|
|
|
|
class LinearNorm(torch.nn.Module):
    """A Linear layer whose weight is Xavier-uniform initialized.

    Args:
        in_dim: input feature size.
        out_dim: output feature size.
        bias: whether the underlying Linear has a bias term.
        w_init_gain: nonlinearity name passed to ``calculate_gain``.
    """

    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super().__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
        gain = torch.nn.init.calculate_gain(w_init_gain)
        torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=gain)

    def forward(self, x):
        """Apply the linear projection to ``x``."""
        return self.linear_layer(x)
|
|
|
|
class ConvNorm(torch.nn.Module):
    """1-D convolution with Xavier init, optional partial padding, optional
    weight norm, and an optional trailing normalization layer.

    With ``padding=None`` and an odd ``kernel_size``, "same" padding is
    computed automatically from the dilation.
    """

    __constants__ = ['use_partial_padding']
    use_partial_padding: bool

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=1,
        stride=1,
        padding=None,
        dilation=1,
        bias=True,
        w_init_gain='linear',
        use_partial_padding=False,
        use_weight_norm=False,
        norm_fn=None,
    ):
        super(ConvNorm, self).__init__()
        if padding is None:
            # Automatic "same" padding only makes sense for odd kernels.
            assert kernel_size % 2 == 1
            padding = int(dilation * (kernel_size - 1) / 2)
        self.use_partial_padding = use_partial_padding
        conv_cls = torch.nn.Conv1d
        if use_partial_padding:
            conv_cls = PartialConv1d
        self.conv = conv_cls(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )
        gain = torch.nn.init.calculate_gain(w_init_gain)
        torch.nn.init.xavier_uniform_(self.conv.weight, gain=gain)
        if use_weight_norm:
            self.conv = torch.nn.utils.weight_norm(self.conv)
        self.norm = norm_fn(out_channels, affine=True) if norm_fn is not None else None

    def forward(self, signal, mask=None):
        """Run the convolution; ``mask`` zeroes out invalid positions.

        Args:
            signal: (B, C, T) input.
            mask: optional validity mask broadcastable to (B, 1, T).
        """
        if self.use_partial_padding:
            out = self.conv(signal, mask)
            if self.norm is not None:
                # Partial-padding norm layers take the mask as well.
                out = self.norm(out, mask)
            return out
        if mask is not None:
            signal = signal.mul(mask)
        out = self.conv(signal)
        if self.norm is not None:
            out = self.norm(out)
        return out
|
|
|
|
|
|
class BetaBinomialInterpolator:
    """
    This module calculates alignment prior matrices (based on beta-binomial distribution) using cached popular sizes and image interpolation.
    The implementation is taken from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
    """

    def __init__(self, round_mel_len_to=50, round_text_len_to=10, cache_size=500, scaling_factor: float = 1.0):
        self.round_mel_len_to = round_mel_len_to
        self.round_text_len_to = round_text_len_to

        # Named closure instead of a lambda; cached so priors for popular
        # (rounded) sizes are computed only once.
        def compute_prior(first_len, second_len):
            return beta_binomial_prior_distribution(first_len, second_len, scaling_factor=scaling_factor)

        self.bank = functools.lru_cache(maxsize=cache_size)(compute_prior)

    @staticmethod
    def round(val, to):
        """Round ``val`` to a positive multiple of ``to``."""
        return max(1, int(np.round((val + 1) / to))) * to

    def __call__(self, w, h):
        """Return a (w, h) prior matrix interpolated from the cached size."""
        rounded_w = BetaBinomialInterpolator.round(w, to=self.round_mel_len_to)
        rounded_h = BetaBinomialInterpolator.round(h, to=self.round_text_len_to)
        prior = ndimage.zoom(self.bank(rounded_w, rounded_h).T, zoom=(w / rounded_w, h / rounded_h), order=1)
        assert prior.shape[0] == w, prior.shape
        assert prior.shape[1] == h, prior.shape
        return prior
|
|
|
|
def general_padding(item, item_len, max_len, pad_value=0):
    """Right-pad ``item`` along its last dimension up to ``max_len``.

    Args:
        item: tensor whose last dimension has length ``item_len``.
        item_len: current length of the last dimension.
        max_len: target length.
        pad_value: fill value for the padding.

    Returns:
        The padded tensor, or ``item`` unchanged when already long enough.
    """
    pad_amount = max_len - item_len
    if pad_amount > 0:
        item = torch.nn.functional.pad(item, (0, pad_amount), value=pad_value)
    return item
|
|
|
|
def stack_tensors(tensors: List[torch.Tensor], max_lens: List[int], pad_value: float = 0.0) -> torch.Tensor:
    """
    Create batch by stacking input tensor list along the time axes.

    Args:
        tensors: List of tensors to pad and stack
        max_lens: List of lengths to pad each axis to, starting with the last axis
        pad_value: Value for padding

    Returns:
        Padded and stacked tensor.
    """
    padded = []
    for tensor in tensors:
        # F.pad's spec is last-dim-first: (0, pad_last, 0, pad_second_last, ...)
        pad_spec: List[int] = []
        for axis, target_len in enumerate(max_lens, start=1):
            pad_spec.extend((0, target_len - tensor.shape[-axis]))
        padded.append(torch.nn.functional.pad(tensor, pad=pad_spec, value=pad_value))
    return torch.stack(padded)
|
|
|
|
def logbeta(x, y):
    """Log of the beta function: lgamma(x) + lgamma(y) - lgamma(x + y)."""
    log_total = gammaln(x + y)
    return gammaln(x) + gammaln(y) - log_total
|
|
|
|
def logcombinations(n, k):
    """Log binomial coefficient: lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1)."""
    return gammaln(n + 1) - (gammaln(k + 1) + gammaln(n - k + 1))
|
|
|
|
def logbetabinom(n, a, b, x):
    """Log-pmf of the beta-binomial distribution at count ``x``."""
    log_choose = logcombinations(n, x)
    log_ratio = logbeta(x + a, n - x + b) - logbeta(a, b)
    return log_choose + log_ratio
|
|
|
|
def beta_binomial_prior_distribution(phoneme_count: int, mel_count: int, scaling_factor: float = 1.0) -> np.ndarray:
    """Compute a beta-binomial alignment prior of shape (mel_count, phoneme_count).

    Each mel frame gets a beta-binomial pmf over phoneme positions whose mode
    moves linearly across the sequence, favoring near-diagonal alignments.

    Args:
        phoneme_count: number of text tokens (columns).
        mel_count: number of mel frames (rows).
        scaling_factor: sharpness of the prior (larger = more peaked).

    Returns:
        np.ndarray of pmf values. (The original annotated the return as
        ``np.array``, which is a function, not a type — fixed to np.ndarray.)
    """
    # (1, phoneme_count) support and (mel_count, 1) frame indexes; broadcasting
    # produces the full (mel_count, phoneme_count) grid. Plain unsqueeze
    # replaces the einops rearrange with no behavior change.
    x = torch.arange(0, phoneme_count).unsqueeze(0)
    y = torch.arange(1, mel_count + 1).unsqueeze(1)
    a = scaling_factor * y
    b = scaling_factor * (mel_count + 1 - y)
    n = torch.FloatTensor([phoneme_count - 1])

    return logbetabinom(n, a, b, x).exp().numpy()
|
|
|
|
| |
|
|