| |
| |
| |
| |
|
|
| import torch |
| import numpy as np |
| from numpy import linalg as LA |
| import librosa |
| import soundfile as sf |
| import librosa.filters |
|
|
|
|
def load_audio_torch(wave_file, fs):
    """Load an audio file as a mono float tensor scaled into [-1, 1].

    Args:
        wave_file (str): path to wave file
        fs (int or None): target sample rate; when ``None`` the rate
            reported by ``librosa.load`` is used and returned.

    Returns:
        audio (tensor): 1-D float32 audio tensor. An empty list is returned
            instead when the decoded signal contains NaN or Inf values.
        fs (int): sample rate of the returned audio
    """
    audio, sample_rate = librosa.load(wave_file, sr=fs, mono=True)

    assert len(audio) > 2, "audio must contain more than 2 samples"

    # Pick the divisor that maps the signal into [-1, 1].
    if np.issubdtype(audio.dtype, np.integer):
        # Integer PCM: divide by the full-scale magnitude of the dtype.
        max_mag = -np.iinfo(audio.dtype).min
    else:
        # Float data: guess the original scale from the observed peak.
        # Presumably peaks above 2**15 mean 32-bit integer range and peaks
        # above 1.01 mean 16-bit integer range; otherwise the data is
        # treated as already normalised.
        max_mag = max(np.amax(audio), -np.amin(audio))
        max_mag = (
            (2**31) + 1
            if max_mag > (2**15)
            else ((2**15) + 1 if max_mag > 1.01 else 1.0)
        )

    audio = torch.FloatTensor(audio.astype(np.float32)) / max_mag

    # Corrupt signal: hand back an empty result with a usable sample rate.
    if (torch.isnan(audio) | torch.isinf(audio)).any():
        return [], sample_rate or fs or 48000

    # Safety net: resample if the loader returned a rate other than the
    # requested one.
    if fs is not None and fs != sample_rate:
        audio = torch.from_numpy(
            librosa.core.resample(audio.numpy(), orig_sr=sample_rate, target_sr=fs)
        )
        sample_rate = fs

    # Return the effective rate; returning `fs` would yield None whenever the
    # caller asked for the file's native sample rate.
    return audio, sample_rate
|
|
|
|
def _stft(y, cfg):
    """Compute the short-time Fourier transform of ``y`` via ``librosa.stft``.

    The FFT size, hop length and window length are read from ``cfg.n_fft``,
    ``cfg.hop_size`` and ``cfg.win_size`` respectively.
    """
    stft_params = {
        "n_fft": cfg.n_fft,
        "hop_length": cfg.hop_size,
        "win_length": cfg.win_size,
    }
    return librosa.stft(y=y, **stft_params)
|
|
|
|
def energy(wav, cfg):
    """Return the L2 norm of the STFT magnitudes of ``wav``.

    The magnitude matrix from ``_stft`` is transposed and the norm is taken
    along axis 1 of the transposed matrix, i.e. one value per column of the
    STFT output.
    """
    spectrum = _stft(wav, cfg)
    transposed_magnitudes = np.transpose(np.abs(spectrum))
    per_row_norm = LA.norm(transposed_magnitudes, axis=1)
    return per_row_norm
|
|
|
|
def get_energy_from_tacotron(audio, _stft):
    """Compute the mel output and energy of ``audio`` with ``_stft``.

    Args:
        audio: 1-D sequence of samples (list, numpy array or tensor). Values
            are clipped to [-1, 1] and a leading batch dimension is added.
        _stft: object exposing ``mel_spectrogram(batch)`` that returns a
            ``(mel, energy)`` pair of tensors.

    Returns:
        mel: the first value returned by ``_stft.mel_spectrogram``, unchanged.
        energy (np.ndarray): float32 array, with the leading batch dimension
            squeezed out.
    """
    # `torch.as_tensor` replaces the legacy `torch.FloatTensor` constructor,
    # and the deprecated `torch.autograd.Variable` wrapper is dropped.
    samples = torch.as_tensor(audio, dtype=torch.float32)
    batch = torch.clip(samples.unsqueeze(0), -1, 1)

    mel, frame_energy = _stft.mel_spectrogram(batch)

    # Detach so `.numpy()` also works if the result tracks gradients.
    energy_np = torch.squeeze(frame_energy, 0).detach().numpy().astype(np.float32)
    return mel, energy_np
|
|