| import numpy as np |
| import torch |
| import librosa |
| from librosa.core import load |
| import matplotlib.pyplot as plt |
| import pysptk |
| import pyworld as pw |
| from fastdtw import fastdtw |
| from scipy import spatial |
|
|
| from librosa.filters import mel as librosa_mel_fn |
# Mel filterbank matrix shared by the spectrogram helpers in this module:
# 100 mel bands spanning 0-12000 Hz for a 1024-point FFT at a 24 kHz rate.
mel_basis = librosa_mel_fn(sr=24000, n_fft=1024, n_mels=100, fmin=0, fmax=12000)
|
|
|
|
| def _get_best_mcep_params(fs): |
| if fs == 16000: |
| return 23, 0.42 |
| elif fs == 22050: |
| return 34, 0.45 |
| elif fs == 24000: |
| return 34, 0.46 |
| elif fs == 44100: |
| return 39, 0.53 |
| elif fs == 48000: |
| return 39, 0.55 |
| else: |
| raise ValueError(f"Not found the setting for {fs}.") |
|
|
|
|
def get_mel(wav_path):
    """Compute a log-mel spectrogram for an audio file.

    The file is loaded at 24 kHz, cut to a whole number of 256-sample hops,
    reflect-padded by 384 samples on each side and analysed with a 1024-point
    Hann STFT (hop 256, no centering). The magnitudes are projected through
    the module-level ``mel_basis`` and the time axis is padded up to a
    multiple of 8 frames.

    Args:
        wav_path: Path of the audio file to load.

    Returns:
        Natural-log mel spectrogram (values floored at 1e-5 before the log),
        with the mel bands on the first axis and frames on the last.
    """
    samples, _ = load(wav_path, sr=24000)
    usable = (samples.shape[0] // 256) * 256
    samples = np.pad(samples[:usable], 384, mode='reflect')

    spec = librosa.core.stft(
        samples,
        n_fft=1024,
        hop_length=256,
        win_length=1024,
        window='hann',
        center=False,
    )
    # Magnitude with a small epsilon under the square root.
    magnitude = np.sqrt(spec.real ** 2 + spec.imag ** 2 + 1e-9)
    mel = mel_basis @ magnitude

    # Extend the frame axis to a multiple of 8 using numpy's 'minimum' mode.
    leftover = mel.shape[-1] % 8
    if leftover != 0:
        mel = np.pad(mel, ((0, 0), (0, 8 - leftover)), 'minimum')

    return np.log(np.clip(mel, a_min=1e-5, a_max=None))
|
|
|
|
def get_world_mel(wav_path=None, sr=24000, wav=None):
    """Compute the log-mel spectrogram of a WORLD resynthesis with f0 zeroed.

    The waveform is analysed with WORLD (DIO + StoneMask, CheapTrick, D4C),
    resynthesised with an all-zero f0 track, and the result is converted to a
    log-mel spectrogram with the same STFT/mel settings as ``get_mel``.

    Args:
        wav_path: Path of an audio file. When given it takes precedence over
            ``wav``; the file is always loaded at 24 kHz.
        sr: Sampling rate handed to the WORLD analysis/synthesis calls.
        wav: 1-D waveform used when ``wav_path`` is None. It is converted to
            a C-contiguous float64 array, which pyworld requires.

    Returns:
        Natural-log mel spectrogram (floored at 1e-5 before the log) whose
        frame axis is padded up to a multiple of 8.

    Raises:
        ValueError: If neither ``wav_path`` nor ``wav`` is provided.
    """
    if wav_path is None and wav is None:
        # Fail early with a clear message instead of an AttributeError on None.
        raise ValueError("Either wav_path or wav must be provided.")

    if wav_path is not None:
        wav, _ = librosa.load(wav_path, sr=24000)
        # Round-trip through int16 so the samples carry 16-bit precision,
        # then hand WORLD the float64 it expects.
        wav = (wav * 32767).astype(np.int16)
        wav = (wav / 32767).astype(np.float64)
    else:
        # pyworld only accepts C-contiguous double arrays.
        wav = np.ascontiguousarray(wav, dtype=np.float64)

    # Keep a whole number of 256-sample hops.
    wav = wav[:(wav.shape[0] // 256) * 256]

    _f0, t = pw.dio(wav, sr)
    f0 = pw.stonemask(wav, _f0, t, sr)
    sp = pw.cheaptrick(wav, f0, t, sr)
    ap = pw.d4c(wav, f0, t, sr)
    # Synthesise from the envelope/aperiodicity only: the f0 track is zeroed
    # (presumably so WORLD renders every frame as unvoiced - TODO confirm).
    wav_hat = pw.synthesize(f0 * 0, sp, ap, sr)

    # Trim the resynthesis to the analysed length.
    wav_hat = wav_hat[:len(wav)]
    assert len(wav_hat) == len(wav)

    wav = wav_hat.astype(np.float32)
    wav = np.pad(wav, 384, mode='reflect')
    stft = librosa.core.stft(wav, n_fft=1024, hop_length=256, win_length=1024, window='hann', center=False)
    stftm = np.sqrt(np.real(stft) ** 2 + np.imag(stft) ** 2 + (1e-9))
    mel_spectrogram = np.matmul(mel_basis, stftm)

    # Pad the frame axis up to a multiple of 8.
    remainder = mel_spectrogram.shape[-1] % 8
    if remainder != 0:
        mel_spectrogram = np.pad(mel_spectrogram, ((0, 0), (0, 8 - remainder)), 'minimum')

    log_mel_spectrogram = np.log(np.clip(mel_spectrogram, a_min=1e-5, a_max=None))
    return log_mel_spectrogram
|
|
|
|
def get_f0(wav_path, method='pyin', padding=True):
    """Extract an f0 contour from an audio file at a 256-sample hop (24 kHz).

    Args:
        wav_path: Path of the audio file to load.
        method: ``'pyin'`` (librosa pYIN, unvoiced frames filled with 0) or
            ``'world'`` (pyworld DIO refined by StoneMask). Both search the
            C2-C6 range.
        padding: When exactly ``True``, zero-pad the contour at the end so
            its length is a multiple of 8.

    Returns:
        1-D array of f0 values.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method not in ('pyin', 'world'):
        # Previously an unknown method fell through and failed later with an
        # UnboundLocalError on ``f0``.
        raise ValueError(f"Unknown f0 extraction method: {method!r}")

    if method == 'pyin':
        wav, sr = load(wav_path, sr=24000)
        # Whole number of hops, then the same reflect padding as get_mel.
        wav = wav[:(wav.shape[0] // 256) * 256]
        wav = np.pad(wav, 384, mode='reflect')
        f0, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=256, center=False, sr=24000,
                                fmin=librosa.note_to_hz('C2'),
                                fmax=librosa.note_to_hz('C6'), fill_na=0)
    else:
        wav, sr = librosa.load(wav_path, sr=24000)
        # Round-trip through int16, then convert to the float64 WORLD needs.
        wav = (wav * 32767).astype(np.int16)
        wav = (wav / 32767).astype(np.float64)
        # frame_period is in milliseconds: one frame per 256 samples.
        _f0, t = pw.dio(wav, fs=24000, frame_period=256/sr*1000,
                        f0_floor=librosa.note_to_hz('C2'),
                        f0_ceil=librosa.note_to_hz('C6'))
        f0 = pw.stonemask(wav, _f0, t, sr)
        # Drop the final frame (presumably to match the frame count of the
        # 'pyin' branch - TODO confirm).
        f0 = f0[:-1]

    if padding is True:
        remainder = f0.shape[-1] % 8
        if remainder != 0:
            f0 = np.pad(f0, (0, 8 - remainder), 'constant', constant_values=0)

    return f0
|
|
|
|
def get_mcep(x, n_fft=1024, n_shift=256, sr=24000):
    """Compute frame-wise mel-cepstral coefficients for an audio file.

    Args:
        x: Path of the audio file to load.
        n_fft: Analysis window length in samples.
        n_shift: Hop between consecutive frames in samples.
        sr: Sampling rate to load the file at. It must be a rate known to
            ``_get_best_mcep_params``.

    Returns:
        2-D array with one row of mel-cepstral coefficients per frame
        (``len(signal) // n_shift`` frames).

    Raises:
        ValueError: Propagated from ``_get_best_mcep_params`` when no
            setting exists for the sampling rate.
    """
    # Honour the requested rate; it used to be overwritten by a fixed 24000.
    signal, sr = load(x, sr=sr)
    n_frame = signal.shape[0] // n_shift

    # Pad so that every one of the n_frame windows fits inside the signal.
    # With the defaults this is the original 384 samples on each side.
    pad_total = max(n_fft - n_shift, 0)
    pad_left = pad_total // 2
    signal = np.pad(signal, (pad_left, pad_total - pad_left), mode='reflect')

    win = pysptk.sptk.hamming(n_fft)
    mcep_dim, mcep_alpha = _get_best_mcep_params(sr)
    mcep = [
        pysptk.mcep(
            signal[n_shift * i: n_shift * i + n_fft] * win,
            mcep_dim,
            mcep_alpha,
            eps=1e-6,
            etype=1,
        )
        for i in range(n_frame)
    ]
    return np.stack(mcep)
|
|
|
|
def get_matched_f0(x, y, method='world', n_fft=1024, n_shift=256):
    """Return the f0 contour of ``y`` time-aligned to the frames of ``x``.

    Mel-cepstra of both files are aligned with FastDTW; for every frame of
    ``x`` the first ``y`` frame paired with it on the warping path supplies
    the f0 value.

    Args:
        x: Path of the audio file that defines the target timing.
        y: Path of the audio file whose f0 is extracted.
        method: f0 extraction method forwarded to ``get_f0``.
        n_fft: Window length forwarded to ``get_mcep``.
        n_shift: Hop size forwarded to ``get_mcep``. ``get_f0`` always uses a
            256-sample hop, so other values leave f0 frames and mel-cepstrum
            frames on different grids.

    Returns:
        1-D f0 array with one value per frame of ``x``, zero-padded at the
        end to a multiple of 8.
    """
    f0_y = get_f0(y, method=method, padding=False)

    mcep_x = get_mcep(x, n_fft=n_fft, n_shift=n_shift)
    mcep_y = get_mcep(y, n_fft=n_fft, n_shift=n_shift)

    _, path = fastdtw(mcep_x, mcep_y, dist=spatial.distance.euclidean)
    # Row 0: frame indices into x, row 1: the paired frame indices into y.
    twf = np.array(path).T

    # One entry per x frame: the position on the path where that x index
    # first appears. (The loop used to run over len(f0_y), which truncated
    # the result or appended bogus entries whenever x and y differ in length.)
    _, first_hit = np.unique(twf[0], return_index=True)
    nearest = twf[1][first_hit]

    f0_y = f0_y[nearest]

    remainder = f0_y.shape[-1] % 8
    if remainder != 0:
        f0_y = np.pad(f0_y, (0, 8 - remainder), 'constant', constant_values=0)

    return f0_y
|
|
|
|
def f0_to_coarse(f0, hparams):
    """Quantise an f0 contour (Hz) into integer bins on a mel scale.

    Unvoiced frames (``f0 == 0``) map to bin 0. Voiced frames are mapped
    linearly in mel between ``f0_min`` (bin 1) and ``f0_max``
    (bin ``f0_bin - 1``) and clipped to that range.

    Args:
        f0: ``torch.Tensor`` or numpy array of f0 values in Hz.
        hparams: Mapping providing ``'f0_bin'``, ``'f0_max'`` and
            ``'f0_min'``.

    Returns:
        Integer bins with the same shape as ``f0``: a long tensor for torch
        input, otherwise an int numpy array.
    """
    f0_bin = hparams['f0_bin']
    f0_max = hparams['f0_max']
    f0_min = hparams['f0_min']
    is_torch = isinstance(f0, torch.Tensor)

    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)
    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)

    # Remember the unvoiced frames before rescaling overwrites them.
    unvoiced = (f0_mel == 0)

    # Linear map [f0_mel_min, f0_mel_max] -> [1, f0_bin - 1].
    voiced = f0_mel > 0
    f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1

    # Clip out-of-range pitches into the valid voiced bins.
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1

    f0_mel[unvoiced] = 0

    f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int)
    # Bound by the configured bin count rather than a hard-coded 255, so
    # f0_bin values above 256 no longer trip the check.
    assert f0_coarse.max() <= f0_bin - 1 and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min())
    return f0_coarse
|
|
|
|
def log_f0(f0, hparams):
    """Quantise an f0 contour (Hz) into integer bins on a semitone scale.

    Unvoiced frames (``f0 == 0``) map to bin 0. Voiced frames are expressed
    in semitones above ``f0_min``, mapped linearly so that ``f0_min`` lands
    on bin 1 and ``f0_max`` on bin ``f0_bin - 1``, and clipped to that range.

    Args:
        f0: Array-like of f0 values in Hz. Integer input is processed in
            float64 so intermediate values are not truncated.
        hparams: Mapping providing ``'f0_bin'``, ``'f0_max'`` and
            ``'f0_min'``.

    Returns:
        Int numpy array of bins with the same shape as ``f0``.
    """
    f0_bin = hparams['f0_bin']
    f0_max = hparams['f0_max']
    f0_min = hparams['f0_min']

    f0 = np.asarray(f0)
    # Keep float inputs in their own precision; promote everything else so
    # the semitone values are not silently truncated to integers.
    work_dtype = f0.dtype if np.issubdtype(f0.dtype, np.floating) else np.float64
    voiced = f0 != 0

    f0_mel = np.zeros(f0.shape, dtype=work_dtype)
    f0_mel[voiced] = 12 * np.log2(f0[voiced] / f0_min) + 1
    f0_mel_min = 12 * np.log2(f0_min / f0_min) + 1
    f0_mel_max = 12 * np.log2(f0_max / f0_min) + 1

    # Linear map [f0_mel_min, f0_mel_max] -> [1, f0_bin - 1].
    positive = f0_mel > 0
    f0_mel[positive] = (f0_mel[positive] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1

    # Clip out-of-range pitches into the valid voiced bins.
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1

    # Unvoiced is decided on the input itself, not on the transformed value.
    f0_mel[~voiced] = 0

    f0_coarse = np.rint(f0_mel).astype(int)
    assert f0_coarse.max() <= (f0_bin - 1) and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min())
    return f0_coarse
|
|
|
|
def show_plot(tensor):
    """Display a tensor as a heat-map image with a colour bar.

    Args:
        tensor: Object supporting ``.squeeze().cpu()`` whose squeezed form
            can be rendered by ``imshow``. The first axis is drawn bottom-up
            (``origin="lower"``).
    """
    image = tensor.squeeze().cpu()

    figure, axis = plt.subplots(figsize=(12, 3))
    heatmap = axis.imshow(
        image,
        aspect="auto",
        origin="lower",
        interpolation='none',
    )
    plt.colorbar(heatmap, ax=axis)
    plt.tight_layout()
    figure.canvas.draw()
    plt.show()
|
|
|
|
if __name__ == '__main__':
    # Manual smoke run: extract the log-mel spectrogram and the default
    # (pYIN) f0 contour from 'target.wav' in the current working directory.
    mel = get_mel('target.wav')
    f0 = get_f0('target.wav')