"""
Audio preprocessing for ASVspoof 2019 LA.
Given a .flac path, produces a list of fixed-length 4-second segments at 16kHz
mono, ready to feed into Wav2Vec 2.0.
Pipeline:
load .flac -> ensure mono -> ensure 16kHz -> window with 50% overlap
(short clips are zero-padded to one full window)
"""
from typing import List
import torch
import torchaudio
import torchaudio.functional as F
SAMPLE_RATE = 16000
WINDOW_SECONDS = 4.0
OVERLAP_RATIO = 0.5
WINDOW_SAMPLES = int(SAMPLE_RATE * WINDOW_SECONDS) # 64000
HOP_SAMPLES = int(WINDOW_SAMPLES * (1.0 - OVERLAP_RATIO)) # 32000
def load_audio(path: str, target_sr: int = SAMPLE_RATE) -> torch.Tensor:
    """Read an audio file and return a 1-D mono waveform at ``target_sr``.

    Multi-channel input is downmixed by averaging the channels, and the
    sample rate is converted only when it differs from ``target_sr``.
    """
    waveform, native_sr = torchaudio.load(path)
    # Downmix to mono first so resampling runs on a single channel.
    num_channels = waveform.shape[0]
    if num_channels > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    needs_resample = native_sr != target_sr
    if needs_resample:
        waveform = F.resample(waveform, orig_freq=native_sr, new_freq=target_sr)
    # Drop the channel dimension: (1, n) -> (n,).
    return waveform.squeeze(0)
def segment_waveform(
    waveform: torch.Tensor,
    window_samples: int = WINDOW_SAMPLES,
    hop_samples: int = HOP_SAMPLES,
) -> List[torch.Tensor]:
    """Split a 1-D waveform into fixed-length windows with overlap.

    Args:
        waveform: 1-D tensor of audio samples.
        window_samples: Length of each output window, in samples.
        hop_samples: Step between consecutive window starts, in samples.

    Returns:
        A list of tensors, each exactly ``window_samples`` long. A clip
        shorter than one window is zero-padded to a single full window;
        a partial tail window is zero-padded likewise. Padding tensors
        inherit the input's dtype and device.
    """
    n = waveform.shape[0]
    if n <= window_samples:
        # Short clip: zero-pad up to one full window. Match device as well
        # as dtype so CUDA inputs don't fail on the copy below.
        padded = torch.zeros(
            window_samples, dtype=waveform.dtype, device=waveform.device
        )
        padded[:n] = waveform
        return [padded]
    windows: List[torch.Tensor] = []
    start = 0
    while start < n:
        end = start + window_samples
        if end <= n:
            windows.append(waveform[start:end])
            if end == n:
                # Bug fix: when a full window ends exactly at the signal
                # boundary, the old code looped once more and emitted a
                # redundant zero-padded window whose samples were entirely
                # contained in this one. Stop here instead.
                break
        else:
            # Partial tail: pad the remaining samples out to a full window.
            tail = waveform[start:]
            padded = torch.zeros(
                window_samples, dtype=waveform.dtype, device=waveform.device
            )
            padded[: tail.shape[0]] = tail
            windows.append(padded)
            break
        start += hop_samples
    return windows
def preprocess(path: str) -> List[torch.Tensor]:
    """Run the whole pipeline on one file: load the audio, then window it."""
    return segment_waveform(load_audio(path))