| """ |
| Audio preprocessing for ASVspoof 2019 LA. |
| |
| Given a .flac path, produces a list of fixed-length 4-second segments at 16kHz |
| mono, ready to feed into Wav2Vec 2.0. |
| |
| Pipeline: |
| load .flac -> ensure mono -> ensure 16kHz -> window with 50% overlap |
| (short clips are zero-padded to one full window) |
| """ |
|
|
from typing import List, Optional

import torch
import torchaudio
import torchaudio.functional as F
|
|
|
|
| SAMPLE_RATE = 16000 |
| WINDOW_SECONDS = 4.0 |
| OVERLAP_RATIO = 0.5 |
|
|
| WINDOW_SAMPLES = int(SAMPLE_RATE * WINDOW_SECONDS) |
| HOP_SAMPLES = int(WINDOW_SAMPLES * (1.0 - OVERLAP_RATIO)) |
|
|
|
|
| def load_audio(path: str, target_sr: int = SAMPLE_RATE) -> torch.Tensor: |
| """Load a .flac file and return a 1-D mono waveform at target_sr.""" |
| waveform, sr = torchaudio.load(path) |
| if waveform.shape[0] > 1: |
| waveform = waveform.mean(dim=0, keepdim=True) |
| if sr != target_sr: |
| waveform = F.resample(waveform, orig_freq=sr, new_freq=target_sr) |
| return waveform.squeeze(0) |
|
|
|
|
| def segment_waveform( |
| waveform: torch.Tensor, |
| window_samples: int = WINDOW_SAMPLES, |
| hop_samples: int = HOP_SAMPLES, |
| ) -> List[torch.Tensor]: |
| """Split a 1-D waveform into fixed-length windows with overlap.""" |
| n = waveform.shape[0] |
| if n <= window_samples: |
| padded = torch.zeros(window_samples, dtype=waveform.dtype) |
| padded[:n] = waveform |
| return [padded] |
|
|
| windows = [] |
| start = 0 |
| while start < n: |
| end = start + window_samples |
| if end <= n: |
| windows.append(waveform[start:end]) |
| else: |
| tail = waveform[start:] |
| padded = torch.zeros(window_samples, dtype=waveform.dtype) |
| padded[:tail.shape[0]] = tail |
| windows.append(padded) |
| break |
| start += hop_samples |
| return windows |
|
|
|
|
| def preprocess(path: str) -> List[torch.Tensor]: |
| """Full pipeline: load + window.""" |
| waveform = load_audio(path) |
| return segment_waveform(waveform) |
|
|