# deepfake-audio-detector / src/data/preprocessing.py
"""
Audio preprocessing for ASVspoof 2019 LA.
Given a .flac path, produces a list of fixed-length 4-second segments at 16kHz
mono, ready to feed into Wav2Vec 2.0.
Pipeline:
load .flac -> ensure mono -> ensure 16kHz -> window with 50% overlap
(short clips are zero-padded to one full window)
"""
from typing import List
from typing import Optional

import torch
import torchaudio
import torchaudio.functional as F
SAMPLE_RATE = 16000
WINDOW_SECONDS = 4.0
OVERLAP_RATIO = 0.5
WINDOW_SAMPLES = int(SAMPLE_RATE * WINDOW_SECONDS) # 64000
HOP_SAMPLES = int(WINDOW_SAMPLES * (1.0 - OVERLAP_RATIO)) # 32000
def load_audio(path: str, target_sr: int = SAMPLE_RATE) -> torch.Tensor:
"""Load a .flac file and return a 1-D mono waveform at target_sr."""
waveform, sr = torchaudio.load(path)
if waveform.shape[0] > 1:
waveform = waveform.mean(dim=0, keepdim=True)
if sr != target_sr:
waveform = F.resample(waveform, orig_freq=sr, new_freq=target_sr)
return waveform.squeeze(0)
def segment_waveform(
waveform: torch.Tensor,
window_samples: int = WINDOW_SAMPLES,
hop_samples: int = HOP_SAMPLES,
) -> List[torch.Tensor]:
"""Split a 1-D waveform into fixed-length windows with overlap."""
n = waveform.shape[0]
if n <= window_samples:
padded = torch.zeros(window_samples, dtype=waveform.dtype)
padded[:n] = waveform
return [padded]
windows = []
start = 0
while start < n:
end = start + window_samples
if end <= n:
windows.append(waveform[start:end])
else:
tail = waveform[start:]
padded = torch.zeros(window_samples, dtype=waveform.dtype)
padded[:tail.shape[0]] = tail
windows.append(padded)
break
start += hop_samples
return windows
def preprocess(path: str) -> List[torch.Tensor]:
"""Full pipeline: load + window."""
waveform = load_audio(path)
return segment_waveform(waveform)