File size: 1,990 Bytes
b95ff5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""
Audio preprocessing for ASVspoof 2019 LA.

Given a .flac path, produces a list of fixed-length 4-second segments at 16kHz
mono, ready to feed into Wav2Vec 2.0.

Pipeline:
    load .flac -> ensure mono -> ensure 16kHz -> window with 50% overlap
    (short clips are zero-padded to one full window)
"""

from typing import List
import torch
import torchaudio
import torchaudio.functional as F


SAMPLE_RATE = 16000         # target sample rate in Hz (pipeline feeds Wav2Vec 2.0)
WINDOW_SECONDS = 4.0        # fixed segment length, in seconds
OVERLAP_RATIO = 0.5         # fraction of each window shared with the next

WINDOW_SAMPLES = int(SAMPLE_RATE * WINDOW_SECONDS)              # 64000
HOP_SAMPLES = int(WINDOW_SAMPLES * (1.0 - OVERLAP_RATIO))       # 32000


def load_audio(path: str, target_sr: int = SAMPLE_RATE) -> torch.Tensor:
    """Load an audio file and return a 1-D mono waveform at ``target_sr``.

    Args:
        path: Path to the audio file (e.g. an ASVspoof 2019 LA .flac clip).
        target_sr: Desired sample rate; the waveform is resampled when the
            file's native rate differs.

    Returns:
        A 1-D tensor of shape ``(num_samples,)``.
    """
    waveform, sr = torchaudio.load(path)
    # Downmix multi-channel audio by averaging across channels.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != target_sr:
        waveform = F.resample(waveform, orig_freq=sr, new_freq=target_sr)
    return waveform.squeeze(0)


def segment_waveform(
    waveform: torch.Tensor,
    window_samples: int = WINDOW_SAMPLES,
    hop_samples: int = HOP_SAMPLES,
) -> List[torch.Tensor]:
    """Split a 1-D waveform into fixed-length overlapping windows.

    Args:
        waveform: 1-D tensor of audio samples.
        window_samples: Length of each output window, in samples.
        hop_samples: Step between consecutive window starts, in samples.

    Returns:
        A list of 1-D tensors, each exactly ``window_samples`` long.
        A clip no longer than one window is returned as a single
        right-zero-padded window; a partial tail is likewise padded.

    Note:
        The final window is emitted as soon as it reaches the last sample.
        This fixes a bug in the previous version: whenever a full window
        ended exactly at the end of the waveform, an extra mostly-zero
        window (whose real samples were already fully covered) was
        appended. Pad buffers are also allocated on ``waveform``'s device,
        so non-CPU inputs no longer mix devices during the copy.
    """

    def _zero_window() -> torch.Tensor:
        # Padded window matching the input's dtype AND device.
        return torch.zeros(
            window_samples, dtype=waveform.dtype, device=waveform.device
        )

    n = waveform.shape[0]
    # Short clip (or empty): one window, zero-padded on the right.
    if n <= window_samples:
        padded = _zero_window()
        padded[:n] = waveform
        return [padded]

    windows: List[torch.Tensor] = []
    start = 0
    while start < n:
        end = start + window_samples
        if end >= n:
            # Last window: take every remaining sample and zero-pad.
            tail = waveform[start:]
            padded = _zero_window()
            padded[: tail.shape[0]] = tail
            windows.append(padded)
            break
        windows.append(waveform[start:end])
        start += hop_samples
    return windows


def preprocess(path: str) -> List[torch.Tensor]:
    """Run the whole pipeline on one file: load, then window.

    Returns the list of fixed-length segments produced by
    ``segment_waveform`` from the waveform loaded by ``load_audio``.
    """
    return segment_waveform(load_audio(path))