# deepfake-audio-detector / src/data/preprocessing.py
"""
Audio preprocessing for ASVspoof 2019 LA.
Given a .flac path, produces a list of fixed-length 4-second segments at 16kHz
mono, ready to feed into Wav2Vec 2.0.
Pipeline:
load .flac -> ensure mono -> ensure 16kHz -> window with 50% overlap
(short clips are zero-padded to one full window)
"""
from typing import List
from typing import Optional

import torch
import torchaudio
import torchaudio.functional as F
SAMPLE_RATE = 16000
WINDOW_SECONDS = 4.0
OVERLAP_RATIO = 0.5
WINDOW_SAMPLES = int(SAMPLE_RATE * WINDOW_SECONDS) # 64000
HOP_SAMPLES = int(WINDOW_SAMPLES * (1.0 - OVERLAP_RATIO)) # 32000
def load_audio(path: str, target_sr: int = SAMPLE_RATE) -> torch.Tensor:
"""Load a .flac file and return a 1-D mono waveform at target_sr."""
waveform, sr = torchaudio.load(path)
if waveform.shape[0] > 1:
waveform = waveform.mean(dim=0, keepdim=True)
if sr != target_sr:
waveform = F.resample(waveform, orig_freq=sr, new_freq=target_sr)
return waveform.squeeze(0)
def segment_waveform(
waveform: torch.Tensor,
window_samples: int = WINDOW_SAMPLES,
hop_samples: int = HOP_SAMPLES,
) -> List[torch.Tensor]:
"""Split a 1-D waveform into fixed-length windows with overlap."""
n = waveform.shape[0]
if n <= window_samples:
padded = torch.zeros(window_samples, dtype=waveform.dtype)
padded[:n] = waveform
return [padded]
windows = []
start = 0
while start < n:
end = start + window_samples
if end <= n:
windows.append(waveform[start:end])
else:
tail = waveform[start:]
padded = torch.zeros(window_samples, dtype=waveform.dtype)
padded[:tail.shape[0]] = tail
windows.append(padded)
break
start += hop_samples
return windows
def preprocess(path: str) -> List[torch.Tensor]:
"""Full pipeline: load + window."""
waveform = load_audio(path)
return segment_waveform(waveform)