techfreakworm's picture
feat(audio): wav validation, write helper, mono/16k normalization
ca78147 unverified
"""Audio I/O and validation utilities."""
from __future__ import annotations
import io
from dataclasses import dataclass
import numpy as np
import soundfile as sf
# Shortest clip length, in seconds, that validate_reference_clip accepts.
MIN_DURATION_S = 0.5
# Longest clip length, in seconds, that validate_reference_clip accepts.
MAX_DURATION_S = 60.0
# Lowest sample rate (frames per second) that validate_reference_clip accepts.
MIN_SAMPLE_RATE = 16000
class AudioValidationError(ValueError):
    """Signal that a reference audio clip did not pass validation.

    Being a ``ValueError`` subclass, it is also caught by handlers written
    for ``ValueError``.
    """
@dataclass(frozen=True)
class ClipInfo:
    """Immutable summary of an audio clip's basic properties."""

    # Clip length in seconds.
    duration_s: float
    # Sampling rate of the clip (frames per second).
    sample_rate: int
    # Number of audio channels in the clip.
    channels: int
def validate_reference_clip(wav_bytes: bytes) -> ClipInfo:
    """Check an encoded reference clip against the module's limits.

    The bytes are opened with ``soundfile.SoundFile`` and the sample rate,
    channel count and frame count are read from it. The duration is the
    frame count divided by the sample rate (``0.0`` if the rate is zero).

    Args:
        wav_bytes: Encoded audio file contents.

    Returns:
        A ``ClipInfo`` with the clip's duration, sample rate and channels.

    Raises:
        AudioValidationError: If the data cannot be opened, the sample rate
            is below ``MIN_SAMPLE_RATE``, or the duration is outside
            ``MIN_DURATION_S``..``MAX_DURATION_S``.
    """
    try:
        with sf.SoundFile(io.BytesIO(wav_bytes)) as clip:
            sample_rate, channels, frame_count = (
                clip.samplerate,
                clip.channels,
                clip.frames,
            )
    except Exception as exc:
        # Any failure while opening/reading is reported as a validation error.
        raise AudioValidationError(f"invalid audio format: {exc}") from exc

    if sample_rate:
        duration_s = frame_count / float(sample_rate)
    else:
        duration_s = 0.0

    # Checks run in a fixed order; the first failing one decides the message.
    problem = None
    if sample_rate < MIN_SAMPLE_RATE:
        problem = f"sample rate {sample_rate} below minimum {MIN_SAMPLE_RATE}"
    elif duration_s < MIN_DURATION_S:
        problem = f"clip too short ({duration_s:.2f}s)"
    elif duration_s > MAX_DURATION_S:
        problem = f"clip too long ({duration_s:.2f}s)"

    if problem is not None:
        raise AudioValidationError(problem)

    return ClipInfo(
        duration_s=duration_s,
        sample_rate=sample_rate,
        channels=channels,
    )
def write_wav_bytes(samples: np.ndarray, sample_rate: int) -> bytes:
    """Encode *samples* as an in-memory WAV file with 16-bit PCM samples.

    Args:
        samples: Audio data handed to ``soundfile.write`` unchanged.
        sample_rate: Sample rate recorded in the written file.

    Returns:
        The full contents of the encoded WAV file.
    """
    with io.BytesIO() as sink:
        sf.write(sink, samples, sample_rate, format="WAV", subtype="PCM_16")
        # Read the contents while the buffer is still open.
        return sink.getvalue()
def normalize_to_mono_16k(
    samples: np.ndarray, original_sr: int, target_sr: int = 16000
) -> tuple[np.ndarray, int]:
    """Downmix to mono and linearly resample to ``target_sr``.

    Resampling is plain linear interpolation via ``np.interp``; no filtering
    step is applied before or after it.

    Args:
        samples: 1-D array of samples, or a 2-D array whose second axis is
            averaged away (presumably ``(frames, channels)`` -- confirm
            against callers).
        original_sr: Sample rate of ``samples``.
        target_sr: Desired output sample rate.

    Returns:
        A ``(mono_samples, target_sr)`` tuple where ``mono_samples`` is a
        float32 array. An empty input yields an empty float32 array.
    """
    if samples.ndim == 2:
        samples = samples.mean(axis=1)
    if original_sr == target_sr:
        return samples.astype(np.float32), target_sr

    n_in = samples.shape[0]
    if n_in == 0:
        # np.interp raises ValueError when given no sample points; an empty
        # clip simply resamples to an empty clip.
        return np.zeros(0, dtype=np.float32), target_sr

    duration = n_in / float(original_sr)
    target_len = int(round(duration * target_sr))
    # Both grids are normalised positions in [0, 1); output positions past
    # the last input position take the last input value (np.interp clamps).
    x_old = np.linspace(0.0, 1.0, n_in, endpoint=False)
    x_new = np.linspace(0.0, 1.0, target_len, endpoint=False)
    out = np.interp(x_new, x_old, samples).astype(np.float32)
    return out, target_sr