"""
Audio synthesis utilities for beat tracking evaluation.

This module provides functions to:
- Generate click sounds for beats and downbeats
- Mix click tracks with original audio
- Save audio files with beat annotations

Example usage:
    from exp.data.audio import create_click_track, mix_audio, save_audio

    # Create click track
    clicks = create_click_track(
        beat_times=pred_beats,
        downbeat_times=pred_downbeats,
        duration=30.0,
        sr=16000
    )

    # Mix with original audio
    mixed = mix_audio(original_audio, clicks, click_volume=0.5)

    # Save to file
    save_audio(mixed, "output.wav", sr=16000)
"""
|
|
| import numpy as np |
| from pathlib import Path |
|
|
|
|
def generate_click(
    frequency: float = 1000.0,
    duration: float = 0.02,
    sr: int = 16000,
    attack: float = 0.002,
    decay: float = 0.018,
) -> np.ndarray:
    """
    Generate a single click sound.

    The click is a sine tone multiplied by a piecewise-linear envelope:
    a ramp from 0 to 1 over the attack, a flat section, and a ramp from
    1 to 0 over the decay.

    Args:
        frequency: Frequency of the click tone in Hz
        duration: Duration of the click in seconds
        sr: Sample rate
        attack: Attack time in seconds (clamped to the click duration)
        decay: Decay time in seconds (clamped to the click duration)

    Returns:
        Click waveform as numpy array of ``int(duration * sr)`` samples
        (empty if that count is not positive)
    """
    n_samples = max(int(duration * sr), 0)
    t = np.arange(n_samples) / sr
    wave = np.sin(2 * np.pi * frequency * t)

    # Clamp the ramps to the click length: an attack longer than the click
    # used to raise a shape-mismatch error, and a decay at least as long as
    # the click used to be dropped silently.
    attack_samples = min(int(attack * sr), n_samples)
    decay_samples = min(int(decay * sr), n_samples)

    envelope = np.ones_like(t)
    if attack_samples > 0:
        envelope[:attack_samples] = np.linspace(0, 1, attack_samples)
    if decay_samples > 0:
        # Multiply rather than assign so that a decay overlapping the attack
        # combines with it instead of overwriting it (which left a jump in
        # amplitude). Where the ramps do not overlap the envelope is still 1,
        # so the result is unchanged.
        envelope[n_samples - decay_samples:] *= np.linspace(1, 0, decay_samples)

    return wave * envelope
|
|
|
|
| def create_click_track( |
| beat_times: list[float] | np.ndarray, |
| downbeat_times: list[float] | np.ndarray | None = None, |
| duration: float | None = None, |
| sr: int = 16000, |
| beat_freq: float = 1000.0, |
| downbeat_freq: float = 1500.0, |
| click_duration: float = 0.03, |
| ) -> np.ndarray: |
| """ |
| Create a click track from beat and downbeat times. |
| |
| Args: |
| beat_times: List of beat times in seconds |
| downbeat_times: List of downbeat times in seconds (optional) |
| duration: Total duration in seconds (auto-detected if None) |
| sr: Sample rate |
| beat_freq: Frequency for beat clicks (Hz) |
| downbeat_freq: Frequency for downbeat clicks (Hz) |
| click_duration: Duration of each click in seconds |
| |
| Returns: |
| Click track as numpy array |
| """ |
| beat_times = np.array(beat_times) if len(beat_times) > 0 else np.array([]) |
| if downbeat_times is not None: |
| downbeat_times = ( |
| np.array(downbeat_times) if len(downbeat_times) > 0 else np.array([]) |
| ) |
| else: |
| downbeat_times = np.array([]) |
|
|
| |
| if duration is None: |
| all_times = np.concatenate([beat_times, downbeat_times]) |
| if len(all_times) == 0: |
| return np.array([]) |
| duration = float(np.max(all_times)) + 1.0 |
|
|
| |
| total_samples = int(duration * sr) |
| output = np.zeros(total_samples, dtype=np.float32) |
|
|
| |
| beat_click = generate_click(frequency=beat_freq, duration=click_duration, sr=sr) |
| downbeat_click = generate_click( |
| frequency=downbeat_freq, duration=click_duration, sr=sr |
| ) |
|
|
| |
| downbeat_set = set(np.round(downbeat_times, 3)) |
|
|
| |
| for t in beat_times: |
| sample_idx = int(t * sr) |
| if sample_idx < 0 or sample_idx >= total_samples: |
| continue |
|
|
| |
| is_downbeat = np.round(t, 3) in downbeat_set |
| click = downbeat_click if is_downbeat else beat_click |
|
|
| |
| end_idx = min(sample_idx + len(click), total_samples) |
| click_len = end_idx - sample_idx |
| output[sample_idx:end_idx] += click[:click_len] |
|
|
| |
| beat_set = set(np.round(beat_times, 3)) |
| for t in downbeat_times: |
| if np.round(t, 3) in beat_set: |
| continue |
|
|
| sample_idx = int(t * sr) |
| if sample_idx < 0 or sample_idx >= total_samples: |
| continue |
|
|
| end_idx = min(sample_idx + len(downbeat_click), total_samples) |
| click_len = end_idx - sample_idx |
| output[sample_idx:end_idx] += downbeat_click[:click_len] |
|
|
| return output |
|
|
|
|
def mix_audio(
    audio: np.ndarray,
    click_track: np.ndarray,
    click_volume: float = 0.5,
) -> np.ndarray:
    """
    Mix original audio with a click track.

    Both signals are zero-padded to the longer length. The audio is
    peak-normalized to 0.8 and the clicks to ``click_volume * 0.8``. If the
    sum peaks above 1.0, it is rescaled to peak at 0.95.

    Args:
        audio: Original audio waveform
        click_track: Click track to overlay
        click_volume: Volume of clicks relative to audio (0.0 to 1.0)

    Returns:
        Mixed audio as float32 array (empty if both inputs are empty)
    """
    max_len = max(len(audio), len(click_track))
    if max_len == 0:
        # Taking .max() of an empty array raises, so return early.
        return np.zeros(0, dtype=np.float32)

    audio_padded = np.zeros(max_len, dtype=np.float32)
    click_padded = np.zeros(max_len, dtype=np.float32)
    audio_padded[: len(audio)] = audio
    click_padded[: len(click_track)] = click_track

    # Peak-normalize each signal; an all-zero signal is left untouched to
    # avoid dividing by zero.
    audio_max = np.abs(audio_padded).max()
    if audio_max > 0:
        audio_padded = audio_padded / audio_max * 0.8

    click_max = np.abs(click_padded).max()
    if click_max > 0:
        click_padded = click_padded / click_max * click_volume * 0.8

    mixed = audio_padded + click_padded

    # Rescale only when the sum would clip.
    max_val = np.abs(mixed).max()
    if max_val > 1.0:
        mixed = mixed / max_val * 0.95

    return mixed.astype(np.float32)
|
|
|
|
def create_comparison_audio(
    audio: np.ndarray,
    pred_beats: list[float],
    pred_downbeats: list[float],
    gt_beats: list[float],
    gt_downbeats: list[float],
    sr: int = 16000,
    click_volume: float = 0.5,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Build three mixes of `audio` for comparison listening.

    Prediction clicks use 1000 Hz beats and 1500 Hz downbeats; ground truth
    clicks use 800 Hz beats and 1200 Hz downbeats. Both click tracks are
    rendered to the same duration as `audio`.

    Args:
        audio: Original audio waveform
        pred_beats: Predicted beat times
        pred_downbeats: Predicted downbeat times
        gt_beats: Ground truth beat times
        gt_downbeats: Ground truth downbeat times
        sr: Sample rate
        click_volume: Volume of clicks

    Returns:
        Tuple of (audio_with_pred_clicks, audio_with_gt_clicks, audio_with_both)
    """
    total_seconds = len(audio) / sr

    def _render(beats, downbeats, beat_hz, downbeat_hz):
        # Both click tracks share the audio's duration and sample rate.
        return create_click_track(
            beats,
            downbeats,
            duration=total_seconds,
            sr=sr,
            beat_freq=beat_hz,
            downbeat_freq=downbeat_hz,
        )

    pred_clicks = _render(pred_beats, pred_downbeats, 1000.0, 1500.0)
    gt_clicks = _render(gt_beats, gt_downbeats, 800.0, 1200.0)

    # Order matters: prediction, ground truth, then both summed.
    overlays = (pred_clicks, gt_clicks, pred_clicks + gt_clicks)
    audio_pred, audio_gt, audio_both = (
        mix_audio(audio, overlay, click_volume) for overlay in overlays
    )
    return audio_pred, audio_gt, audio_both
|
|
|
|
| def save_audio( |
| audio: np.ndarray, |
| path: str | Path, |
| sr: int = 16000, |
| ) -> None: |
| """ |
| Save audio to a WAV file. |
| |
| Args: |
| audio: Audio waveform |
| path: Output file path |
| sr: Sample rate |
| """ |
| import scipy.io.wavfile as wavfile |
|
|
| path = Path(path) |
| path.parent.mkdir(parents=True, exist_ok=True) |
|
|
| |
| audio_int16 = (audio * 32767).astype(np.int16) |
| wavfile.write(str(path), sr, audio_int16) |
|
|
|
|
if __name__ == "__main__":
    # Demo: overlay a click track on a synthetic 220 Hz tone and write the
    # result to a WAV file.
    print("Audio synthesis demo...")

    sr = 16000
    duration = 10.0

    # Stand-in "music": ten seconds of a quiet sine tone.
    time_axis = np.arange(int(duration * sr)) / sr
    music = 0.3 * np.sin(2 * np.pi * 220 * time_axis)

    # A beat every half second, a downbeat every two seconds.
    beats = np.arange(0, duration, 0.5).tolist()
    downbeats = np.arange(0, duration, 2.0).tolist()

    click_track = create_click_track(beats, downbeats, duration=duration, sr=sr)
    mixed = mix_audio(music, click_track, click_volume=0.6)

    print(f"Created mixed audio: {len(mixed)} samples ({len(mixed) / sr:.2f}s)")
    print(f"Beats: {len(beats)}, Downbeats: {len(downbeats)}")

    save_audio(mixed, "/tmp/beat_click_demo.wav", sr=sr)
    print("Saved demo to /tmp/beat_click_demo.wav")
|
|