"""Copy a handful of parallel TAU Mobile clips into an outtakes folder for manual testing.

Usage:
    python scripts/export_outtakes.py --count 5 --output data/outtakes

The script searches under the training `data/` tree for clip prefixes that include
all three device suffixes (-a/-b/-c) and copies them (without removing the
originals) to the chosen output directory. Each exported clip retains its
original file name, and a single `manifest.csv` describing the microphone
mapping is written alongside the clips.
"""

from __future__ import annotations

import argparse
import csv
import random
import shutil
from collections import defaultdict
from pathlib import Path


# Device letter found at the end of a clip name (``<prefix>-<letter>.wav``),
# mapped to the ``(friendly_name, description)`` pair for that device.
MIC_SUFFIX_MAP: dict[str, tuple[str, str]] = {
    "a": ("Zoom F8 field recorder", "TAU device A"),
    "b": ("Samsung Galaxy S7", "TAU device B"),
    "c": ("iPhone SE", "TAU device C"),
}


def find_triplets(data_root: Path) -> dict[str, dict[str, Path]]:
    """Collect the clips under *data_root* that exist for every device.

    A clip file must be named ``<prefix>-<device>.wav`` where ``<device>`` is
    one of the keys of ``MIC_SUFFIX_MAP``. Anything else (directories, other
    extensions, names without the ``-`` separator, unknown device letters) is
    ignored.

    Args:
        data_root: Directory that is searched recursively for ``*.wav`` files.

    Returns:
        A mapping ``prefix -> {device: path}`` restricted to the prefixes for
        which a file was found for every device in ``MIC_SUFFIX_MAP``. When
        the same file name occurs in several directories, the path that sorts
        last wins.
    """
    groups: dict[str, dict[str, Path]] = defaultdict(dict)
    # Sort the traversal: rglob order depends on the filesystem, which would
    # make duplicate resolution and per-triplet ordering irreproducible.
    for wav_path in sorted(data_root.rglob("*.wav")):
        # rglob also yields directories, and may match other letter cases of
        # the extension on case-insensitive platforms.
        if wav_path.suffix != ".wav" or not wav_path.is_file():
            continue
        # Require the "-" separator so that e.g. "data.wav" is not mistaken
        # for device "a" of a clip called "da".
        prefix, dash, device = wav_path.stem.rpartition("-")
        if not dash or device not in MIC_SUFFIX_MAP:
            continue
        groups[prefix][device] = wav_path

    required = set(MIC_SUFFIX_MAP)
    return {prefix: mapping for prefix, mapping in groups.items() if set(mapping) == required}


def export_triplets(
    groups: dict[str, dict[str, Path]],
    output_dir: Path,
    count: int,
    seed: int | None,
) -> list[tuple[str, str, Path]]:
    """Copy a random selection of triplets into *output_dir*.

    Args:
        groups: Mapping ``prefix -> {device_suffix: source_path}``.
        output_dir: Destination directory; created (with parents) if missing.
        count: Maximum number of prefixes to export. Values below 1 export
            nothing; values above ``len(groups)`` export every prefix.
        seed: Seed for the selection. The same seed and the same set of
            prefixes always give the same selection; ``None`` lets
            ``random.Random`` seed itself.

    Returns:
        One ``(exported_file_name, device_suffix, source_path)`` tuple per
        exported file, or an empty list when nothing was selected.
    """
    if not groups:
        return []

    rng = random.Random(seed)
    # Shuffle a sorted copy so the selection depends only on the seed and the
    # set of prefixes, not on the insertion order of ``groups``.
    prefixes = sorted(groups)
    rng.shuffle(prefixes)
    # Clamp at zero: a negative count would otherwise turn the slice into
    # "everything except the last |count| prefixes".
    selected = prefixes[: max(0, count)]

    manifest_rows: list[tuple[str, str, Path]] = []
    output_dir.mkdir(parents=True, exist_ok=True)
    for prefix in selected:
        for suffix, path in groups[prefix].items():
            dest = output_dir / path.name
            try:
                shutil.copy2(path, dest)
            except shutil.SameFileError:
                # The source already lives in output_dir (e.g. the output
                # directory is inside the scanned tree and this is a re-run);
                # the file is in place, so there is nothing to copy.
                pass
            manifest_rows.append((dest.name, suffix, path))
    return manifest_rows


def write_manifest(rows: list[tuple[str, str, Path]], manifest_path: Path) -> None:
    """Write a CSV manifest describing the exported files.

    The file has the header ``filename, mic_suffix, friendly_name,
    description, source_path`` followed by one line per entry of *rows*; the
    friendly name and description are looked up in ``MIC_SUFFIX_MAP``.

    Args:
        rows: ``(filename, mic_suffix, source_path)`` tuples. When empty, the
            function returns without creating or touching *manifest_path*.
        manifest_path: Destination file; overwritten if it already exists.

    Raises:
        KeyError: If a row's suffix is not a key of ``MIC_SUFFIX_MAP``.
    """
    if not rows:
        return
    # Explicit UTF-8 so non-ASCII file names do not depend on the platform's
    # locale encoding; newline="" is what the csv module requires.
    with manifest_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["filename", "mic_suffix", "friendly_name", "description", "source_path"])
        for filename, suffix, source in rows:
            friendly, description = MIC_SUFFIX_MAP[suffix]
            writer.writerow([filename, suffix, friendly, description, str(source)])


def main() -> None:
    """Parse the command line, export the selected triplets and write the manifest.

    Raises:
        SystemExit: If ``--count`` is below 1 (argparse usage error), if no
            complete triplet is found under the data root, or if the export
            produced no files.
    """
    parser = argparse.ArgumentParser(description="Export a handful of TAU Mobile triplets for testing.")
    parser.add_argument(
        "--data-root",
        default="data",
        type=Path,
        help="Root directory containing per-device folders (default: data)",
    )
    parser.add_argument("--output", default="data/outtakes", type=Path, help="Where to copy the selected clips")
    parser.add_argument("--count", type=int, default=5, help="Number of triplet prefixes to copy (default: 5)")
    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
    args = parser.parse_args()

    # Reject a useless count before touching the disk; zero or a negative
    # number can never select a triplet.
    if args.count < 1:
        parser.error("--count must be at least 1")

    # NOTE(review): the default output (data/outtakes) is inside the default
    # data root (data), so a re-run also scans the copies made by earlier runs.
    groups = find_triplets(args.data_root)
    if not groups:
        raise SystemExit("No complete triplets (-a/-b/-c) were found under the data root.")

    rows = export_triplets(groups, args.output, args.count, args.seed)
    if not rows:
        raise SystemExit("Triplet export produced no files.")

    write_manifest(rows, args.output / "manifest.csv")
    # One row per device file, so the triplet count follows from the number
    # of devices rather than a hard-coded 3.
    triplet_count = len(rows) // len(MIC_SUFFIX_MAP)
    print(f"Exported {len(rows)} wav files to {args.output} (covering {triplet_count} triplets).")


# Run the exporter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()