connork commited on
Commit ·
de9c0fe
0
Parent(s):
Populate Streamlit Space
Browse files- .gitattributes +38 -0
- .gitignore +8 -0
- README.md +150 -0
- app.py +187 -0
- data/audio/airport-helsinki-204-6138-a.wav +3 -0
- data/audio/airport-lisbon-175-4700-a.wav +3 -0
- data/audio2/bus-stockholm-35-1041-a.wav +3 -0
- data/audio2/bus-stockholm-35-1041-b.wav +3 -0
- data/audio9/street_pedestrian-london-149-4500-c.wav +3 -0
- data/iphone/clip_01.wav +3 -0
- data/iphone/clip_05.wav +3 -0
- data/laptop/clip_01.wav +3 -0
- data/laptop/clip_05.wav +3 -0
- data/outtakes/manifest.csv +13 -0
- data/outtakes/metro-paris-50-1524-a.wav +3 -0
- data/outtakes/metro-paris-50-1524-b.wav +3 -0
- data/outtakes/metro-paris-50-1524-c.wav +3 -0
- devices.py +12 -0
- docs/data-sourcing.md +65 -0
- features.py +34 -0
- models/label_encoder.pkl +3 -0
- models/model.pkl +3 -0
- packages.txt +2 -0
- predict.py +86 -0
- reports/confusion_matrix.png +3 -0
- reports/metrics.json +45 -0
- requirements.txt +9 -0
- scripts/export_outtakes.py +94 -0
- train.py +104 -0
- utils.py +21 -0
.gitattributes
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
models/*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
reports/*.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
data/**/*.wav filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
.cache/
|
| 3 |
+
.numba_cache/
|
| 4 |
+
.matplotlib_cache/
|
| 5 |
+
.mpl-cache/
|
| 6 |
+
.DS_Store
|
| 7 |
+
uploads/
|
| 8 |
+
.tmp/
|
README.md
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Mic-ID
|
| 3 |
+
emoji: "🎙️"
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: "1.31.1"
|
| 8 |
+
python_version: "3.10"
|
| 9 |
+
app_file: app.py
|
| 10 |
+
pinned: false
|
| 11 |
+
license: mit
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# Mic-ID
|
| 15 |
+
|
| 16 |
+
 
|
| 17 |
+
|
| 18 |
+
A Streamlit front-end around a microphone fingerprinting baseline: drop in a short clip, get the most likely capture device plus an optional tonal hint. 🎙️ Built for quick lab demos, perfect for showing off how far classic features still go.
|
| 19 |
+
|
| 20 |
+
## Table of Contents
|
| 21 |
+
- [Highlights](#highlights)
|
| 22 |
+
- [Live Demo Flow](#live-demo-flow)
|
| 23 |
+
- [Quick Start](#quick-start)
|
| 24 |
+
- [Controls at a Glance](#controls-at-a-glance)
|
| 25 |
+
- [Device Recognition](#device-recognition)
|
| 26 |
+
- [Scale Detection](#scale-detection)
|
| 27 |
+
- [Bundled Example Clips](#bundled-example-clips)
|
| 28 |
+
- [Download Contents](#download-contents)
|
| 29 |
+
- [Testing](#testing)
|
| 30 |
+
- [Project Layout](#project-layout)
|
| 31 |
+
- [Roadmap](#roadmap)
|
| 32 |
+
- [Contributing](#contributing)
|
| 33 |
+
|
| 34 |
+
## Highlights
|
| 35 |
+
- 🔎 End-to-end workflow for collecting, training, and demoing mic classification in one repo.
|
| 36 |
+
- 🎛️ Feature-first approach: log-mel, MFCC, and spectral stats feed a histogram gradient boosting model.
|
| 37 |
+
- 🧠 Friendly predictions: class IDs map to real device names so you can narrate results without decoding labels.
|
| 38 |
+
- 🗂️ Lightweight artefacts: plain `.wav` folders in `data/`, pickled models in `models/`, metrics and confusion heatmaps in `reports/`.
|
| 39 |
+
- ⚙️ Streamlit UI mirrors the CLI helpers, including loudness normalisation and experimental scale read-outs.
|
| 40 |
+
|
| 41 |
+
## Live Demo Flow
|
| 42 |
+
If you are running a live session, keep this script handy:
|
| 43 |
+
|
| 44 |
+
- 🎧 `streamlit run app.py` from the project root.
|
| 45 |
+
- 📂 Use `data/audio/airport-helsinki-204-6138-a.wav` to introduce the core upload flow and the default top-3 guess list.
|
| 46 |
+
- 🔄 Swap to `data/audio/airport-helsinki-204-6138-b.wav` or `data/audio/airport-helsinki-204-6138-c.wav` to highlight how the twin scene shifts the predicted device while the environment stays constant.
|
| 47 |
+
- 📱 Jump to `data/iphone/clip_05.wav` to show the locally recorded class and talk about adding in-house gear with `utils.py`.
|
| 48 |
+
- 📊 Mention the probability bar chart and the saved copy under `uploads/hooks - <filename>` for later analysis.
|
| 49 |
+
|
| 50 |
+
## Quick Start
|
| 51 |
+
⚡ Four commands set everything up:
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
python3 -m venv .venv
|
| 55 |
+
source .venv/bin/activate
|
| 56 |
+
pip install -r requirements.txt
|
| 57 |
+
python train.py # optional if you want to refresh the model
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
Then launch the app with `streamlit run app.py` (defaults to http://localhost:8501).
|
| 61 |
+
|
| 62 |
+
## Hugging Face Space Setup
|
| 63 |
+
Want a hosted demo? This repo is ready to drop into a Hugging Face Space using the Streamlit SDK. The short version:
|
| 64 |
+
|
| 65 |
+
1. `pip install -U "huggingface_hub[cli]"` and run `huggingface-cli login` with a write-scoped access token.
|
| 66 |
+
2. `git clone` your Space (for example `https://huggingface.co/spaces/connaaa/mic-id`) into an empty folder.
|
| 67 |
+
3. Copy the contents of this repository into that clone, keeping `README.md`, `app.py`, `requirements.txt`, `packages.txt`, `models/`, and the curated `data/` subsets you want online.
|
| 68 |
+
4. Commit and `git push`. The Space will build the dependencies listed in `requirements.txt` plus Debian packages from `packages.txt`.
|
| 69 |
+
|
| 70 |
+
Large training corpora can be trimmed before pushing if you only need the pretrained model for inference.
|
| 71 |
+
|
| 72 |
+
## Controls at a Glance
|
| 73 |
+
| Control | Default | What it does |
|
| 74 |
+
| --- | --- | --- |
|
| 75 |
+
| File uploader | – | Accepts WAV/MP3/M4A, converts to 16 kHz mono, and normalises loudness before scoring. |
|
| 76 |
+
| `How many guesses should we list?` slider | 3 | Sets the length of the ranked prediction list and bar chart. |
|
| 77 |
+
| Training data expander | Collapsed | Recaps which datasets went into the current checkpoint, handy during demos. |
|
| 78 |
+
| Prediction pane | Auto | Shows the tonal estimate (if any), RMS loudness, ranked devices, and probability chart. |
|
| 79 |
+
|
| 80 |
+
Each control includes inline help text so presenters can improvise without notes.
|
| 81 |
+
|
| 82 |
+
## Device Recognition
|
| 83 |
+
- 🧱 Audio flows through `features.extract_features`, stitching log-mel and MFCC statistics with zero-crossing, centroid, roll-off, and flatness cues.
|
| 84 |
+
- 🌲 `train.py` fits a `HistGradientBoostingClassifier`, stratified split, and saves artefacts to `models/model.pkl` plus the label encoder.
|
| 85 |
+
- 📈 Every training run exports `reports/metrics.json` and `reports/confusion_matrix.png` so you can cite precision/recall live.
|
| 86 |
+
- 🏷️ The app and CLI surface friendly names (e.g. “Zoom F8 field recorder”) pulled from `devices.describe_label()` to keep the story human-readable.
|
| 87 |
+
|
| 88 |
+
## Scale Detection
|
| 89 |
+
- 🎼 Uses a simple `librosa` chroma profile match across all major/minor keys.
|
| 90 |
+
- ✅ High confidence (≥ 0.6) renders a green highlight, 0.4–0.6 shows an amber “low confidence” tag, and anything lower hides the scale suggestion entirely.
|
| 91 |
+
- 🥁 Purely percussive or noisy clips skip the tonal hint, which is exactly what you want for location recordings.
|
| 92 |
+
|
| 93 |
+
## Bundled Example Clips
|
| 94 |
+
All sample audio lives under `data/` and mirrors the device IDs referenced in the demo.
|
| 95 |
+
|
| 96 |
+
| Folder | What it represents | Count* |
|
| 97 |
+
| --- | --- | --- |
|
| 98 |
+
| `audio/` | TAU Urban Acoustic Scenes clips (device A) – Zoom F8 field recorder | 3 · demo bundle |
|
| 99 |
+
| `audio2/` | TAU Urban Acoustic Scenes clips (device B) – Samsung Galaxy S7 | 2 · demo bundle |
|
| 100 |
+
| `audio9/` | TAU Urban Acoustic Scenes clips (device C) – iPhone SE | 1 · demo bundle |
|
| 101 |
+
| `iphone/` | Locally recorded iPhone speech snippets captured with `utils.py` | 2 |
|
| 102 |
+
| `laptop/` | MacBook built-in mic samples recorded in a treated room | 2 |
|
| 103 |
+
| `outtakes/` | Extra captures you can promote into training data after curation | 3 · demo bundle |
|
| 104 |
+
|
| 105 |
+
*The Space ships with a travel-sized sample set; pull the full dataset locally if you want to retrain the checkpoint.
|
| 106 |
+
|
| 107 |
+
## Download Contents
|
| 108 |
+
Every run generates artefacts you can drop into a slide deck or share with collaborators:
|
| 109 |
+
|
| 110 |
+
- 🎯 `models/model.pkl` and `models/label_encoder.pkl` store the trained classifier and label map.
|
| 111 |
+
- 📊 `reports/metrics.json` plus `reports/confusion_matrix.png` capture evaluation snapshots for the latest training session.
|
| 112 |
+
- 📁 Uploaded clips are preserved under `uploads/hooks - <original-name>` so you can replay or re-label them later.
|
| 113 |
+
|
| 114 |
+
## Testing
|
| 115 |
+
Quick smoke checks live in the scripts themselves:
|
| 116 |
+
|
| 117 |
+
```bash
|
| 118 |
+
# Rebuild the model and metrics
|
| 119 |
+
python train.py
|
| 120 |
+
|
| 121 |
+
# Score a few clips and verify probabilities look sane
|
| 122 |
+
python predict.py data/laptop/clip_01.wav data/iphone/clip_05.wav --topk 5
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
For deeper regression coverage, wire these commands into your CI and compare the resulting metrics JSON against previous baselines.
|
| 126 |
+
|
| 127 |
+
## Project Layout
|
| 128 |
+
```
|
| 129 |
+
mic-id/
|
| 130 |
+
├─ app.py # Streamlit UI for uploading and scoring clips
|
| 131 |
+
├─ predict.py # CLI scorer with friendly device names
|
| 132 |
+
├─ train.py # Dataset loader, model trainer, metric exporter
|
| 133 |
+
├─ features.py # Audio feature extraction helpers
|
| 134 |
+
├─ utils.py # Command-line recorder for new device samples
|
| 135 |
+
├─ data/ # Per-device waveforms (TAU + local recordings)
|
| 136 |
+
├─ models/ # Saved classifier + label encoder
|
| 137 |
+
├─ reports/ # Metrics JSON and confusion matrix plots
|
| 138 |
+
├─ docs/ # Data sourcing guide and prep notes
|
| 139 |
+
├─ scripts/ # Dataset preparation helpers (TAU, Freesound, etc.)
|
| 140 |
+
└─ uploads/ # Cached demo uploads saved by the Streamlit app
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
## Roadmap
|
| 144 |
+
- 🛰️ Add a lightweight CNN baseline alongside the gradient boosting model for comparison.
|
| 145 |
+
- 🧪 Ship augmentation scripts (noise, EQ, impulse responses) to spotlight microphone colouration differences.
|
| 146 |
+
- 🔐 Bundle provenance metadata (`data/metadata.csv`) and automated integrity checks for new clips.
|
| 147 |
+
- 📦 Polish export helpers so the app can bundle probabilities + features in one download.
|
| 148 |
+
|
| 149 |
+
## Contributing
|
| 150 |
+
Issues and pull requests are welcome. 🤝 If you contribute new devices, include a short note (or a `metadata.csv` entry) describing the capture setup so others can reproduce your results and audit licensing.
|
app.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
import os
|
| 3 |
+
import tempfile
|
| 4 |
+
import joblib
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import streamlit as st
|
| 8 |
+
import librosa
|
| 9 |
+
import librosa.display
|
| 10 |
+
import matplotlib.pyplot as plt
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
BASE_DIR = Path(__file__).resolve().parent
|
| 14 |
+
CACHE_ROOT = BASE_DIR / ".cache"
|
| 15 |
+
NUMBA_CACHE_DIR = CACHE_ROOT / "numba"
|
| 16 |
+
MPL_CACHE_DIR = CACHE_ROOT / "matplotlib"
|
| 17 |
+
for cache_dir in (NUMBA_CACHE_DIR, MPL_CACHE_DIR):
|
| 18 |
+
cache_dir.mkdir(parents=True, exist_ok=True)
|
| 19 |
+
os.environ.setdefault("NUMBA_CACHE_DIR", str(NUMBA_CACHE_DIR))
|
| 20 |
+
os.environ.setdefault("MPLCONFIGDIR", str(MPL_CACHE_DIR))
|
| 21 |
+
|
| 22 |
+
from features import extract_features
|
| 23 |
+
from devices import describe_label
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
NOTE_NAMES = [
|
| 27 |
+
"C",
|
| 28 |
+
"C#",
|
| 29 |
+
"D",
|
| 30 |
+
"D#",
|
| 31 |
+
"E",
|
| 32 |
+
"F",
|
| 33 |
+
"F#",
|
| 34 |
+
"G",
|
| 35 |
+
"G#",
|
| 36 |
+
"A",
|
| 37 |
+
"A#",
|
| 38 |
+
"B",
|
| 39 |
+
]
|
| 40 |
+
MAJOR_PROFILE = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
|
| 41 |
+
MINOR_PROFILE = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
|
| 42 |
+
UPLOAD_DIR = BASE_DIR / "uploads"
|
| 43 |
+
MODEL_PATH = BASE_DIR / "models" / "model.pkl"
|
| 44 |
+
ENCODER_PATH = BASE_DIR / "models" / "label_encoder.pkl"
|
| 45 |
+
UPLOAD_DIR.mkdir(exist_ok=True)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def estimate_scale(y: np.ndarray, sr: int) -> str | None:
    """Guess a coarse musical key such as 'C major'.

    Returns None when the clip is empty, produces no chroma frames, or
    shows no tonal structure worth reporting.
    """
    if y.size == 0:
        return None

    chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
    if chroma.size == 0:
        return None

    profile = chroma.mean(axis=1)
    total = np.linalg.norm(profile, ord=1)
    if total == 0:
        return None
    profile = profile / total

    # Correlate the averaged chroma with every rotation of the major and
    # minor key templates — one rotation per candidate tonic pitch class.
    major_scores = [float(np.dot(profile, np.roll(MAJOR_PROFILE, shift))) for shift in range(12)]
    minor_scores = [float(np.dot(profile, np.roll(MINOR_PROFILE, shift))) for shift in range(12)]

    tonic_major = int(np.argmax(major_scores))
    tonic_minor = int(np.argmax(minor_scores))
    score_major = major_scores[tonic_major]
    score_minor = minor_scores[tonic_minor]

    # Require a minimal tonal structure to avoid spurious guesses on noise.
    if max(score_major, score_minor) < 0.3:
        return None

    if score_major >= score_minor:
        return f"{NOTE_NAMES[tonic_major]} major"
    return f"{NOTE_NAMES[tonic_minor]} minor"
|
| 78 |
+
|
| 79 |
+
st.set_page_config(page_title="Mic-ID (MVP)", layout="centered")
|
| 80 |
+
st.title("Mic-ID (MVP)")
|
| 81 |
+
st.caption("Upload ~5s audio → guess the recording device (demo)")
|
| 82 |
+
|
| 83 |
+
with st.expander("Training data & devices", expanded=False):
|
| 84 |
+
st.markdown(
|
| 85 |
+
"""
|
| 86 |
+
- **TAU Urban Acoustic Scenes 2019 Mobile**: 295 parallel scenes where the same moment was captured on three devices – Zoom F8 (device A, clips ending in `-a`), Samsung Galaxy S7 (device B, `-b`), and iPhone SE (device C, `-c`). We only keep folders containing a full `-a/-b/-c` triplet, so each mic has 295 clips.
|
| 87 |
+
- **Local additions**: 4 laptop and 4 iPhone recordings collected with `utils.py` to anchor the classifier on in-house gear.
|
| 88 |
+
- **Features & model**: log-mel + MFCC statistics flow into a histogram-based gradient boosting classifier tuned for this small balanced set.
|
| 89 |
+
|
| 90 |
+
Want more coverage? Record new clips under `data/<device>/` or export outtakes with `scripts/export_outtakes.py` before retraining via `python train.py`.
|
| 91 |
+
"""
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
@st.cache_resource
def load_model():
    """Load the trained classifier and label encoder from disk.

    Returns (clf, label_encoder) on success, or (None, None) after
    surfacing a Streamlit warning when either pickle cannot be read.
    """
    try:
        artefacts = (joblib.load(MODEL_PATH), joblib.load(ENCODER_PATH))
    except Exception as exc:  # missing file, corrupt pickle, version skew, ...
        st.warning(f"Could not load trained artefacts: {exc}")
        return None, None
    return artefacts
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
clf, le = load_model()
topk = None
# Render the prediction-count slider only when both artefacts loaded.
# The original condition (`if clf and le is not None:`) parsed as
# `clf and (le is not None)` and relied on the classifier being truthy;
# test both objects for identity against None instead.
if clf is not None and le is not None:
    max_classes = max(1, len(le.classes_))  # guard: slider needs max >= min
    default_topk = min(3, max_classes)
    topk = st.slider(
        "How many guesses should we list?",
        min_value=1,
        max_value=max_classes,
        value=default_topk,
        help="Slide right to show more of the lower-confidence device guesses.",
    )
    st.caption("The slider above only changes how many ranked predictions you see.")
|
| 119 |
+
|
| 120 |
+
file = st.file_uploader("Upload WAV/MP3/M4A", type=["wav","mp3","m4a"])
|
| 121 |
+
|
| 122 |
+
if file and clf and le is not None:
|
| 123 |
+
data = file.read()
|
| 124 |
+
original_name = Path(file.name or "upload").name
|
| 125 |
+
renamed_name = f"hooks - {original_name}"
|
| 126 |
+
saved_path = UPLOAD_DIR / renamed_name
|
| 127 |
+
saved_path.write_bytes(data)
|
| 128 |
+
st.caption(f"Saved a copy as `{saved_path}`.")
|
| 129 |
+
try:
|
| 130 |
+
y, sr = librosa.load(io.BytesIO(data), sr=16000, mono=True)
|
| 131 |
+
except Exception:
|
| 132 |
+
suffix = os.path.splitext(file.name or "upload")[1] or ".wav"
|
| 133 |
+
with tempfile.NamedTemporaryFile(suffix=suffix) as tmp:
|
| 134 |
+
tmp.write(data)
|
| 135 |
+
tmp.flush()
|
| 136 |
+
y, sr = librosa.load(tmp.name, sr=16000, mono=True)
|
| 137 |
+
raw_y = y.copy()
|
| 138 |
+
rms = np.sqrt(np.mean(raw_y**2)) + 1e-8
|
| 139 |
+
scale = estimate_scale(raw_y, sr)
|
| 140 |
+
y = raw_y * (0.05 / rms) # simple RMS norm
|
| 141 |
+
feats = extract_features(y, 16000).reshape(1, -1)
|
| 142 |
+
proba = clf.predict_proba(feats)[0]
|
| 143 |
+
idx = np.argsort(proba)[::-1]
|
| 144 |
+
st.subheader("Prediction")
|
| 145 |
+
if scale:
|
| 146 |
+
st.write(f"Estimated scale: **{scale}** (experimental)")
|
| 147 |
+
else:
|
| 148 |
+
st.write("Scale detection: the clip lacked clear musical content, so no scale estimate.")
|
| 149 |
+
st.write(f"Input loudness (RMS): {20 * np.log10(rms + 1e-12):.1f} dBFS")
|
| 150 |
+
limit = topk or 3
|
| 151 |
+
for i in idx[:limit]:
|
| 152 |
+
label = le.classes_[i]
|
| 153 |
+
st.write(f"{describe_label(label)} — **{proba[i] * 100:.1f}%**")
|
| 154 |
+
friendly_index = [describe_label(label) for label in le.classes_]
|
| 155 |
+
st.bar_chart(pd.Series(proba, index=friendly_index))
|
| 156 |
+
|
| 157 |
+
with st.expander("How the model listens", expanded=False):
|
| 158 |
+
st.markdown(
|
| 159 |
+
"We tidy the audio (level it, pull out key frequencies) and let the classifier score that summary. These charts show the raw waveform and the energy heatmap the model uses to decide."
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
duration = raw_y.size / sr if raw_y.size else 0
|
| 163 |
+
times = np.linspace(0.0, duration, num=raw_y.size, endpoint=False) if raw_y.size else np.array([])
|
| 164 |
+
|
| 165 |
+
fig_wave, ax_wave = plt.subplots(figsize=(6, 2))
|
| 166 |
+
if raw_y.size:
|
| 167 |
+
ax_wave.plot(times, raw_y, linewidth=0.8, color="#1f77b4")
|
| 168 |
+
ax_wave.set_xlim(0, max(times) if raw_y.size else 0)
|
| 169 |
+
ax_wave.set_title("Waveform (time vs amplitude)")
|
| 170 |
+
ax_wave.set_xlabel("Time (s)")
|
| 171 |
+
ax_wave.set_ylabel("Amplitude")
|
| 172 |
+
ax_wave.grid(alpha=0.2)
|
| 173 |
+
st.pyplot(fig_wave, use_container_width=True)
|
| 174 |
+
plt.close(fig_wave)
|
| 175 |
+
|
| 176 |
+
mel = librosa.feature.melspectrogram(y=raw_y, sr=sr, n_fft=2048, hop_length=512, n_mels=64)
|
| 177 |
+
mel_db = librosa.power_to_db(mel, ref=np.max) if mel.size else mel
|
| 178 |
+
fig_spec, ax_spec = plt.subplots(figsize=(6, 3))
|
| 179 |
+
if mel.size:
|
| 180 |
+
img = librosa.display.specshow(mel_db, sr=sr, hop_length=512, x_axis="time", y_axis="mel", ax=ax_spec)
|
| 181 |
+
cbar = fig_spec.colorbar(img, ax=ax_spec, format="%+2.f dB")
|
| 182 |
+
cbar.set_label("Energy (dB)")
|
| 183 |
+
ax_spec.set_title("Log-mel spectrogram (what the model summarises)")
|
| 184 |
+
st.pyplot(fig_spec, use_container_width=True)
|
| 185 |
+
plt.close(fig_spec)
|
| 186 |
+
elif file and not clf:
|
| 187 |
+
st.warning("No trained model found. Run `python train.py` first.")
|
data/audio/airport-helsinki-204-6138-a.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db356757394c3ed66d87990ed98a080b1a1a1778aaaffe110b723c9fbd294814
|
| 3 |
+
size 1323044
|
data/audio/airport-lisbon-175-4700-a.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebcea04001ff88fd63af3e76d153ae2d72f3ea65889f3a215c894ca14ce173be
|
| 3 |
+
size 1323044
|
data/audio2/bus-stockholm-35-1041-a.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b76dfe5d2d25b4b7912af63110d11f32623cb8308872cd601cc1fbe6daac8ef8
|
| 3 |
+
size 1323044
|
data/audio2/bus-stockholm-35-1041-b.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9eb1bf29c4055d9b7863f4395b59a177873c1fb00f6e16f026597643f1339742
|
| 3 |
+
size 1323044
|
data/audio9/street_pedestrian-london-149-4500-c.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:42ce95e42426e18ae1f25174147bfa799644a3064dcad7072b744239cef134af
|
| 3 |
+
size 1323044
|
data/iphone/clip_01.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fe9b1dc52cd1eb21550847ba08b2c2ddc79443c378ce88945b55a4de9c3656bf
|
| 3 |
+
size 178178
|
data/iphone/clip_05.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:017691167b2b7e93fe52ce7e643ca76767986c478e4efe4c2a66bbbfaee2c99a
|
| 3 |
+
size 191832
|
data/laptop/clip_01.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f163fd7dc320b3c7ede45104fadff2f90d795f740c9a59156b8cb71613c9f773
|
| 3 |
+
size 160044
|
data/laptop/clip_05.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:56d5a2ca715e1dc1f08f02d619c1a1c770ea60378b721638f9f2aeffb4829233
|
| 3 |
+
size 160044
|
data/outtakes/manifest.csv
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
filename,mic_suffix,friendly_name,description,source_path
|
| 2 |
+
street_pedestrian-prague-203-5268-a.wav,a,Zoom F8 field recorder,TAU device A,data/audio9/street_pedestrian-prague-203-5268-a.wav
|
| 3 |
+
street_pedestrian-prague-203-5268-c.wav,c,iPhone SE,TAU device C,data/audio9/street_pedestrian-prague-203-5268-c.wav
|
| 4 |
+
street_pedestrian-prague-203-5268-b.wav,b,Samsung Galaxy S7,TAU device B,data/audio9/street_pedestrian-prague-203-5268-b.wav
|
| 5 |
+
metro-paris-50-1524-a.wav,a,Zoom F8 field recorder,TAU device A,data/audio2/metro-paris-50-1524-a.wav
|
| 6 |
+
metro-paris-50-1524-c.wav,c,iPhone SE,TAU device C,data/audio2/metro-paris-50-1524-c.wav
|
| 7 |
+
metro-paris-50-1524-b.wav,b,Samsung Galaxy S7,TAU device B,data/audio2/metro-paris-50-1524-b.wav
|
| 8 |
+
airport-london-205-6195-b.wav,b,Samsung Galaxy S7,TAU device B,data/audio/airport-london-205-6195-b.wav
|
| 9 |
+
airport-london-205-6195-c.wav,c,iPhone SE,TAU device C,data/audio/airport-london-205-6195-c.wav
|
| 10 |
+
airport-london-205-6195-a.wav,a,Zoom F8 field recorder,TAU device A,data/audio/airport-london-205-6195-a.wav
|
| 11 |
+
airport-lisbon-175-5340-b.wav,b,Samsung Galaxy S7,TAU device B,data/audio/airport-lisbon-175-5340-b.wav
|
| 12 |
+
airport-lisbon-175-5340-c.wav,c,iPhone SE,TAU device C,data/audio/airport-lisbon-175-5340-c.wav
|
| 13 |
+
airport-lisbon-175-5340-a.wav,a,Zoom F8 field recorder,TAU device A,data/audio/airport-lisbon-175-5340-a.wav
|
data/outtakes/metro-paris-50-1524-a.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e8a15fd1c204f18ff21d997d5e481ef7a79ea99ba9e12d119dce5999a5b8ad28
|
| 3 |
+
size 1323044
|
data/outtakes/metro-paris-50-1524-b.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1632c5c7a8d6a10d20ade4887bfd1544c96256c344da6e93e9d901dbea7830d5
|
| 3 |
+
size 1323044
|
data/outtakes/metro-paris-50-1524-c.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:da89e4129f83d73c648b7b4a9bff36fb594b191b5dc61371991725994af7f2a7
|
| 3 |
+
size 1323044
|
devices.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIC_FRIENDLY_NAMES = {
|
| 2 |
+
"audio": "Zoom F8 field recorder (TAU device A)",
|
| 3 |
+
"audio2": "Samsung Galaxy S7 (TAU device B)",
|
| 4 |
+
"audio9": "iPhone SE (TAU device C)",
|
| 5 |
+
"iphone": "Local iPhone recordings",
|
| 6 |
+
"laptop": "MacBook built-in microphone",
|
| 7 |
+
}
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def describe_label(label: str) -> str:
|
| 11 |
+
"""Return a human-readable microphone description for a raw label."""
|
| 12 |
+
return MIC_FRIENDLY_NAMES.get(label, label)
|
docs/data-sourcing.md
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Data Sourcing Guide
|
| 2 |
+
Mic-ID works best when every class corresponds to a capture device with enough diverse, comparable recordings. The notes below list vetted public corpora and give you recipes for turning them into balanced training data without having to record long sessions yourself.
|
| 3 |
+
|
| 4 |
+
## What to look for
|
| 5 |
+
- **Parallel content**. Prefer datasets where the same scenes were captured on several devices so the label truly reflects the hardware, not the sound source.
|
| 6 |
+
- **Consistent preprocessing**. Resample to 16 kHz mono, trim silence, and loudness-normalize (the repo utilities already normalise to ≈-26 dBFS).
|
| 7 |
+
- **Usage rights**. Check every licence; most corpora below are CC BY or research-only.
|
| 8 |
+
|
| 9 |
+
## Recommended open datasets
|
| 10 |
+
### TAU Urban Acoustic Scenes 2019 Mobile (DCASE 2019 Task 1B)
|
| 11 |
+
- **Why it helps**: Every 10 s clip was recorded in parallel across three devices: Zoom F8 (device A), Samsung Galaxy S7 (device B), and iPhone SE (device C). Treat the device ID as the label.
|
| 12 |
+
- **Download**: Register at https://dcase.community/challenge2019/task-acoustic-scene-classification and grab the "TAU Urban Acoustic Scenes 2019 Mobile" archive (`TAU-urban-acoustic-scenes-2019-mobile-development.zip`).
|
| 13 |
+
- **Command-line** (after approval/network access):
|
| 14 |
+
```bash
|
| 15 |
+
mkdir -p data/raw && cd data/raw
|
| 16 |
+
wget https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-mobile-development.zip
|
| 17 |
+
unzip TAU-urban-acoustic-scenes-2019-mobile-development.zip
|
| 18 |
+
```
|
| 19 |
+
- **Prep**: Parallel recordings are stored under `A/`, `B/`, `C/`. Run the provided helper to convert to 16 kHz mono (create this file if it doesn’t exist yet):
|
| 20 |
+
```bash
|
| 21 |
+
python scripts/prepare_taus_mobile.py --input data/raw/TAU-urban-acoustic-scenes-2019-mobile-development --output data/taus_mobile
|
| 22 |
+
```
|
| 23 |
+
(See the README below for how to organise `scripts/prepare_taus_mobile.py`.)
|
| 24 |
+
- **Licence**: Creative Commons Attribution 4.0.
|
| 25 |
+
|
| 26 |
+
### ASVspoof 2019 – Physical Access subset
|
| 27 |
+
- **Why it helps**: Contains bona fide and replayed speech captured by 26 microphone / recorder pairs (high-quality mics plus several smartphones). Device identity is stored in the metadata (`rec_device`).
|
| 28 |
+
- **Download**: Free for research. Create an account at https://www.asvspoof.org/ and request the 2019 PA subset.
|
| 29 |
+
- **Command-line**:
|
| 30 |
+
```bash
|
| 31 |
+
mkdir -p data/raw && cd data/raw
|
| 32 |
+
wget https://datashare.ed.ac.uk/download/handle/10283/3336/ASVspoof2019_PA_dev.zip
|
| 33 |
+
unzip ASVspoof2019_PA_dev.zip
|
| 34 |
+
```
|
| 35 |
+
(Replace with the exact link you receive; ASVspoof frequently rotates URLs.)
|
| 36 |
+
- **Prep**: Use the protocol files `ASVspoof2019_PA_cm_protocols` to map each wav to its `rec_device`. You can pivot those IDs into the folder names Mic-ID expects:
|
| 37 |
+
```bash
|
| 38 |
+
python scripts/split_by_device.py --metadata data/raw/asvspoof2019/ASVspoof2019_PA_dev_cm_protocols/cm_protocols/PA_dev_cm.txt --audio-root data/raw/asvspoof2019/ASVspoof2019_PA_dev/ASVspoof2019_PA_dev --output-root data/asvspoof_devices
|
| 39 |
+
```
|
| 40 |
+
- **Licence**: Research-only; check that your use case complies.
|
| 41 |
+
|
| 42 |
+
### Freesound + gear metadata
|
| 43 |
+
- **Why it helps**: Many Freesound uploads include `gear` or `recording_device` tags such as "iphone_se" or "zoom_h4n". You can scrape curated lists rather than recording yourself.
|
| 44 |
+
- **Download**: Requires a (free) Freesound API key.
|
| 45 |
+
- **Command-line**:
|
| 46 |
+
```bash
|
| 47 |
+
export FREESOUND_API_KEY=...
|
| 48 |
+
python scripts/freesound_pull.py --query "recording_device:iphone" --max-clips 200 --label iphone
|
| 49 |
+
python scripts/freesound_pull.py --query "recording_device:samsung" --max-clips 200 --label galaxy_s7
|
| 50 |
+
```
|
| 51 |
+
- **Prep**: The script should normalise file names and audio format to match `data/<device>/clip_xx.wav`. Keep a CSV with original URLs for attribution.
|
| 52 |
+
- **Licence**: Clip-specific; many are CC BY or CC0. Honour attribution where required.
|
| 53 |
+
|
| 54 |
+
## Balancing and augmentation tips
|
| 55 |
+
- Aim for 60–100 clips per class before augmentation. Mix quiet/noisy scenes to avoid overfitting.
|
| 56 |
+
- Apply simple augmentations (noise injection, EQ, impulse responses) per device to highlight microphone artefacts rather than content.
|
| 57 |
+
- Track provenance in `data/metadata.csv` (`filename,device,source,licence`).
|
| 58 |
+
- Keep a held-out validation split per device to spot leakage from near-duplicate clips.
|
| 59 |
+
|
| 60 |
+
## Next steps
|
| 61 |
+
1. Implement the helper scripts mentioned above under `scripts/`. Use `librosa`/`soundfile` so prep stays in Python.
|
| 62 |
+
2. Store downloaded archives under `data/raw/` (ignored by git) and export processed clips to `data/<device>/`.
|
| 63 |
+
3. Update `metadata.csv` whenever you add or remove external clips so the experiment log in `reports/` stays reproducible.
|
| 64 |
+
|
| 65 |
+
For more ideas, browse the DCASE and ASVspoof challenge leaderboards—winning teams usually publish their data prep notes and often release additional impulse responses or parallel recordings.
|
features.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np, librosa
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def load_mono(path, sr=16000):
    """Load an audio file as mono at *sr* Hz, trim silence, and RMS-normalise.

    Returns the normalised samples together with the sample rate actually used.
    """
    signal, sr = librosa.load(path, sr=sr, mono=True)
    # Drop leading/trailing quiet sections (anything 30 dB below the peak).
    signal, _ = librosa.effects.trim(signal, top_db=30)
    # Scale so the RMS level lands near 0.05 — a cheap loudness normalisation.
    current_rms = np.sqrt(np.mean(np.square(signal))) + 1e-8
    return signal * (0.05 / current_rms), sr
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def extract_features(x, sr=16000, n_mels=64, n_mfcc=20):
    """Return a fixed-length feature vector summarising the clip *x*.

    Concatenates per-band log-mel statistics, MFCC (+delta, +delta-delta)
    statistics, and four scalar spectral descriptors, so clips of any
    duration map to the same feature dimensionality.
    """
    # Log-mel spectrogram summarised by per-band mean/std over time.
    S = librosa.feature.melspectrogram(y=x, sr=sr, n_mels=n_mels)
    logmel = librosa.power_to_db(S + 1e-9)
    logmel_stats = np.hstack([logmel.mean(axis=1), logmel.std(axis=1)])

    mfcc = librosa.feature.mfcc(S=librosa.power_to_db(S + 1e-9), sr=sr, n_mfcc=n_mfcc)
    frames = mfcc.shape[1]
    # librosa.feature.delta requires an odd window width >= 3 (and not
    # exceeding the number of frames), hence the odd-clamping below.
    width = min(9, frames if frames % 2 else frames - 1)
    if width < 3:
        # Clip too short for delta features — substitute zeros of matching shape.
        d1 = np.zeros_like(mfcc)
        d2 = np.zeros_like(mfcc)
    else:
        d1 = librosa.feature.delta(mfcc, width=width)
        d2 = librosa.feature.delta(mfcc, width=width, order=2)
    mfcc_stats = np.hstack([mfcc.mean(axis=1), mfcc.std(axis=1),
                            d1.mean(axis=1), d1.std(axis=1),
                            d2.mean(axis=1), d2.std(axis=1)])

    # Scalar spectral descriptors, each averaged over time.
    zcr = librosa.feature.zero_crossing_rate(x).mean()
    centroid = librosa.feature.spectral_centroid(y=x, sr=sr).mean()
    rolloff = librosa.feature.spectral_rolloff(y=x, sr=sr).mean()
    flatness = librosa.feature.spectral_flatness(y=x).mean()
    return np.hstack([logmel_stats, mfcc_stats, [zcr, centroid, rolloff, flatness]])
|
models/label_encoder.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b63e81ce06710e7e8cc2dd245a0960912697516459129ce32657a5b0234cbd49
|
| 3 |
+
size 447
|
models/model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9bd34e60ea1f1851d23b6808d4d0ad6ca2a10968322b527dd01f32b4e8761e0b
|
| 3 |
+
size 2006992
|
packages.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
libsndfile1
|
| 2 |
+
ffmpeg
|
predict.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Quick CLI to score audio clips with the trained Mic-ID model."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import io
|
| 8 |
+
import os
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
import joblib
|
| 12 |
+
import librosa
|
| 13 |
+
import numpy as np
|
| 14 |
+
|
| 15 |
+
# Point numba's and matplotlib's on-disk caches at a repo-local directory
# before librosa/matplotlib are imported below.
# NOTE(review): presumably the deployment environment (Streamlit Space)
# cannot write to the default cache locations — confirm before removing.
BASE_DIR = Path(__file__).resolve().parent
CACHE_ROOT = BASE_DIR / ".cache"
NUMBA_CACHE_DIR = CACHE_ROOT / "numba"
MPL_CACHE_DIR = CACHE_ROOT / "matplotlib"
for path in (NUMBA_CACHE_DIR, MPL_CACHE_DIR):
    path.mkdir(parents=True, exist_ok=True)
# setdefault: respect cache locations already configured by the caller.
os.environ.setdefault("NUMBA_CACHE_DIR", str(NUMBA_CACHE_DIR))
os.environ.setdefault("MPLCONFIGDIR", str(MPL_CACHE_DIR))
|
| 23 |
+
|
| 24 |
+
from features import extract_features
|
| 25 |
+
from devices import describe_label
|
| 26 |
+
|
| 27 |
+
MODEL_PATH = Path("models/model.pkl")
|
| 28 |
+
ENCODER_PATH = Path("models/label_encoder.pkl")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def load_model():
    """Load the trained classifier and label encoder from ``models/``.

    Exits with a friendly message when either artefact is missing.
    """
    artefacts_present = MODEL_PATH.exists() and ENCODER_PATH.exists()
    if not artefacts_present:
        raise SystemExit("Trained artefacts not found. Run `python train.py` first.")
    return joblib.load(MODEL_PATH), joblib.load(ENCODER_PATH)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def load_audio(path: Path, sr: int = 16000) -> tuple[np.ndarray, int]:
    """Decode *path* into a mono float waveform resampled to *sr* Hz.

    Plain WAV files go straight to librosa; other formats (.mp3/.m4a) are
    read into an in-memory buffer first, mirroring how the Streamlit app
    handles uploaded files.
    """
    if path.suffix.lower() == ".wav":
        return librosa.load(path, sr=sr, mono=True)
    # fall back to BytesIO so we also support .mp3/.m4a just like the Streamlit app
    buffer = io.BytesIO(path.read_bytes())
    return librosa.load(buffer, sr=sr, mono=True)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def normalise_audio(y: np.ndarray) -> tuple[np.ndarray, float]:
    """Scale *y* so its RMS level sits near 0.05 (cheap loudness normalisation).

    Returns the scaled signal together with the original RMS level, which the
    caller reports as a dBFS figure. (Fix: the annotation previously claimed a
    bare ``np.ndarray`` return, but the function has always returned a 2-tuple.)
    """
    # 1e-8 keeps the division well-defined for all-silent input.
    rms = float(np.sqrt(np.mean(y**2)) + 1e-8)
    return y * (0.05 / rms), rms
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def main() -> None:
    """CLI entry point: print ranked device predictions for each audio file."""
    parser = argparse.ArgumentParser(description="Score WAV/MP3/M4A clips with the Mic-ID classifier.")
    parser.add_argument("paths", nargs="+", type=Path, help="Audio files to score")
    parser.add_argument("--topk", type=int, default=3, help="How many ranked predictions to show per file")
    args = parser.parse_args()

    clf, le = load_model()
    # Clamp top-k into [1, number of known classes].
    topk = max(1, min(args.topk, len(le.classes_)))

    for path in args.paths:
        if not path.exists():
            print(f"[!] Skipping missing file: {path}")
            continue
        try:
            y, sr = load_audio(path)
        except Exception as exc:  # pragma: no cover - friendly CLI message
            print(f"[!] Failed to load {path}: {exc}")
            continue
        # Same RMS normalisation used at training time (see features.load_mono).
        y, rms = normalise_audio(y)
        feats = extract_features(y, sr).reshape(1, -1)
        proba = clf.predict_proba(feats)[0]
        # Class indices sorted by descending probability.
        order = np.argsort(proba)[::-1]
        print(f"\nFile: {path}")
        print(f"RMS loudness: {20 * np.log10(rms + 1e-12):.1f} dBFS")
        for rank, idx in enumerate(order[:topk], start=1):
            label = le.classes_[idx]
            friendly = describe_label(label)
            print(f"  {rank}. {friendly} — {proba[idx] * 100:.1f}%")
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
if __name__ == "__main__":
|
| 86 |
+
main()
|
reports/confusion_matrix.png
ADDED
|
Git LFS Details
|
reports/metrics.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"audio": {
|
| 3 |
+
"precision": 0.9726027397260274,
|
| 4 |
+
"recall": 0.9594594594594594,
|
| 5 |
+
"f1-score": 0.9659863945578231,
|
| 6 |
+
"support": 74.0
|
| 7 |
+
},
|
| 8 |
+
"audio2": {
|
| 9 |
+
"precision": 0.9864864864864865,
|
| 10 |
+
"recall": 0.9864864864864865,
|
| 11 |
+
"f1-score": 0.9864864864864865,
|
| 12 |
+
"support": 74.0
|
| 13 |
+
},
|
| 14 |
+
"audio9": {
|
| 15 |
+
"precision": 0.9605263157894737,
|
| 16 |
+
"recall": 0.9864864864864865,
|
| 17 |
+
"f1-score": 0.9733333333333334,
|
| 18 |
+
"support": 74.0
|
| 19 |
+
},
|
| 20 |
+
"iphone": {
|
| 21 |
+
"precision": 1.0,
|
| 22 |
+
"recall": 0.75,
|
| 23 |
+
"f1-score": 0.8571428571428571,
|
| 24 |
+
"support": 4.0
|
| 25 |
+
},
|
| 26 |
+
"laptop": {
|
| 27 |
+
"precision": 0.6666666666666666,
|
| 28 |
+
"recall": 0.6666666666666666,
|
| 29 |
+
"f1-score": 0.6666666666666666,
|
| 30 |
+
"support": 3.0
|
| 31 |
+
},
|
| 32 |
+
"accuracy": 0.9694323144104804,
|
| 33 |
+
"macro avg": {
|
| 34 |
+
"precision": 0.9172564417337309,
|
| 35 |
+
"recall": 0.8698198198198199,
|
| 36 |
+
"f1-score": 0.8899231476374334,
|
| 37 |
+
"support": 229.0
|
| 38 |
+
},
|
| 39 |
+
"weighted avg": {
|
| 40 |
+
"precision": 0.969657424053044,
|
| 41 |
+
"recall": 0.9694323144104804,
|
| 42 |
+
"f1-score": 0.969162582063393,
|
| 43 |
+
"support": 229.0
|
| 44 |
+
}
|
| 45 |
+
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
librosa
|
| 3 |
+
soundfile
|
| 4 |
+
sounddevice
|
| 5 |
+
scikit-learn
|
| 6 |
+
numpy
|
| 7 |
+
pandas
|
| 8 |
+
matplotlib
|
| 9 |
+
joblib
|
scripts/export_outtakes.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Copy a handful of parallel TAU Mobile clips into an outtakes folder for manual testing.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
python scripts/export_outtakes.py --count 5 --output data/outtakes
|
| 6 |
+
|
| 7 |
+
The script searches under the training `data/` tree for clip prefixes that include
|
| 8 |
+
all three device suffixes (-a/-b/-c) and copies them (without removal) to the
|
| 9 |
+
chosen output directory. Each exported clip retains its original file name and
|
| 10 |
+
is accompanied by a `manifest.csv` describing the microphone mapping.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
import argparse
|
| 16 |
+
import csv
|
| 17 |
+
import random
|
| 18 |
+
import shutil
|
| 19 |
+
from collections import defaultdict
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
|
| 22 |
+
MIC_SUFFIX_MAP = {
|
| 23 |
+
"a": ("Zoom F8 field recorder", "TAU device A"),
|
| 24 |
+
"b": ("Samsung Galaxy S7", "TAU device B"),
|
| 25 |
+
"c": ("iPhone SE", "TAU device C"),
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def find_triplets(data_root: Path):
    """Group parallel TAU recordings by clip prefix.

    Scans *data_root* recursively for files named ``<prefix>-<suffix>.wav``
    where the suffix is a TAU device letter (a/b/c), and returns only the
    prefixes for which all three device recordings are present.
    """
    groups: dict[str, dict[str, Path]] = defaultdict(dict)
    for wav_path in data_root.rglob("*.wav"):
        name = wav_path.name
        if len(name) < 7:
            # Too short to hold "<prefix>-<suffix>.wav" with a non-empty prefix.
            continue
        suffix = name[-5]
        if suffix not in MIC_SUFFIX_MAP:
            continue
        if name[-6] != "-":
            # Fix: previously any separator slipped through (e.g. "clip_a.wav"),
            # although the documented convention is a hyphen-separated -a/-b/-c.
            continue
        prefix = name[:-6]
        groups[prefix][suffix] = wav_path
    # Keep only groups with all three devices
    return {prefix: mapping for prefix, mapping in groups.items() if set(mapping) == set(MIC_SUFFIX_MAP)}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def export_triplets(groups: dict[str, dict[str, Path]], output_dir: Path, count: int, seed: int | None) -> list[tuple[str, str, Path]]:
|
| 45 |
+
if not groups:
|
| 46 |
+
return []
|
| 47 |
+
rng = random.Random(seed)
|
| 48 |
+
prefixes = sorted(groups)
|
| 49 |
+
rng.shuffle(prefixes)
|
| 50 |
+
selected = prefixes[: min(count, len(prefixes))]
|
| 51 |
+
manifest_rows: list[tuple[str, str, Path]] = []
|
| 52 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 53 |
+
for prefix in selected:
|
| 54 |
+
mapping = groups[prefix]
|
| 55 |
+
for suffix, path in mapping.items():
|
| 56 |
+
dest = output_dir / path.name
|
| 57 |
+
shutil.copy2(path, dest)
|
| 58 |
+
manifest_rows.append((dest.name, suffix, path))
|
| 59 |
+
return manifest_rows
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def write_manifest(rows: list[tuple[str, str, Path]], manifest_path: Path) -> None:
    """Write a CSV describing each exported clip's microphone mapping.

    Does nothing when there are no rows to record.
    """
    if not rows:
        return
    header = ["filename", "mic_suffix", "friendly_name", "description", "source_path"]
    with manifest_path.open("w", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        for filename, suffix, source in rows:
            friendly, description = MIC_SUFFIX_MAP[suffix]
            writer.writerow([filename, suffix, friendly, description, str(source)])
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def main() -> None:
    """CLI entry point: pick triplets under the data root, copy them, write a manifest."""
    parser = argparse.ArgumentParser(description="Export a handful of TAU Mobile triplets for testing.")
    parser.add_argument("--data-root", default="data", type=Path, help="Root directory containing per-device folders (default: data)")
    parser.add_argument("--output", default="data/outtakes", type=Path, help="Where to copy the selected clips")
    parser.add_argument("--count", type=int, default=5, help="Number of triplet prefixes to copy (default: 5)")
    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
    args = parser.parse_args()

    triplets = find_triplets(args.data_root)
    if not triplets:
        raise SystemExit("No complete triplets (-a/-b/-c) were found under the data root.")

    exported = export_triplets(triplets, args.output, args.count, args.seed)
    if not exported:
        raise SystemExit("Triplet export produced no files. Try lowering --count.")

    write_manifest(exported, args.output / "manifest.csv")
    print(f"Exported {len(exported)} wav files to {args.output} (covering {len(exported) // 3} triplets).")
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
if __name__ == "__main__":
|
| 94 |
+
main()
|
train.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import glob
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
BASE_DIR = Path(__file__).resolve().parent
|
| 7 |
+
CACHE_ROOT = BASE_DIR / ".cache"
|
| 8 |
+
NUMBA_CACHE_DIR = CACHE_ROOT / "numba"
|
| 9 |
+
MPL_CACHE_DIR = CACHE_ROOT / "matplotlib"
|
| 10 |
+
for path in (NUMBA_CACHE_DIR, MPL_CACHE_DIR):
|
| 11 |
+
path.mkdir(parents=True, exist_ok=True)
|
| 12 |
+
os.environ.setdefault("NUMBA_CACHE_DIR", str(NUMBA_CACHE_DIR))
|
| 13 |
+
os.environ.setdefault("MPLCONFIGDIR", str(MPL_CACHE_DIR))
|
| 14 |
+
|
| 15 |
+
import numpy as np
|
| 16 |
+
import matplotlib
|
| 17 |
+
matplotlib.use("Agg", force=True)
|
| 18 |
+
import matplotlib.pyplot as plt
|
| 19 |
+
from sklearn.ensemble import HistGradientBoostingClassifier
|
| 20 |
+
from sklearn.preprocessing import LabelEncoder
|
| 21 |
+
from sklearn.metrics import classification_report, confusion_matrix
|
| 22 |
+
from sklearn.model_selection import train_test_split
|
| 23 |
+
import joblib
|
| 24 |
+
|
| 25 |
+
from features import load_mono, extract_features
|
| 26 |
+
|
| 27 |
+
DATA_DIR, MODEL_DIR, REPORT_DIR = "data", "models", "reports"
|
| 28 |
+
os.makedirs(MODEL_DIR, exist_ok=True); os.makedirs(REPORT_DIR, exist_ok=True)
|
| 29 |
+
|
| 30 |
+
IGNORED_DEVICES = {"outtakes"}
|
| 31 |
+
SUFFIX_TO_DEVICE = {
|
| 32 |
+
"a": "audio",
|
| 33 |
+
"b": "audio2",
|
| 34 |
+
"c": "audio9",
|
| 35 |
+
}
|
| 36 |
+
TAU_DEVICE_DIRS = set(SUFFIX_TO_DEVICE.values())
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def resolve_device_label(device_dir: str, wav_path: str) -> str:
    """Infer the device label for *wav_path*.

    TAU scenes live under per-device directories, yet each folder still holds
    the parallel ``-a/-b/-c`` recordings, so the directory name alone would
    mislabel them. For TAU folders the filename suffix wins; anything else
    (e.g. locally recorded additions) keeps its directory name as the label.
    """
    if device_dir not in TAU_DEVICE_DIRS:
        return device_dir
    stem = Path(wav_path).stem
    suffix = stem.rsplit("-", 1)[-1] if "-" in stem else None
    return SUFFIX_TO_DEVICE.get(suffix, device_dir)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def load_dataset():
    """Walk ``data/<device>/*.wav`` and return (features, labels) as arrays.

    Hidden directories and the ``outtakes`` holdout folder are skipped, and
    parallel TAU recordings that appear under several device folders are
    de-duplicated by (filename, resolved label).
    """
    X, y = [], []
    seen: set[tuple[str, str]] = set()
    for device in sorted(
        d for d in os.listdir(DATA_DIR)
        if os.path.isdir(os.path.join(DATA_DIR, d))
        and not d.startswith(".")
        and d not in IGNORED_DEVICES
    ):
        for wav in glob.glob(os.path.join(DATA_DIR, device, "*.wav")):
            # Trust the filename suffix over the folder name for TAU clips.
            label = resolve_device_label(device, wav)
            key = (os.path.basename(wav), label)
            if key in seen:
                continue
            seen.add(key)
            x, sr = load_mono(wav); feats = extract_features(x, sr)
            X.append(feats); y.append(label)
    return np.array(X), np.array(y)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
if __name__ == "__main__":
    X, y = load_dataset()
    le = LabelEncoder(); y_enc = le.fit_transform(y)
    # Stratified hold-out split so every device keeps its class balance.
    Xtr, Xte, ytr, yte = train_test_split(X, y_enc, test_size=0.25, stratify=y_enc, random_state=42)

    clf = HistGradientBoostingClassifier(max_depth=10, max_iter=400, learning_rate=0.08, random_state=42)
    clf.fit(Xtr, ytr); yhat = clf.predict(Xte)

    # Persist per-class precision/recall/F1 for reports/metrics.json.
    report = classification_report(yte, yhat, target_names=le.classes_, output_dict=True)
    with open(os.path.join(REPORT_DIR, "metrics.json"), "w") as f: json.dump(report, f, indent=2)

    # Row-normalised confusion matrix rendered as an annotated heatmap.
    cm = confusion_matrix(yte, yhat, normalize="true")
    fig, ax = plt.subplots(figsize=(5,4)); im = ax.imshow(cm, cmap="Blues")
    ax.set_xticks(range(len(le.classes_))); ax.set_xticklabels(le.classes_, rotation=45, ha="right")
    ax.set_yticks(range(len(le.classes_))); ax.set_yticklabels(le.classes_)
    for i in range(len(le.classes_)):
        for j in range(len(le.classes_)):
            ax.text(j, i, f"{cm[i,j]:.2f}", ha="center", va="center", fontsize=8)
    ax.set_title("Confusion (normalized)"); fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04); fig.tight_layout()
    fig.savefig(os.path.join(REPORT_DIR, "confusion_matrix.png"), dpi=160)

    # NOTE(review): clears a private scikit-learn attribute before pickling —
    # presumably to keep the artefact loadable across versions; confirm this
    # is still necessary/safe on library upgrades.
    if hasattr(clf, "_feature_subsample_rng"):
        clf._feature_subsample_rng = None

    joblib.dump(clf, os.path.join(MODEL_DIR, "model.pkl"))
    joblib.dump(le, os.path.join(MODEL_DIR, "label_encoder.pkl"))
    print("Saved model + reports.")
|
utils.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, argparse, numpy as np, sounddevice as sd, soundfile as sf, time
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def record_clips(device_name: str, count: int, seconds: float, sr: int = 16000):
    """Record *count* clips of *seconds* each from the default input device.

    Clips are written to ``data/<device_name>/clip_NN.wav`` at *sr* Hz;
    Ctrl-C ends the session cleanly.
    """
    outdir = os.path.join("data", device_name)
    os.makedirs(outdir, exist_ok=True)
    print(f"Recording {count} clips x {seconds}s to {outdir} @ {sr}Hz. Ctrl-C to stop.")
    try:
        for clip_no in range(1, count + 1):
            print(f"[{clip_no}/{count}] Starts in 1s… speak normally.")
            time.sleep(1)
            # Blocking capture: request the samples, then wait for the buffer.
            buffer = sd.rec(int(seconds * sr), samplerate=sr, channels=1, dtype="float32")
            sd.wait()
            samples = buffer.squeeze()
            # Report RMS so the operator can sanity-check the input level.
            rms = float(np.sqrt(np.mean(samples**2)) + 1e-8)
            print(f"RMS {rms:.4f}")
            sf.write(os.path.join(outdir, f"clip_{clip_no:02d}.wav"), samples, sr)
    except KeyboardInterrupt:
        print("\nStopped.")
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
if __name__ == "__main__":
    # CLI: `python utils.py --device laptop --count 15 --seconds 5`
    ap = argparse.ArgumentParser()
    ap.add_argument("--device", required=True); ap.add_argument("--count", type=int, default=15)
    ap.add_argument("--seconds", type=float, default=5.0)
    args = ap.parse_args(); record_clips(args.device, args.count, args.seconds)
|