Update app.py
Browse files
app.py
CHANGED
|
@@ -2,7 +2,9 @@ import io
|
|
| 2 |
import os
|
| 3 |
import tempfile
|
| 4 |
from pathlib import Path
|
|
|
|
| 5 |
|
|
|
|
| 6 |
BASE_DIR = Path(__file__).resolve().parent
|
| 7 |
CACHE_ROOT = BASE_DIR / ".cache"
|
| 8 |
NUMBA_CACHE_DIR = CACHE_ROOT / "numba"
|
|
@@ -12,14 +14,15 @@ for cache_dir in (NUMBA_CACHE_DIR, MPL_CACHE_DIR):
|
|
| 12 |
os.environ.setdefault("NUMBA_CACHE_DIR", str(NUMBA_CACHE_DIR))
|
| 13 |
os.environ.setdefault("MPLCONFIGDIR", str(MPL_CACHE_DIR))
|
| 14 |
|
|
|
|
| 15 |
import joblib
|
| 16 |
import numpy as np
|
| 17 |
import pandas as pd
|
| 18 |
import streamlit as st
|
| 19 |
|
| 20 |
-
|
|
|
|
| 21 |
from streamlit.web import cli as stcli
|
| 22 |
-
|
| 23 |
import sys
|
| 24 |
|
| 25 |
port = os.environ.get("PORT", "7860")
|
|
@@ -37,39 +40,30 @@ if not st.runtime.exists():.
|
|
| 37 |
]
|
| 38 |
sys.exit(stcli.main())
|
| 39 |
|
|
|
|
| 40 |
import librosa
|
| 41 |
import librosa.display
|
| 42 |
import matplotlib.pyplot as plt
|
| 43 |
|
|
|
|
| 44 |
from features import extract_features
|
| 45 |
from devices import describe_label
|
| 46 |
|
| 47 |
-
|
| 48 |
NOTE_NAMES = [
|
| 49 |
-
"C",
|
| 50 |
-
"C#",
|
| 51 |
-
"D",
|
| 52 |
-
"D#",
|
| 53 |
-
"E",
|
| 54 |
-
"F",
|
| 55 |
-
"F#",
|
| 56 |
-
"G",
|
| 57 |
-
"G#",
|
| 58 |
-
"A",
|
| 59 |
-
"A#",
|
| 60 |
-
"B",
|
| 61 |
]
|
| 62 |
MAJOR_PROFILE = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
|
| 63 |
MINOR_PROFILE = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
|
|
|
|
| 64 |
UPLOAD_DIR = BASE_DIR / "uploads"
|
| 65 |
MODEL_PATH = BASE_DIR / "models" / "model.pkl"
|
| 66 |
ENCODER_PATH = BASE_DIR / "models" / "label_encoder.pkl"
|
| 67 |
-
UPLOAD_DIR.mkdir(exist_ok=True)
|
| 68 |
-
|
| 69 |
|
| 70 |
-
def estimate_scale(y: np.ndarray, sr: int) -> str | None:
|
| 71 |
"""Return a rough musical scale (e.g., 'C major') or None if unclear."""
|
| 72 |
-
|
| 73 |
if y.size == 0:
|
| 74 |
return None
|
| 75 |
chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
|
|
@@ -98,6 +92,7 @@ def estimate_scale(y: np.ndarray, sr: int) -> str | None:
|
|
| 98 |
return f"{NOTE_NAMES[best_major]} major"
|
| 99 |
return f"{NOTE_NAMES[best_minor]} minor"
|
| 100 |
|
|
|
|
| 101 |
st.set_page_config(page_title="Mic-ID (MVP)", layout="centered")
|
| 102 |
st.title("Mic-ID (MVP)")
|
| 103 |
st.caption("Upload ~5s audio - guess the recording device")
|
|
@@ -105,15 +100,14 @@ st.caption("Upload ~5s audio - guess the recording device")
|
|
| 105 |
with st.expander("Training data & devices", expanded=False):
|
| 106 |
st.markdown(
|
| 107 |
"""
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
|
| 112 |
-
|
| 113 |
"""
|
| 114 |
)
|
| 115 |
|
| 116 |
-
|
| 117 |
@st.cache_resource
|
| 118 |
def load_model():
|
| 119 |
try:
|
|
@@ -124,7 +118,6 @@ def load_model():
|
|
| 124 |
st.warning(f"Could not load trained artefacts: {exc}")
|
| 125 |
return None, None
|
| 126 |
|
| 127 |
-
|
| 128 |
clf, le = load_model()
|
| 129 |
topk = None
|
| 130 |
if clf and le is not None:
|
|
@@ -139,7 +132,7 @@ if clf and le is not None:
|
|
| 139 |
)
|
| 140 |
st.caption("The slider above only changes how many ranked predictions you see.")
|
| 141 |
|
| 142 |
-
file = st.file_uploader("Upload WAV/MP3/M4A", type=["wav","mp3","m4a"])
|
| 143 |
|
| 144 |
if file and clf and le is not None:
|
| 145 |
data = file.read()
|
|
@@ -148,6 +141,8 @@ if file and clf and le is not None:
|
|
| 148 |
saved_path = UPLOAD_DIR / renamed_name
|
| 149 |
saved_path.write_bytes(data)
|
| 150 |
st.caption(f"Saved a copy as `{saved_path}`.")
|
|
|
|
|
|
|
| 151 |
try:
|
| 152 |
y, sr = librosa.load(io.BytesIO(data), sr=16000, mono=True)
|
| 153 |
except Exception:
|
|
@@ -156,34 +151,50 @@ if file and clf and le is not None:
|
|
| 156 |
tmp.write(data)
|
| 157 |
tmp.flush()
|
| 158 |
y, sr = librosa.load(tmp.name, sr=16000, mono=True)
|
|
|
|
| 159 |
raw_y = y.copy()
|
| 160 |
rms = np.sqrt(np.mean(raw_y**2)) + 1e-8
|
| 161 |
scale = estimate_scale(raw_y, sr)
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
feats = extract_features(y, 16000).reshape(1, -1)
|
| 164 |
proba = clf.predict_proba(feats)[0]
|
| 165 |
idx = np.argsort(proba)[::-1]
|
|
|
|
| 166 |
st.subheader("Prediction")
|
| 167 |
if scale:
|
| 168 |
st.write(f"Estimated scale: **{scale}** (experimental)")
|
| 169 |
else:
|
| 170 |
st.write("Scale detection: the clip lacked clear musical content, so no scale estimate.")
|
| 171 |
st.write(f"Input loudness (RMS): {20 * np.log10(rms + 1e-12):.1f} dBFS")
|
|
|
|
| 172 |
limit = topk or 3
|
| 173 |
for i in idx[:limit]:
|
| 174 |
label = le.classes_[i]
|
| 175 |
st.write(f"{describe_label(label)} — **{proba[i] * 100:.1f}%**")
|
|
|
|
|
|
|
| 176 |
friendly_index = [describe_label(label) for label in le.classes_]
|
| 177 |
st.bar_chart(pd.Series(proba, index=friendly_index))
|
| 178 |
|
|
|
|
| 179 |
with st.expander("How the model listens", expanded=False):
|
| 180 |
st.markdown(
|
| 181 |
-
"We tidy the audio (level it, pull out key frequencies) and let the classifier score that summary. These charts show the raw waveform and the energy heatmap the model uses to decide."
|
|
|
|
| 182 |
)
|
| 183 |
|
| 184 |
duration = raw_y.size / sr if raw_y.size else 0
|
| 185 |
-
times =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
|
|
|
| 187 |
fig_wave, ax_wave = plt.subplots(figsize=(6, 2))
|
| 188 |
if raw_y.size:
|
| 189 |
ax_wave.plot(times, raw_y, linewidth=0.8, color="#1f77b4")
|
|
@@ -195,15 +206,19 @@ if file and clf and le is not None:
|
|
| 195 |
st.pyplot(fig_wave, use_container_width=True)
|
| 196 |
plt.close(fig_wave)
|
| 197 |
|
|
|
|
| 198 |
mel = librosa.feature.melspectrogram(y=raw_y, sr=sr, n_fft=2048, hop_length=512, n_mels=64)
|
| 199 |
mel_db = librosa.power_to_db(mel, ref=np.max) if mel.size else mel
|
| 200 |
fig_spec, ax_spec = plt.subplots(figsize=(6, 3))
|
| 201 |
if mel.size:
|
| 202 |
-
img = librosa.display.specshow(
|
|
|
|
|
|
|
| 203 |
cbar = fig_spec.colorbar(img, ax=ax_spec, format="%+2.f dB")
|
| 204 |
cbar.set_label("Energy (dB)")
|
| 205 |
ax_spec.set_title("Log-mel spectrogram (what the model summarises)")
|
| 206 |
st.pyplot(fig_spec, use_container_width=True)
|
| 207 |
plt.close(fig_spec)
|
|
|
|
| 208 |
elif file and not clf:
|
| 209 |
st.warning("No trained model found. Run `python train.py` first.")
|
|
|
|
| 2 |
import os
|
| 3 |
import tempfile
|
| 4 |
from pathlib import Path
|
| 5 |
+
from typing import Optional
|
| 6 |
|
| 7 |
+
# ---------- Paths & caches ----------
|
| 8 |
BASE_DIR = Path(__file__).resolve().parent
|
| 9 |
CACHE_ROOT = BASE_DIR / ".cache"
|
| 10 |
NUMBA_CACHE_DIR = CACHE_ROOT / "numba"
|
|
|
|
| 14 |
os.environ.setdefault("NUMBA_CACHE_DIR", str(NUMBA_CACHE_DIR))
|
| 15 |
os.environ.setdefault("MPLCONFIGDIR", str(MPL_CACHE_DIR))
|
| 16 |
|
| 17 |
+
# ---------- Core deps ----------
|
| 18 |
import joblib
|
| 19 |
import numpy as np
|
| 20 |
import pandas as pd
|
| 21 |
import streamlit as st
|
| 22 |
|
| 23 |
+
# If launched as `python app.py`, re-exec under `streamlit run ...`
|
| 24 |
+
if not st.runtime.exists():
|
| 25 |
from streamlit.web import cli as stcli
|
|
|
|
| 26 |
import sys
|
| 27 |
|
| 28 |
port = os.environ.get("PORT", "7860")
|
|
|
|
| 40 |
]
|
| 41 |
sys.exit(stcli.main())
|
| 42 |
|
| 43 |
+
# ---------- Audio & plotting ----------
|
| 44 |
import librosa
|
| 45 |
import librosa.display
|
| 46 |
import matplotlib.pyplot as plt
|
| 47 |
|
| 48 |
+
# ---------- Local modules ----------
|
| 49 |
from features import extract_features
|
| 50 |
from devices import describe_label
|
| 51 |
|
| 52 |
+
# ---------- Music key estimation helpers ----------
|
| 53 |
NOTE_NAMES = [
|
| 54 |
+
"C", "C#", "D", "D#", "E", "F",
|
| 55 |
+
"F#", "G", "G#", "A", "A#", "B",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
]
|
| 57 |
MAJOR_PROFILE = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
|
| 58 |
MINOR_PROFILE = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
|
| 59 |
+
|
| 60 |
UPLOAD_DIR = BASE_DIR / "uploads"
|
| 61 |
MODEL_PATH = BASE_DIR / "models" / "model.pkl"
|
| 62 |
ENCODER_PATH = BASE_DIR / "models" / "label_encoder.pkl"
|
| 63 |
+
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 64 |
|
| 65 |
+
def estimate_scale(y: np.ndarray, sr: int) -> Optional[str]:
|
| 66 |
"""Return a rough musical scale (e.g., 'C major') or None if unclear."""
|
|
|
|
| 67 |
if y.size == 0:
|
| 68 |
return None
|
| 69 |
chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
|
|
|
|
| 92 |
return f"{NOTE_NAMES[best_major]} major"
|
| 93 |
return f"{NOTE_NAMES[best_minor]} minor"
|
| 94 |
|
| 95 |
+
# ---------- UI ----------
|
| 96 |
st.set_page_config(page_title="Mic-ID (MVP)", layout="centered")
|
| 97 |
st.title("Mic-ID (MVP)")
|
| 98 |
st.caption("Upload ~5s audio - guess the recording device")
|
|
|
|
| 100 |
with st.expander("Training data & devices", expanded=False):
|
| 101 |
st.markdown(
|
| 102 |
"""
|
| 103 |
+
- **TAU Urban Acoustic Scenes 2019 Mobile**: 295 parallel scenes where the same moment was captured on three devices – Zoom F8 (device A, clips ending in `-a`), Samsung Galaxy S7 (device B, `-b`), and iPhone SE (device C, `-c`). We only keep folders containing a full `-a/-b/-c` triplet, so each mic has 295 clips.
|
| 104 |
+
- **Local additions**: 4 laptop and 4 iPhone recordings collected with `utils.py` to anchor the classifier on in-house gear.
|
| 105 |
+
- **Features & model**: log-mel + MFCC statistics flow into a histogram-based gradient boosting classifier tuned for this small balanced set.
|
| 106 |
|
| 107 |
+
Want more coverage? Record new clips under `data/<device>/` or export outtakes with `scripts/export_outtakes.py` before retraining via `python train.py`.
|
| 108 |
"""
|
| 109 |
)
|
| 110 |
|
|
|
|
| 111 |
@st.cache_resource
|
| 112 |
def load_model():
|
| 113 |
try:
|
|
|
|
| 118 |
st.warning(f"Could not load trained artefacts: {exc}")
|
| 119 |
return None, None
|
| 120 |
|
|
|
|
| 121 |
clf, le = load_model()
|
| 122 |
topk = None
|
| 123 |
if clf and le is not None:
|
|
|
|
| 132 |
)
|
| 133 |
st.caption("The slider above only changes how many ranked predictions you see.")
|
| 134 |
|
| 135 |
+
file = st.file_uploader("Upload WAV/MP3/M4A", type=["wav", "mp3", "m4a"])
|
| 136 |
|
| 137 |
if file and clf and le is not None:
|
| 138 |
data = file.read()
|
|
|
|
| 141 |
saved_path = UPLOAD_DIR / renamed_name
|
| 142 |
saved_path.write_bytes(data)
|
| 143 |
st.caption(f"Saved a copy as `{saved_path}`.")
|
| 144 |
+
|
| 145 |
+
# Robust librosa load: in-memory first, fall back to temp file for odd formats
|
| 146 |
try:
|
| 147 |
y, sr = librosa.load(io.BytesIO(data), sr=16000, mono=True)
|
| 148 |
except Exception:
|
|
|
|
| 151 |
tmp.write(data)
|
| 152 |
tmp.flush()
|
| 153 |
y, sr = librosa.load(tmp.name, sr=16000, mono=True)
|
| 154 |
+
|
| 155 |
raw_y = y.copy()
|
| 156 |
rms = np.sqrt(np.mean(raw_y**2)) + 1e-8
|
| 157 |
scale = estimate_scale(raw_y, sr)
|
| 158 |
+
|
| 159 |
+
# Simple RMS normalization to a modest level
|
| 160 |
+
y = raw_y * (0.05 / rms)
|
| 161 |
+
|
| 162 |
+
# Features -> classifier
|
| 163 |
feats = extract_features(y, 16000).reshape(1, -1)
|
| 164 |
proba = clf.predict_proba(feats)[0]
|
| 165 |
idx = np.argsort(proba)[::-1]
|
| 166 |
+
|
| 167 |
st.subheader("Prediction")
|
| 168 |
if scale:
|
| 169 |
st.write(f"Estimated scale: **{scale}** (experimental)")
|
| 170 |
else:
|
| 171 |
st.write("Scale detection: the clip lacked clear musical content, so no scale estimate.")
|
| 172 |
st.write(f"Input loudness (RMS): {20 * np.log10(rms + 1e-12):.1f} dBFS")
|
| 173 |
+
|
| 174 |
limit = topk or 3
|
| 175 |
for i in idx[:limit]:
|
| 176 |
label = le.classes_[i]
|
| 177 |
st.write(f"{describe_label(label)} — **{proba[i] * 100:.1f}%**")
|
| 178 |
+
|
| 179 |
+
# Probability bar chart
|
| 180 |
friendly_index = [describe_label(label) for label in le.classes_]
|
| 181 |
st.bar_chart(pd.Series(proba, index=friendly_index))
|
| 182 |
|
| 183 |
+
# ---------- Visual explanation ----------
|
| 184 |
with st.expander("How the model listens", expanded=False):
|
| 185 |
st.markdown(
|
| 186 |
+
"We tidy the audio (level it, pull out key frequencies) and let the classifier score that summary. "
|
| 187 |
+
"These charts show the raw waveform and the energy heatmap the model uses to decide."
|
| 188 |
)
|
| 189 |
|
| 190 |
duration = raw_y.size / sr if raw_y.size else 0
|
| 191 |
+
times = (
|
| 192 |
+
np.linspace(0.0, duration, num=raw_y.size, endpoint=False)
|
| 193 |
+
if raw_y.size
|
| 194 |
+
else np.array([])
|
| 195 |
+
)
|
| 196 |
|
| 197 |
+
# Waveform
|
| 198 |
fig_wave, ax_wave = plt.subplots(figsize=(6, 2))
|
| 199 |
if raw_y.size:
|
| 200 |
ax_wave.plot(times, raw_y, linewidth=0.8, color="#1f77b4")
|
|
|
|
| 206 |
st.pyplot(fig_wave, use_container_width=True)
|
| 207 |
plt.close(fig_wave)
|
| 208 |
|
| 209 |
+
# Log-mel spectrogram
|
| 210 |
mel = librosa.feature.melspectrogram(y=raw_y, sr=sr, n_fft=2048, hop_length=512, n_mels=64)
|
| 211 |
mel_db = librosa.power_to_db(mel, ref=np.max) if mel.size else mel
|
| 212 |
fig_spec, ax_spec = plt.subplots(figsize=(6, 3))
|
| 213 |
if mel.size:
|
| 214 |
+
img = librosa.display.specshow(
|
| 215 |
+
mel_db, sr=sr, hop_length=512, x_axis="time", y_axis="mel", ax=ax_spec
|
| 216 |
+
)
|
| 217 |
cbar = fig_spec.colorbar(img, ax=ax_spec, format="%+2.f dB")
|
| 218 |
cbar.set_label("Energy (dB)")
|
| 219 |
ax_spec.set_title("Log-mel spectrogram (what the model summarises)")
|
| 220 |
st.pyplot(fig_spec, use_container_width=True)
|
| 221 |
plt.close(fig_spec)
|
| 222 |
+
|
| 223 |
elif file and not clf:
|
| 224 |
st.warning("No trained model found. Run `python train.py` first.")
|