# mic-id / app.py
# Uploaded by connaaa ("Update app.py", commit 8d73c70, verified) —
# header lines retained from the hosting page as comments so the file parses.
import io
import os
import tempfile
from pathlib import Path
from typing import Optional

# ---------- Paths & caches ----------
# Numba and matplotlib both need a writable cache location; point them at a
# project-local .cache directory so the app also runs where $HOME is read-only.
BASE_DIR = Path(__file__).resolve().parent
CACHE_ROOT = BASE_DIR / ".cache"
NUMBA_CACHE_DIR = CACHE_ROOT / "numba"
MPL_CACHE_DIR = CACHE_ROOT / "matplotlib"

for _cache_path in (NUMBA_CACHE_DIR, MPL_CACHE_DIR):
    _cache_path.mkdir(parents=True, exist_ok=True)

# setdefault: respect values the operator may already have exported.
os.environ.setdefault("NUMBA_CACHE_DIR", str(NUMBA_CACHE_DIR))
os.environ.setdefault("MPLCONFIGDIR", str(MPL_CACHE_DIR))
# ---------- Core deps ----------
import joblib
import numpy as np
import pandas as pd
import streamlit as st

# If launched as `python app.py`, re-exec under `streamlit run ...` so the
# script is always served by the Streamlit runtime (useful in containers
# whose entrypoint is a plain `python app.py`).
if not st.runtime.exists():
    import sys

    from streamlit.web import cli as stcli

    # PORT/HOST let the hosting platform override the bind address.
    port = os.environ.get("PORT", "7860")
    address = os.environ.get("HOST", "0.0.0.0")
    sys.argv = [
        "streamlit",
        "run",
        __file__,
        "--server.port",
        port,
        "--server.address",
        address,
        "--server.headless",
        "true",
    ]
    sys.exit(stcli.main())
# ---------- Audio & plotting ----------
import librosa
import librosa.display
import matplotlib.pyplot as plt
# ---------- Local modules ----------
from features import extract_features
from devices import describe_label
# ---------- Music key estimation helpers ----------
# Pitch-class names indexed 0..11 starting at C; used to label the best
# rotation found by estimate_scale().
NOTE_NAMES = [
"C", "C#", "D", "D#", "E", "F",
"F#", "G", "G#", "A", "A#", "B",
]
# Per-pitch-class weight templates scored against the mean chroma vector in
# estimate_scale(); one rotation per candidate tonic (presumably
# Krumhansl-Kessler-style key profiles — TODO confirm source).
MAJOR_PROFILE = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
MINOR_PROFILE = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
# Where uploaded clips are copied, and where train.py leaves its artefacts.
UPLOAD_DIR = BASE_DIR / "uploads"
MODEL_PATH = BASE_DIR / "models" / "model.pkl"
ENCODER_PATH = BASE_DIR / "models" / "label_encoder.pkl"
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
def estimate_scale(y: np.ndarray, sr: int) -> Optional[str]:
    """Return a rough musical scale (e.g., 'C major') or None if unclear."""
    if y.size == 0:
        return None

    chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
    if chroma.size == 0:
        return None

    # Average chroma over time, then L1-normalise so the template dot
    # products are comparable regardless of clip energy.
    pitch_profile = chroma.mean(axis=1)
    total = np.linalg.norm(pitch_profile, ord=1)
    if total == 0:
        return None
    pitch_profile = pitch_profile / total

    # Score all 12 rotations of each key template against the clip profile.
    major = np.array([np.dot(pitch_profile, np.roll(MAJOR_PROFILE, k)) for k in range(12)])
    minor = np.array([np.dot(pitch_profile, np.roll(MINOR_PROFILE, k)) for k in range(12)])
    maj_idx = int(np.argmax(major))
    min_idx = int(np.argmax(minor))
    maj_best = float(major[maj_idx])
    min_best = float(minor[min_idx])

    # Require a minimal tonal structure to avoid spurious guesses on noise.
    if max(maj_best, min_best) < 0.3:
        return None

    if maj_best >= min_best:
        return f"{NOTE_NAMES[maj_idx]} major"
    return f"{NOTE_NAMES[min_idx]} minor"
# ---------- UI ----------
st.set_page_config(page_title="Mic-ID (MVP)", layout="centered")
st.title("Mic-ID (MVP)")
st.caption("Upload ~5s audio - guess the recording device")

# Collapsible background on the training corpus and model choice.
with st.expander("Training data & devices", expanded=False):
    st.markdown(
        """
- **TAU Urban Acoustic Scenes 2019 Mobile**: 295 parallel scenes where the same moment was captured on three devices – Zoom F8 (device A, clips ending in `-a`), Samsung Galaxy S7 (device B, `-b`), and iPhone SE (device C, `-c`). We only keep folders containing a full `-a/-b/-c` triplet, so each mic has 295 clips.
- **Local additions**: 4 laptop and 4 iPhone recordings collected with `utils.py` to anchor the classifier on in-house gear.
- **Features & model**: log-mel + MFCC statistics flow into a histogram-based gradient boosting classifier tuned for this small balanced set.
Want more coverage? Record new clips under `data/<device>/` or export outtakes with `scripts/export_outtakes.py` before retraining via `python train.py`.
"""
    )
@st.cache_resource
def load_model():
    """Load the classifier and label encoder once per Streamlit process.

    Returns a ``(clf, le)`` tuple, or ``(None, None)`` — with a UI warning —
    when the trained artefacts are missing or unreadable.
    """
    try:
        model = joblib.load(MODEL_PATH)
        encoder = joblib.load(ENCODER_PATH)
    except Exception as exc:
        st.warning(f"Could not load trained artefacts: {exc}")
        return None, None
    return model, encoder
clf, le = load_model()

# How many ranked guesses to display; stays None while no model is loaded.
topk = None
# Compare against None explicitly: the original `if clf and le is not None`
# parsed as `clf and (le is not None)` and relied on estimator truthiness,
# which silently skips this branch for any model object defining
# __bool__/__len__ as falsy.
if clf is not None and le is not None:
    max_classes = max(1, len(le.classes_))
    default_topk = min(3, max_classes)
    topk = st.slider(
        "How many guesses should we list?",
        min_value=1,
        max_value=max_classes,
        value=default_topk,
        help="Slide right to show more of the lower-confidence device guesses.",
    )
    st.caption("The slider above only changes how many ranked predictions you see.")
file = st.file_uploader("Upload WAV/MP3/M4A", type=["wav", "mp3", "m4a"])

# Explicit None checks (the original `file and clf and le is not None` parsed
# as `file and clf and (le is not None)` and depended on object truthiness).
if file is not None and clf is not None and le is not None:
    data = file.read()

    # Keep a server-side copy of every upload for later retraining.
    original_name = Path(file.name or "upload").name
    saved_path = UPLOAD_DIR / f"hooks - {original_name}"
    saved_path.write_bytes(data)
    st.caption(f"Saved a copy as `{saved_path}`.")

    # Robust librosa load: in-memory first, fall back to a temp file for
    # container formats that need a real file path. Everything is resampled
    # to 16 kHz mono to match the training features.
    try:
        y, sr = librosa.load(io.BytesIO(data), sr=16000, mono=True)
    except Exception:
        suffix = os.path.splitext(file.name or "upload")[1] or ".wav"
        with tempfile.NamedTemporaryFile(suffix=suffix) as tmp:
            tmp.write(data)
            tmp.flush()
            y, sr = librosa.load(tmp.name, sr=16000, mono=True)

    # Keep the untouched signal for plotting/scale detection; classify a
    # loudness-normalised copy.
    raw_y = y.copy()
    rms = np.sqrt(np.mean(raw_y**2)) + 1e-8  # epsilon guards divide-by-zero
    scale = estimate_scale(raw_y, sr)

    # Simple RMS normalization to a modest level
    y = raw_y * (0.05 / rms)

    # Features -> classifier
    feats = extract_features(y, 16000).reshape(1, -1)
    proba = clf.predict_proba(feats)[0]
    idx = np.argsort(proba)[::-1]  # class indices, most confident first

    st.subheader("Prediction")
    if scale:
        st.write(f"Estimated scale: **{scale}** (experimental)")
    else:
        st.write("Scale detection: the clip lacked clear musical content, so no scale estimate.")
    st.write(f"Input loudness (RMS): {20 * np.log10(rms + 1e-12):.1f} dBFS")

    # topk is None when the slider never rendered; fall back to 3 guesses.
    limit = topk or 3
    for i in idx[:limit]:
        label = le.classes_[i]
        st.write(f"{describe_label(label)} — **{proba[i] * 100:.1f}%**")

    # Probability bar chart
    friendly_index = [describe_label(label) for label in le.classes_]
    st.bar_chart(pd.Series(proba, index=friendly_index))

    # ---------- Visual explanation ----------
    with st.expander("How the model listens", expanded=False):
        st.markdown(
            "We tidy the audio (level it, pull out key frequencies) and let the classifier score that summary. "
            "These charts show the raw waveform and the energy heatmap the model uses to decide."
        )
        duration = raw_y.size / sr if raw_y.size else 0
        times = (
            np.linspace(0.0, duration, num=raw_y.size, endpoint=False)
            if raw_y.size
            else np.array([])
        )

        # Waveform
        fig_wave, ax_wave = plt.subplots(figsize=(6, 2))
        if raw_y.size:
            ax_wave.plot(times, raw_y, linewidth=0.8, color="#1f77b4")
        ax_wave.set_xlim(0, max(times) if raw_y.size else 0)
        ax_wave.set_title("Waveform (time vs amplitude)")
        ax_wave.set_xlabel("Time (s)")
        ax_wave.set_ylabel("Amplitude")
        ax_wave.grid(alpha=0.2)
        st.pyplot(fig_wave, use_container_width=True)
        plt.close(fig_wave)  # release the figure so reruns don't leak memory

        # Log-mel spectrogram
        mel = librosa.feature.melspectrogram(y=raw_y, sr=sr, n_fft=2048, hop_length=512, n_mels=64)
        mel_db = librosa.power_to_db(mel, ref=np.max) if mel.size else mel
        fig_spec, ax_spec = plt.subplots(figsize=(6, 3))
        if mel.size:
            img = librosa.display.specshow(
                mel_db, sr=sr, hop_length=512, x_axis="time", y_axis="mel", ax=ax_spec
            )
            cbar = fig_spec.colorbar(img, ax=ax_spec, format="%+2.f dB")
            cbar.set_label("Energy (dB)")
        ax_spec.set_title("Log-mel spectrogram (what the model summarises)")
        st.pyplot(fig_spec, use_container_width=True)
        plt.close(fig_spec)
elif file is not None and (clf is None or le is None):
    # Previously only `not clf` was checked, so a loaded classifier with a
    # missing label encoder failed silently; warn in either case.
    st.warning("No trained model found. Run `python train.py` first.")