connaaa committed on
Commit
8d73c70
·
verified ·
1 Parent(s): 87a9a88

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -30
app.py CHANGED
@@ -2,7 +2,9 @@ import io
2
  import os
3
  import tempfile
4
  from pathlib import Path
 
5
 
 
6
  BASE_DIR = Path(__file__).resolve().parent
7
  CACHE_ROOT = BASE_DIR / ".cache"
8
  NUMBA_CACHE_DIR = CACHE_ROOT / "numba"
@@ -12,14 +14,15 @@ for cache_dir in (NUMBA_CACHE_DIR, MPL_CACHE_DIR):
12
  os.environ.setdefault("NUMBA_CACHE_DIR", str(NUMBA_CACHE_DIR))
13
  os.environ.setdefault("MPLCONFIGDIR", str(MPL_CACHE_DIR))
14
 
 
15
  import joblib
16
  import numpy as np
17
  import pandas as pd
18
  import streamlit as st
19
 
20
- if not st.runtime.exists():.
 
21
  from streamlit.web import cli as stcli
22
-
23
  import sys
24
 
25
  port = os.environ.get("PORT", "7860")
@@ -37,39 +40,30 @@ if not st.runtime.exists():.
37
  ]
38
  sys.exit(stcli.main())
39
 
 
40
  import librosa
41
  import librosa.display
42
  import matplotlib.pyplot as plt
43
 
 
44
  from features import extract_features
45
  from devices import describe_label
46
 
47
-
48
  NOTE_NAMES = [
49
- "C",
50
- "C#",
51
- "D",
52
- "D#",
53
- "E",
54
- "F",
55
- "F#",
56
- "G",
57
- "G#",
58
- "A",
59
- "A#",
60
- "B",
61
  ]
62
  MAJOR_PROFILE = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
63
  MINOR_PROFILE = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
 
64
  UPLOAD_DIR = BASE_DIR / "uploads"
65
  MODEL_PATH = BASE_DIR / "models" / "model.pkl"
66
  ENCODER_PATH = BASE_DIR / "models" / "label_encoder.pkl"
67
- UPLOAD_DIR.mkdir(exist_ok=True)
68
-
69
 
70
- def estimate_scale(y: np.ndarray, sr: int) -> str | None:
71
  """Return a rough musical scale (e.g., 'C major') or None if unclear."""
72
-
73
  if y.size == 0:
74
  return None
75
  chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
@@ -98,6 +92,7 @@ def estimate_scale(y: np.ndarray, sr: int) -> str | None:
98
  return f"{NOTE_NAMES[best_major]} major"
99
  return f"{NOTE_NAMES[best_minor]} minor"
100
 
 
101
  st.set_page_config(page_title="Mic-ID (MVP)", layout="centered")
102
  st.title("Mic-ID (MVP)")
103
  st.caption("Upload ~5s audio - guess the recording device")
@@ -105,15 +100,14 @@ st.caption("Upload ~5s audio - guess the recording device")
105
  with st.expander("Training data & devices", expanded=False):
106
  st.markdown(
107
  """
108
- - **TAU Urban Acoustic Scenes 2019 Mobile**: 295 parallel scenes where the same moment was captured on three devices – Zoom F8 (device A, clips ending in `-a`), Samsung Galaxy S7 (device B, `-b`), and iPhone SE (device C, `-c`). We only keep folders containing a full `-a/-b/-c` triplet, so each mic has 295 clips.
109
- - **Local additions**: 4 laptop and 4 iPhone recordings collected with `utils.py` to anchor the classifier on in-house gear.
110
- - **Features & model**: log-mel + MFCC statistics flow into a histogram-based gradient boosting classifier tuned for this small balanced set.
111
 
112
- Want more coverage? Record new clips under `data/<device>/` or export outtakes with `scripts/export_outtakes.py` before retraining via `python train.py`.
113
  """
114
  )
115
 
116
-
117
  @st.cache_resource
118
  def load_model():
119
  try:
@@ -124,7 +118,6 @@ def load_model():
124
  st.warning(f"Could not load trained artefacts: {exc}")
125
  return None, None
126
 
127
-
128
  clf, le = load_model()
129
  topk = None
130
  if clf and le is not None:
@@ -139,7 +132,7 @@ if clf and le is not None:
139
  )
140
  st.caption("The slider above only changes how many ranked predictions you see.")
141
 
142
- file = st.file_uploader("Upload WAV/MP3/M4A", type=["wav","mp3","m4a"])
143
 
144
  if file and clf and le is not None:
145
  data = file.read()
@@ -148,6 +141,8 @@ if file and clf and le is not None:
148
  saved_path = UPLOAD_DIR / renamed_name
149
  saved_path.write_bytes(data)
150
  st.caption(f"Saved a copy as `{saved_path}`.")
 
 
151
  try:
152
  y, sr = librosa.load(io.BytesIO(data), sr=16000, mono=True)
153
  except Exception:
@@ -156,34 +151,50 @@ if file and clf and le is not None:
156
  tmp.write(data)
157
  tmp.flush()
158
  y, sr = librosa.load(tmp.name, sr=16000, mono=True)
 
159
  raw_y = y.copy()
160
  rms = np.sqrt(np.mean(raw_y**2)) + 1e-8
161
  scale = estimate_scale(raw_y, sr)
162
- y = raw_y * (0.05 / rms) # simple RMS norm
 
 
 
 
163
  feats = extract_features(y, 16000).reshape(1, -1)
164
  proba = clf.predict_proba(feats)[0]
165
  idx = np.argsort(proba)[::-1]
 
166
  st.subheader("Prediction")
167
  if scale:
168
  st.write(f"Estimated scale: **{scale}** (experimental)")
169
  else:
170
  st.write("Scale detection: the clip lacked clear musical content, so no scale estimate.")
171
  st.write(f"Input loudness (RMS): {20 * np.log10(rms + 1e-12):.1f} dBFS")
 
172
  limit = topk or 3
173
  for i in idx[:limit]:
174
  label = le.classes_[i]
175
  st.write(f"{describe_label(label)} — **{proba[i] * 100:.1f}%**")
 
 
176
  friendly_index = [describe_label(label) for label in le.classes_]
177
  st.bar_chart(pd.Series(proba, index=friendly_index))
178
 
 
179
  with st.expander("How the model listens", expanded=False):
180
  st.markdown(
181
- "We tidy the audio (level it, pull out key frequencies) and let the classifier score that summary. These charts show the raw waveform and the energy heatmap the model uses to decide."
 
182
  )
183
 
184
  duration = raw_y.size / sr if raw_y.size else 0
185
- times = np.linspace(0.0, duration, num=raw_y.size, endpoint=False) if raw_y.size else np.array([])
 
 
 
 
186
 
 
187
  fig_wave, ax_wave = plt.subplots(figsize=(6, 2))
188
  if raw_y.size:
189
  ax_wave.plot(times, raw_y, linewidth=0.8, color="#1f77b4")
@@ -195,15 +206,19 @@ if file and clf and le is not None:
195
  st.pyplot(fig_wave, use_container_width=True)
196
  plt.close(fig_wave)
197
 
 
198
  mel = librosa.feature.melspectrogram(y=raw_y, sr=sr, n_fft=2048, hop_length=512, n_mels=64)
199
  mel_db = librosa.power_to_db(mel, ref=np.max) if mel.size else mel
200
  fig_spec, ax_spec = plt.subplots(figsize=(6, 3))
201
  if mel.size:
202
- img = librosa.display.specshow(mel_db, sr=sr, hop_length=512, x_axis="time", y_axis="mel", ax=ax_spec)
 
 
203
  cbar = fig_spec.colorbar(img, ax=ax_spec, format="%+2.f dB")
204
  cbar.set_label("Energy (dB)")
205
  ax_spec.set_title("Log-mel spectrogram (what the model summarises)")
206
  st.pyplot(fig_spec, use_container_width=True)
207
  plt.close(fig_spec)
 
208
  elif file and not clf:
209
  st.warning("No trained model found. Run `python train.py` first.")
 
2
  import os
3
  import tempfile
4
  from pathlib import Path
5
+ from typing import Optional
6
 
7
+ # ---------- Paths & caches ----------
8
  BASE_DIR = Path(__file__).resolve().parent
9
  CACHE_ROOT = BASE_DIR / ".cache"
10
  NUMBA_CACHE_DIR = CACHE_ROOT / "numba"
 
14
  os.environ.setdefault("NUMBA_CACHE_DIR", str(NUMBA_CACHE_DIR))
15
  os.environ.setdefault("MPLCONFIGDIR", str(MPL_CACHE_DIR))
16
 
17
+ # ---------- Core deps ----------
18
  import joblib
19
  import numpy as np
20
  import pandas as pd
21
  import streamlit as st
22
 
23
+ # If launched as `python app.py`, re-exec under `streamlit run ...`
24
+ if not st.runtime.exists():
25
  from streamlit.web import cli as stcli
 
26
  import sys
27
 
28
  port = os.environ.get("PORT", "7860")
 
40
  ]
41
  sys.exit(stcli.main())
42
 
43
+ # ---------- Audio & plotting ----------
44
  import librosa
45
  import librosa.display
46
  import matplotlib.pyplot as plt
47
 
48
+ # ---------- Local modules ----------
49
  from features import extract_features
50
  from devices import describe_label
51
 
52
+ # ---------- Music key estimation helpers ----------
53
  NOTE_NAMES = [
54
+ "C", "C#", "D", "D#", "E", "F",
55
+ "F#", "G", "G#", "A", "A#", "B",
 
 
 
 
 
 
 
 
 
 
56
  ]
57
  MAJOR_PROFILE = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
58
  MINOR_PROFILE = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])
59
+
60
  UPLOAD_DIR = BASE_DIR / "uploads"
61
  MODEL_PATH = BASE_DIR / "models" / "model.pkl"
62
  ENCODER_PATH = BASE_DIR / "models" / "label_encoder.pkl"
63
+ UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
 
64
 
65
+ def estimate_scale(y: np.ndarray, sr: int) -> Optional[str]:
66
  """Return a rough musical scale (e.g., 'C major') or None if unclear."""
 
67
  if y.size == 0:
68
  return None
69
  chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
 
92
  return f"{NOTE_NAMES[best_major]} major"
93
  return f"{NOTE_NAMES[best_minor]} minor"
94
 
95
+ # ---------- UI ----------
96
  st.set_page_config(page_title="Mic-ID (MVP)", layout="centered")
97
  st.title("Mic-ID (MVP)")
98
  st.caption("Upload ~5s audio - guess the recording device")
 
100
  with st.expander("Training data & devices", expanded=False):
101
  st.markdown(
102
  """
103
+ - **TAU Urban Acoustic Scenes 2019 Mobile**: 295 parallel scenes where the same moment was captured on three devices – Zoom F8 (device A, clips ending in `-a`), Samsung Galaxy S7 (device B, `-b`), and iPhone SE (device C, `-c`). We only keep folders containing a full `-a/-b/-c` triplet, so each mic has 295 clips.
104
+ - **Local additions**: 4 laptop and 4 iPhone recordings collected with `utils.py` to anchor the classifier on in-house gear.
105
+ - **Features & model**: log-mel + MFCC statistics flow into a histogram-based gradient boosting classifier tuned for this small balanced set.
106
 
107
+ Want more coverage? Record new clips under `data/<device>/` or export outtakes with `scripts/export_outtakes.py` before retraining via `python train.py`.
108
  """
109
  )
110
 
 
111
  @st.cache_resource
112
  def load_model():
113
  try:
 
118
  st.warning(f"Could not load trained artefacts: {exc}")
119
  return None, None
120
 
 
121
  clf, le = load_model()
122
  topk = None
123
  if clf and le is not None:
 
132
  )
133
  st.caption("The slider above only changes how many ranked predictions you see.")
134
 
135
+ file = st.file_uploader("Upload WAV/MP3/M4A", type=["wav", "mp3", "m4a"])
136
 
137
  if file and clf and le is not None:
138
  data = file.read()
 
141
  saved_path = UPLOAD_DIR / renamed_name
142
  saved_path.write_bytes(data)
143
  st.caption(f"Saved a copy as `{saved_path}`.")
144
+
145
+ # Robust librosa load: in-memory first, fall back to temp file for odd formats
146
  try:
147
  y, sr = librosa.load(io.BytesIO(data), sr=16000, mono=True)
148
  except Exception:
 
151
  tmp.write(data)
152
  tmp.flush()
153
  y, sr = librosa.load(tmp.name, sr=16000, mono=True)
154
+
155
  raw_y = y.copy()
156
  rms = np.sqrt(np.mean(raw_y**2)) + 1e-8
157
  scale = estimate_scale(raw_y, sr)
158
+
159
+ # Simple RMS normalization to a modest level
160
+ y = raw_y * (0.05 / rms)
161
+
162
+ # Features -> classifier
163
  feats = extract_features(y, 16000).reshape(1, -1)
164
  proba = clf.predict_proba(feats)[0]
165
  idx = np.argsort(proba)[::-1]
166
+
167
  st.subheader("Prediction")
168
  if scale:
169
  st.write(f"Estimated scale: **{scale}** (experimental)")
170
  else:
171
  st.write("Scale detection: the clip lacked clear musical content, so no scale estimate.")
172
  st.write(f"Input loudness (RMS): {20 * np.log10(rms + 1e-12):.1f} dBFS")
173
+
174
  limit = topk or 3
175
  for i in idx[:limit]:
176
  label = le.classes_[i]
177
  st.write(f"{describe_label(label)} — **{proba[i] * 100:.1f}%**")
178
+
179
+ # Probability bar chart
180
  friendly_index = [describe_label(label) for label in le.classes_]
181
  st.bar_chart(pd.Series(proba, index=friendly_index))
182
 
183
+ # ---------- Visual explanation ----------
184
  with st.expander("How the model listens", expanded=False):
185
  st.markdown(
186
+ "We tidy the audio (level it, pull out key frequencies) and let the classifier score that summary. "
187
+ "These charts show the raw waveform and the energy heatmap the model uses to decide."
188
  )
189
 
190
  duration = raw_y.size / sr if raw_y.size else 0
191
+ times = (
192
+ np.linspace(0.0, duration, num=raw_y.size, endpoint=False)
193
+ if raw_y.size
194
+ else np.array([])
195
+ )
196
 
197
+ # Waveform
198
  fig_wave, ax_wave = plt.subplots(figsize=(6, 2))
199
  if raw_y.size:
200
  ax_wave.plot(times, raw_y, linewidth=0.8, color="#1f77b4")
 
206
  st.pyplot(fig_wave, use_container_width=True)
207
  plt.close(fig_wave)
208
 
209
+ # Log-mel spectrogram
210
  mel = librosa.feature.melspectrogram(y=raw_y, sr=sr, n_fft=2048, hop_length=512, n_mels=64)
211
  mel_db = librosa.power_to_db(mel, ref=np.max) if mel.size else mel
212
  fig_spec, ax_spec = plt.subplots(figsize=(6, 3))
213
  if mel.size:
214
+ img = librosa.display.specshow(
215
+ mel_db, sr=sr, hop_length=512, x_axis="time", y_axis="mel", ax=ax_spec
216
+ )
217
  cbar = fig_spec.colorbar(img, ax=ax_spec, format="%+2.f dB")
218
  cbar.set_label("Energy (dB)")
219
  ax_spec.set_title("Log-mel spectrogram (what the model summarises)")
220
  st.pyplot(fig_spec, use_container_width=True)
221
  plt.close(fig_spec)
222
+
223
  elif file and not clf:
224
  st.warning("No trained model found. Run `python train.py` first.")