# audio_analyzer/spectral.py
# Last commit: "Update spectral.py" (27b0097, verified) -- Mr7Explorer
import numpy as np
import librosa
import scipy.signal as sps
def compute_spectral_analysis(
    y,
    sr,
    n_fft=4096,
    *,
    hf_floor_db=60.0,
    brick_wall_drop_db=20.0,
    notch_min_depth_db=15.0,
    notch_min_freq=100.0,
):
    """Comprehensive spectral analysis tuned for speech QC.

    A Hann-windowed STFT is taken with a hop of ``n_fft // 4`` and summarised
    per frequency bin with the 90th percentile over frames, once in the power
    domain (energy distribution / roll-off) and once in dB (high-frequency
    extent, brick-wall and notch detection).

    Args:
        y: Audio samples handed to ``librosa.stft``.  The per-bin statistics
            reduce over ``axis=1`` of the spectrogram, so a 1-D (mono) signal
            is assumed -- NOTE(review): confirm callers never pass
            multi-channel audio.
        sr: Sample rate in Hz; bin frequencies span ``0 .. sr / 2``.
        n_fft: FFT size for the STFT.
        hf_floor_db: A bin counts as "active" when its dB envelope is within
            this many dB of the spectrogram peak (default 60).
        brick_wall_drop_db: Minimum drop in dB between two adjacent bins of
            the dB envelope that is reported as a brick wall (default 20).
        notch_min_depth_db: Minimum depth in dB, relative to the highest
            smoothed value within 6 bins on either side, for a local minimum
            to be reported as a notch (default 15).
        notch_min_freq: Notches at or below this frequency in Hz are ignored
            (default 100).

    Returns:
        dict with the keys:

        * ``S_db`` -- magnitude spectrogram in dB relative to its maximum.
        * ``freqs`` -- centre frequency of every spectrogram row.
        * ``hop_length``, ``n_fft`` -- the STFT parameters used.
        * ``rolloff_85pct``, ``rolloff_95pct`` -- frequency below which 85 % /
          95 % of the percentile energy envelope lies.
        * ``highest_freq_minus60db`` -- highest active frequency (see
          ``hf_floor_db``), ``0.0`` when no bin is active.
        * ``energy_distribution`` -- percentage of envelope energy per band.
        * ``brick_wall_detected``, ``brick_wall_freq`` -- whether a steep drop
          was found and the frequency of the bin just before the first one
          (``None`` when absent).
        * ``spectral_notches`` -- list of ``{"freq", "depth_db"}`` dicts.
        * ``spectral_centroid``, ``spectral_bandwidth``, ``spectral_flatness``,
          ``spectral_rolloff`` -- frame-averaged librosa descriptors.
        * ``hf_env`` -- the per-bin 90th-percentile dB envelope.
        * ``lf_env`` -- the first 200 bins of that envelope.
    """
    hop_length = n_fft // 4

    # ------------------------------------------------------------------
    # STFT -> magnitude and dB (0 dB == loudest time/frequency cell)
    # ------------------------------------------------------------------
    S = np.abs(
        librosa.stft(y, n_fft=n_fft, hop_length=hop_length, window="hann")
    )
    freqs = np.linspace(0, sr / 2, S.shape[0])
    S_db = librosa.amplitude_to_db(S, ref=np.max)

    # ------------------------------------------------------------------
    # 90th-percentile power envelope and cumulative-energy roll-off
    # ------------------------------------------------------------------
    # The tiny offset keeps total_energy strictly positive so the percentage
    # computations below never divide by zero.
    energy = np.percentile(S ** 2, 90, axis=1) + 1e-20
    total_energy = float(np.sum(energy))
    cum_energy = np.cumsum(energy)
    last_bin = len(freqs) - 1

    def rolloff_freq(fraction):
        """Frequency at which the cumulative envelope energy reaches *fraction*."""
        idx = np.searchsorted(cum_energy, fraction * total_energy)
        # searchsorted may return len(cum_energy) through rounding; clamp it.
        return float(freqs[min(idx, last_bin)])

    freq_at_85 = rolloff_freq(0.85)
    freq_at_95 = rolloff_freq(0.95)

    # ------------------------------------------------------------------
    # High-frequency extent from the 90th-percentile dB envelope
    # ------------------------------------------------------------------
    env_db = np.percentile(S_db, 90, axis=1)
    peak_db = float(np.max(S_db))
    active_bins = np.where(env_db > peak_db - hf_floor_db)[0]
    highest_freq = float(freqs[active_bins[-1]]) if active_bins.size else 0.0

    # ------------------------------------------------------------------
    # Speech-centric band energy distribution
    # ------------------------------------------------------------------
    def band_energy(low, high=None):
        """Percentage of envelope energy in [low, high); open-ended if high is None."""
        start = np.searchsorted(freqs, low)
        stop = len(freqs) if high is None else np.searchsorted(freqs, high)
        return float(100 * np.sum(energy[start:stop]) / total_energy)

    energy_stats = {
        "below_100hz": band_energy(0, 100),
        "100_500hz": band_energy(100, 500),
        "500_2khz": band_energy(500, 2000),
        "2k_8khz": band_energy(2000, 8000),
        "8k_12khz": band_energy(8000, 12000),
        "12k_16khz": band_energy(12000, 16000),
        "above_16khz": band_energy(16000),
    }

    # ------------------------------------------------------------------
    # Brick-wall detection: a steep fall between two adjacent bins
    # ------------------------------------------------------------------
    big_drop_idx = np.where(np.diff(env_db) < -brick_wall_drop_db)[0]
    brick_wall = bool(big_drop_idx.size)
    brick_freq = float(freqs[big_drop_idx[0]]) if brick_wall else None

    # ------------------------------------------------------------------
    # Spectral notch detection on the median-filtered envelope
    # ------------------------------------------------------------------
    smooth = sps.medfilt(env_db, kernel_size=9)
    # A median filter repeats input values, so the bottom of a dip is often a
    # run of equal samples.  find_peaks on the negated curve reports strict
    # minima *and* the midpoint of such flat minima, which a strict
    # "less than both neighbours" test would miss.
    minima, _ = sps.find_peaks(-smooth)

    notches = []
    for m in minima:
        left = smooth[max(0, m - 6):m]
        right = smooth[m + 1:min(len(smooth), m + 7)]
        neighbor_peak = max(
            left.max() if left.size else -999,
            right.max() if right.size else -999,
        )
        depth = neighbor_peak - smooth[m]
        if depth >= notch_min_depth_db and freqs[m] > notch_min_freq:
            notches.append({
                "freq": float(freqs[m]),
                "depth_db": float(depth),
            })

    # ------------------------------------------------------------------
    # Frame-averaged librosa descriptors
    # ------------------------------------------------------------------
    centroid = float(np.mean(librosa.feature.spectral_centroid(S=S, sr=sr)))
    bandwidth = float(np.mean(librosa.feature.spectral_bandwidth(S=S, sr=sr)))
    flatness = float(np.mean(librosa.feature.spectral_flatness(S=S)))
    rolloff = float(np.mean(librosa.feature.spectral_rolloff(S=S, sr=sr)))

    return {
        "S_db": S_db,
        "freqs": freqs,
        "hop_length": hop_length,
        "n_fft": n_fft,
        "rolloff_85pct": freq_at_85,
        "rolloff_95pct": freq_at_95,
        "highest_freq_minus60db": highest_freq,
        "energy_distribution": energy_stats,
        "brick_wall_detected": brick_wall,
        "brick_wall_freq": brick_freq,
        "spectral_notches": notches,
        "spectral_centroid": centroid,
        "spectral_bandwidth": bandwidth,
        "spectral_flatness": flatness,
        "spectral_rolloff": rolloff,
        "hf_env": env_db,
        # Slicing already copes with envelopes shorter than 200 bins.
        "lf_env": env_db[:200],
    }