File size: 1,445 Bytes
de9c0fe
 
 
 
b6c1b75
de9c0fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import numpy as np, librosa


def load_mono(path, sr=16000, *, top_db=30, target_rms=0.05):
    """Load an audio file as a trimmed, RMS-normalised mono signal.

    Args:
        path: Location of the audio file. It is converted with ``str()``
            before being handed to ``librosa.load``.
        sr: Sample rate passed to ``librosa.load``.
        top_db: Threshold passed to ``librosa.effects.trim`` when trimming
            the loaded signal.
        target_rms: RMS level the returned samples are scaled to.

    Returns:
        A tuple ``(x, sr)`` holding the processed samples and the sample
        rate reported by ``librosa.load``.
    """
    x, sr = librosa.load(str(path), sr=sr, mono=True)
    x, _ = librosa.effects.trim(x, top_db=top_db)
    if x.size == 0:
        # Nothing is left to scale; taking the mean of an empty slice would
        # only produce NaN and a RuntimeWarning.
        return x, sr
    # The small epsilon keeps the division finite for an all-zero signal.
    rms = np.sqrt(np.mean(x**2)) + 1e-8
    x = x * (target_rms / rms)  # simple RMS target ≈ loudness norm
    return x, sr


def extract_features(x, sr=16000, n_mels=64, n_mfcc=20):
    """Summarise an audio signal as a single flat feature vector.

    The vector is the concatenation, in this order, of:

    * per-band mean and standard deviation of the log-mel spectrogram,
    * per-coefficient mean and standard deviation of the MFCCs, their
      first-order deltas and their second-order deltas,
    * the mean zero-crossing rate, spectral centroid, spectral rolloff and
      spectral flatness.

    Args:
        x: Audio samples (presumably a 1-D mono signal such as the one
            returned by ``load_mono`` -- confirm against callers).
        sr: Sample rate of ``x``.
        n_mels: Number of mel bands in the spectrogram.
        n_mfcc: Number of MFCC coefficients.

    Returns:
        A 1-D ``numpy`` array of features. For a 1-D input signal its length
        is expected to be ``2 * n_mels + 6 * n_mfcc + 4``.
    """
    mel_power = librosa.feature.melspectrogram(y=x, sr=sr, n_mels=n_mels)
    logmel = librosa.power_to_db(mel_power + 1e-9)
    logmel_stats = np.hstack([logmel.mean(axis=1), logmel.std(axis=1)])

    # Reuse the log-mel spectrogram instead of converting to dB a second time.
    mfcc = librosa.feature.mfcc(S=logmel, sr=sr, n_mfcc=n_mfcc)

    # Pick the largest odd delta window that is at most 9 and does not exceed
    # the number of frames.
    frames = mfcc.shape[1]
    width = min(9, frames if frames % 2 else frames - 1)
    if width < 3:
        # Too few frames for a delta window of at least 3: use zero deltas.
        d1 = np.zeros_like(mfcc)
        d2 = np.zeros_like(mfcc)
    else:
        d1 = librosa.feature.delta(mfcc, width=width)
        d2 = librosa.feature.delta(mfcc, width=width, order=2)
    mfcc_stats = np.hstack([
        mfcc.mean(axis=1), mfcc.std(axis=1),
        d1.mean(axis=1), d1.std(axis=1),
        d2.mean(axis=1), d2.std(axis=1),
    ])

    # Scalar descriptors: each is averaged over everything the call returns.
    zcr = librosa.feature.zero_crossing_rate(x).mean()
    centroid = librosa.feature.spectral_centroid(y=x, sr=sr).mean()
    rolloff = librosa.feature.spectral_rolloff(y=x, sr=sr).mean()
    flatness = librosa.feature.spectral_flatness(y=x).mean()
    return np.hstack([logmel_stats, mfcc_stats, [zcr, centroid, rolloff, flatness]])