import functools

import librosa
import librosa.display
import numpy as np
import scipy.io.wavfile
import scipy.signal
import torch
import torchaudio
import torchaudio.transforms as T
from pydub import AudioSegment
from speechbrain.pretrained import EncoderClassifier
from transformers import pipeline
|
|
| |
def load_audio(file_path, target_sr=16000):
    """Load an audio file, resampled to ``target_sr``.

    Args:
        file_path: Path to the audio file readable by librosa.
        target_sr: Sample rate to resample to (default 16 kHz).

    Returns:
        Tuple ``(samples, sample_rate)`` where ``samples`` is a float
        numpy array and ``sample_rate`` equals ``target_sr``.
    """
    # librosa.load already returns the (y, sr) pair we want.
    return librosa.load(file_path, sr=target_sr)
|
|
| |
def reduce_noise(y, sr, kernel_size=3):
    """Suppress impulsive noise in a signal with a median filter.

    Args:
        y: 1-D numpy array of audio samples.
        sr: Sample rate of ``y``. Unused by the filter itself; kept so the
            signature matches the other ``(y, sr)`` helpers in this module.
        kernel_size: Odd window length for the median filter
            (default 3, the previous hard-coded value).

    Returns:
        Numpy array of the same shape as ``y`` with the filter applied.

    Note: a plain median filter removes clicks/impulses only — it is not
    broadband noise reduction despite the function's name.
    """
    return scipy.signal.medfilt(y, kernel_size=kernel_size)
|
|
| |
@functools.lru_cache(maxsize=1)
def _get_asr_pipeline():
    """Build the Whisper ASR pipeline once and reuse it across calls.

    The previous implementation constructed the pipeline inside
    speech_to_text, reloading the full model on every invocation.
    """
    return pipeline("automatic-speech-recognition", model="openai/whisper-small")


def speech_to_text(file_path):
    """Transcribe an audio file to text with Whisper-small.

    Args:
        file_path: Path to an audio file accepted by the HF ASR pipeline.

    Returns:
        The transcribed text (str) from the pipeline's ``"text"`` field.
    """
    transcript = _get_asr_pipeline()(file_path)
    return transcript["text"]
|
|
| |
def detect_emotion(file_path):
    """Classify the emotion expressed in an audio file.

    Args:
        file_path: Path to an audio file loadable by torchaudio.

    Returns:
        The fourth element of SpeechBrain's ``classify_batch`` result
        (the predicted text labels; verify against the SpeechBrain API).
    """
    model = EncoderClassifier.from_hparams(
        source="speechbrain/emotion-recognition-wav2vec2",
        savedir="pretrained_models/emotion-recognition",
    )
    waveform, _sample_rate = torchaudio.load(file_path)
    # classify_batch returns a 4-tuple; the last item holds the labels.
    _, _, _, labels = model.classify_batch(waveform)
    return labels
|
|
| |
def classify_sound(file_path):
    """Assign a sound-category label to an audio file.

    Args:
        file_path: Path to an audio file loadable by torchaudio.

    Returns:
        The fourth element of SpeechBrain's ``classify_batch`` result
        (the predicted text labels; verify against the SpeechBrain API).
    """
    model = EncoderClassifier.from_hparams(
        source="speechbrain/sound-classification",
        savedir="pretrained_models/sound-classification",
    )
    waveform, _sample_rate = torchaudio.load(file_path)
    # classify_batch returns a 4-tuple; the last item holds the labels.
    _, _, _, labels = model.classify_batch(waveform)
    return labels
|
|
| |
def save_audio(y, sr, output_path):
    """Write an audio signal to a WAV file.

    Fix: ``librosa.output.write_wav`` was removed in librosa 0.8.0, so the
    original call raises ``AttributeError`` on current librosa. We write the
    WAV with ``scipy.io.wavfile`` instead (scipy is already a dependency).

    Args:
        y: 1-D numpy array of samples; cast to float32 (32-bit float WAV).
        sr: Sample rate in Hz.
        output_path: Destination ``.wav`` path.
    """
    data = np.asarray(y, dtype=np.float32)
    # scipy's argument order is (filename, rate, data).
    scipy.io.wavfile.write(output_path, sr, data)
|
|
| |
def extract_features(file_path, target_sr=16000):
    """Extract MFCC, chroma, and mel-spectrogram features from an audio file.

    Generalization: the sample rate was hard-coded to 16000; it is now the
    keyword parameter ``target_sr`` with the same default, so existing
    callers are unaffected.

    Args:
        file_path: Path to an audio file readable by librosa.
        target_sr: Sample rate to resample to before feature extraction.

    Returns:
        Dict with keys ``"mfccs"`` (13 coefficients), ``"chroma"``, and
        ``"mel_spec"``, each a 2-D numpy array of frame-wise features.
    """
    y, sr = librosa.load(file_path, sr=target_sr)
    return {
        "mfccs": librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13),
        "chroma": librosa.feature.chroma_stft(y=y, sr=sr),
        "mel_spec": librosa.feature.melspectrogram(y=y, sr=sr),
    }