| import streamlit as st |
| import torch |
| import torchaudio |
| |
| from speechbrain.inference.speaker import EncoderClassifier |
| from speechbrain.inference.enhancement import SpectralMaskEnhancement |
| from speechbrain.inference.classifiers import AudioClassifier |
| import os |
| from sklearn.metrics.pairwise import cosine_similarity |
| import numpy as np |
|
|
| |
|
|
@st.cache_resource
def load_models():
    """Load the speaker-verification, keyword-spotting and enhancement models.

    Decorated with ``st.cache_resource`` so the pretrained models are built
    once and shared, rather than re-created each time the script reruns.

    Returns:
        A ``(spk_model, kws_model, enhancer)`` tuple: the VoxCeleb x-vector
        speaker encoder, the Google Speech Commands keyword classifier, and
        the MetricGAN+ speech enhancer.
    """
    spk_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-xvect-voxceleb",
        savedir="pretrained_models/spkrec-xvect-voxceleb",
    )

    # The Speech Commands checkpoint is an x-vector encoder + classifier, and
    # its model card loads it through EncoderClassifier. AudioClassifier's
    # classify_batch drives an STFT/mel front end (as used by the CNN14
    # checkpoints), which this checkpoint's hyperparameters do not provide.
    kws_model = EncoderClassifier.from_hparams(
        source="speechbrain/google_speech_command_xvector",
        savedir="pretrained_models/google_speech_command_xvector",
    )

    enhancer = SpectralMaskEnhancement.from_hparams(
        source="speechbrain/metricgan-plus-voicebank",
        savedir="pretrained_models/metricgan-plus-voicebank",
    )
    return spk_model, kws_model, enhancer
|
|
| |
# Root folder that holds one sub-folder of .wav recordings per enrolled speaker.
ENROLL_DIR = "enroll/"

# Cosine-similarity score an upload must exceed to be accepted as an
# enrolled speaker.
THRESHOLD = 0.85

# Module-level handles to the shared models; load_models() is cached, so
# script reruns reuse the same instances.
spk_model, kws_model, enhancer = load_models()
|
|
| |
|
|
def preprocess_audio(wav_file):
    """Load an audio clip, convert it to 16 kHz mono and denoise it.

    Args:
        wav_file: Anything ``torchaudio.load`` accepts: a filesystem path or
            a binary file-like object (e.g. a Streamlit upload).

    Returns:
        The processed waveform tensor at 16 kHz, or ``None`` when the audio
        could not be processed (the error is reported through ``st.error``).
    """
    target_rate = 16000
    try:
        # A file-like object may already have been read by an earlier
        # consumer; rewind so the decoder sees the stream from the start.
        if hasattr(wav_file, "seek"):
            wav_file.seek(0)

        sig, fs = torchaudio.load(wav_file)

        # torchaudio returns (channels, time). Downmix so that a stereo clip
        # is not mistaken for a batch of two utterances further down.
        if sig.shape[0] > 1:
            sig = sig.mean(dim=0, keepdim=True)

        # Resample BEFORE enhancing: the enhancement model is a 16 kHz model,
        # so it must not be fed audio at the file's native rate.
        if fs != target_rate:
            resampler = torchaudio.transforms.Resample(
                orig_freq=fs, new_freq=target_rate
            )
            sig = resampler(sig)

        if enhancer is not None:
            with torch.no_grad():
                # SpeechBrain lengths are RELATIVE (fraction of the longest
                # item in the batch), so a single full-length clip is 1.0,
                # not its sample count.
                enhanced_sig = enhancer.enhance_batch(
                    sig, lengths=torch.tensor([1.0])
                )
            # Drop the batch axis, as the original enhanced path did.
            sig = enhanced_sig.squeeze(0)

        return sig
    except Exception as e:
        # Best-effort boundary: surface the problem in the UI and let the
        # caller skip verification instead of crashing the page.
        st.error(f"Error memproses audio: {e}")
        return None
|
|
@st.cache_data
def get_enrollment_embeddings():
    """Build one averaged voice embedding per enrolled speaker.

    Each sub-folder of ``ENROLL_DIR`` is treated as one speaker. Every
    ``.wav`` file inside it is loaded, downmixed to mono, resampled to
    16 kHz and encoded with ``spk_model``; the speaker's reference embedding
    is the mean of those per-file embeddings. The result is cached with
    ``st.cache_data``.

    NOTE(review): unlike ``preprocess_audio``, enrollment audio is not passed
    through the enhancer. Confirm whether both paths should share the same
    preprocessing.

    Returns:
        A dict mapping speaker folder name to its mean embedding (NumPy
        array). Empty when ``ENROLL_DIR`` is missing or no file could be
        encoded. Files that fail are reported via ``st.error`` and skipped.
    """
    target_rate = 16000

    # isdir (not exists) so a stray file named like the folder does not make
    # os.listdir raise.
    if not os.path.isdir(ENROLL_DIR):
        st.warning(f"Folder '{ENROLL_DIR}' tidak ditemukan.")
        return {}

    enrollment_data = {}
    # Sorted listings keep the speaker order stable across platforms.
    for speaker_name in sorted(os.listdir(ENROLL_DIR)):
        speaker_dir = os.path.join(ENROLL_DIR, speaker_name)
        if not os.path.isdir(speaker_dir):
            continue

        embeddings = []
        for wav_file in sorted(os.listdir(speaker_dir)):
            # Accept .WAV / .Wav as well as .wav.
            if not wav_file.lower().endswith(".wav"):
                continue

            wav_path = os.path.join(speaker_dir, wav_file)
            try:
                sig, fs = torchaudio.load(wav_path)

                # Downmix multi-channel audio; otherwise each channel would be
                # encoded as a separate utterance and the squeezed embedding
                # would be 2-D, breaking the per-speaker mean below.
                if sig.shape[0] > 1:
                    sig = sig.mean(dim=0, keepdim=True)

                if fs != target_rate:
                    resampler = torchaudio.transforms.Resample(
                        orig_freq=fs, new_freq=target_rate
                    )
                    sig = resampler(sig)

                with torch.no_grad():
                    emb = spk_model.encode_batch(sig)

                # .cpu() keeps the NumPy conversion valid if the model runs
                # on an accelerator.
                embeddings.append(emb.squeeze().cpu().numpy())
            except Exception as e:
                # One unreadable file must not discard the whole enrollment.
                st.error(f"Gagal memproses {wav_path}: {e}")

        if embeddings:
            enrollment_data[speaker_name] = np.mean(embeddings, axis=0)

    return enrollment_data
|
|
| |
# ---------------------------------------------------------------------------
# Streamlit page: upload a .wav, verify the speaker against the enrolled
# embeddings, and, only if the speaker is accepted, detect the spoken command.
# ---------------------------------------------------------------------------

# NOTE(review): the "π" / "β" glyphs in several messages below look like
# mis-encoded emoji. They are kept as found because the intended characters
# cannot be determined from this file; restore them from the original source.
st.title("Sistem Verifikasi Perintah Suara π")
st.write("Unggah file .wav untuk verifikasi.")

# Reference embeddings, one per enrolled speaker.
enrollment_embeddings = get_enrollment_embeddings()

if not enrollment_embeddings:
    st.error("Tidak ada data pendaftaran yang ditemukan. Pastikan folder 'enroll' ada dan berisi file .wav.")
else:
    st.success(f"Berhasil memuat data pendaftaran untuk: {list(enrollment_embeddings.keys())}")

uploaded_file = st.file_uploader("Pilih file audio...", type=["wav"])

if uploaded_file is not None:
    st.audio(uploaded_file, format="audio/wav")

    if st.button("Verifikasi Sekarang"):
        with st.spinner("Memproses audio..."):
            signal = preprocess_audio(uploaded_file)

        # preprocess_audio returns None (and reports the error itself) when
        # the upload could not be decoded.
        if signal is not None:
            # ---- Stage 1: speaker verification ----
            st.subheader("Tahap 1: Verifikasi Speaker")

            with torch.no_grad():
                # .cpu() keeps the NumPy conversion valid on any device.
                upload_embedding = (
                    spk_model.encode_batch(signal).squeeze().cpu().numpy()
                )
            query = upload_embedding.reshape(1, -1)

            best_score = 0
            best_match = "Tidak Dikenali"

            # Compare the upload against every enrolled speaker and keep the
            # closest one.
            for speaker_name, enrolled_emb in enrollment_embeddings.items():
                score = cosine_similarity(query, enrolled_emb.reshape(1, -1))[0][0]

                st.write(f"Skor kemiripan dengan {speaker_name}: **{score:.2f}**")

                if score > best_score:
                    best_score = score
                    best_match = speaker_name

            if best_score > THRESHOLD:
                # This message was split mid-literal in the source (an
                # unterminated f-string). It is rejoined here and the broken
                # leading glyph restored as a check mark.
                # NOTE(review): confirm the intended glyph.
                st.success(f"✅ **Akses Diberikan**: Dikenali sebagai **{best_match}** (Skor: {best_score:.2f})")

                # ---- Stage 2: command detection ----
                st.subheader("Tahap 2: Deteksi Perintah")
                with st.spinner("Mendeteksi perintah..."):
                    with torch.no_grad():
                        prediction = kws_model.classify_batch(signal)

                # classify_batch returns (out_prob, score, index, text_lab).
                # out_prob holds LOG posteriors, so exponentiate the best one
                # to show a confidence in the 0..1 range.
                top_prob = torch.exp(torch.max(prediction[0])).item()
                top_label = prediction[3][0]

                st.write(f"Perintah terdeteksi: **{top_label}** (Keyakinan: {top_prob:.2f})")

                # "up" is mapped to the open command, "down" to close.
                command = top_label.lower()
                if command == "up":
                    st.balloons()
                    st.success(f"π **Perintah Diterima**: `{best_match}` berkata 'BUKA'.")
                elif command == "down":
                    st.success(f"π **Perintah Diterima**: `{best_match}` berkata 'TUTUP'.")
                else:
                    st.warning(f"Perintah '{top_label}' tidak dikenali sebagai 'Buka' atau 'Tutup'.")

            else:
                st.error(f"β **Akses Ditolak**: Suara tidak dikenali (Skor tertinggi: {best_score:.2f})")