| import gradio as gr |
| import os |
| import json |
| import math |
| import torch |
| from torch import nn |
| from torch.nn import functional as F |
| import librosa |
| import argparse |
| import librosa.display |
| import matplotlib.pyplot as plt |
|
|
|
|
| import commons |
| import utils |
| from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate |
| |
| from models_mel_style import SynthesizerTrn |
| from text.symbols import symbols |
| from text import text_to_sequence |
| from mel_processing import spectrogram_torch, spec_to_mel_torch |
| from scipy.io.wavfile import write |
|
|
| |
# Directory holding the selectable reference-speaker recordings.
AUDIO_DIR = "wav/wav_1"


def list_wav_files(directory=AUDIO_DIR):
    """Return the names of all WAV files in *directory*.

    Args:
        directory: Folder to scan; defaults to the module-level AUDIO_DIR.

    Returns:
        Sorted list of file names (not full paths) whose extension is
        ".wav", matched case-insensitively so ".WAV" files are included.
    """
    # sorted() gives a deterministic order — os.listdir order is arbitrary.
    return sorted(
        name for name in os.listdir(directory) if name.lower().endswith(".wav")
    )
|
|
| |
def get_audio_file(file_name):
    """Resolve *file_name* to its full path inside AUDIO_DIR."""
    return os.path.join(AUDIO_DIR, file_name)
|
|
def get_text(text, hps):
    """Convert raw text into a LongTensor of symbol ids per *hps* settings."""
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    # Optionally interleave blank tokens (id 0) between symbols, which
    # VITS-style models expect when trained with add_blank.
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)
|
|
| |
def generate_voice(prompt_text, ref_audio_filename):
    """Synthesize speech for *prompt_text*, conditioned on a reference audio.

    Args:
        prompt_text: Text to synthesize.
        ref_audio_filename: File name (inside wav/wav_1) of the reference
            recording whose mel-spectrogram provides the speaker/style prompt.

    Returns:
        Path of the generated .wav file written under the save directory.
    """
    checkpoint_path = "logs/large_audio/G_504000.pth"
    config_path = "configs/vn_base.json"
    save_path = "infer_result/"
    ref_audio_path = os.path.join("wav/wav_1", ref_audio_filename)

    # Build the network and read the checkpoint only once, then reuse it —
    # reloading the model from disk on every request is needlessly slow.
    if not hasattr(generate_voice, "_cached"):
        hps = utils.get_hparams_from_file(config_path)
        net_g = SynthesizerTrn(
            len(symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            n_speakers=0,
            **hps.model,
        )
        net_g.eval()
        utils.load_checkpoint(checkpoint_path, net_g, None)
        generate_voice._cached = (hps, net_g)
    hps, net_g = generate_voice._cached

    # Reference audio -> linear spectrogram -> mel, used as the style prompt.
    audio, _ = librosa.load(ref_audio_path, sr=hps.data.sampling_rate)
    audio = torch.from_numpy(audio).unsqueeze(0)
    spec = spectrogram_torch(audio, hps.data.filter_length, hps.data.sampling_rate,
                             hps.data.hop_length, hps.data.win_length, center=False)
    spec = torch.squeeze(spec, 0)
    mel = spec_to_mel_torch(spec, hps.data.filter_length, hps.data.n_mel_channels,
                            hps.data.sampling_rate, hps.data.mel_fmin, hps.data.mel_fmax)

    stn_tst = get_text(prompt_text, hps)
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])

    with torch.no_grad():
        # sid=None: the model is built with n_speakers=0, so no speaker id
        # is used (the original also built an unused speaker-id tensor).
        audio_gen = net_g.infer(
            x_tst, x_tst_lengths, mel.unsqueeze(0),
            sid=None, noise_scale=0.1,
            noise_scale_w=0.1, length_scale=1.1,
        )[0][0, 0].data.cpu().float().numpy()

    os.makedirs(save_path, exist_ok=True)
    # NOTE(review): naming by directory size can collide if files are ever
    # deleted between calls; acceptable for a demo.
    output_file = os.path.join(save_path, f'test_{len(os.listdir(save_path))}.wav')
    write(output_file, hps.data.sampling_rate, audio_gen)

    return output_file
|
|
| |
| |
|
|
| |
|
|
| |
| |
| |
|
|
| |
|
|
| |
| |
| |
|
|
| |
|
|
| |
|
|
| |
| |
|
|
with gr.Blocks() as demo:
    # Fixed heading markup: the literal "# " inside <h1> was rendered as text.
    gr.Markdown("<center><h1>Demo Model Text to Speech</h1></center>")

    prompt = gr.Textbox(label="Prompt", placeholder="Type something here...")

    gr.Markdown("## 🎧 Chọn hoặc ghi âm giọng nói tham chiếu")

    # Tab 1: pick one of the pre-bundled reference recordings.
    with gr.Tab("📁 Chọn từ file"):
        wav_files = sorted(list_wav_files())
        file_dropdown = gr.Dropdown(choices=wav_files, label="Chọn file WAV có sẵn")
        audio_output = gr.Audio(type="filepath", label="Nghe tại đây")
        file_dropdown.change(fn=get_audio_file, inputs=file_dropdown, outputs=audio_output)

    # Tab 2: record (or upload) a fresh reference clip.
    with gr.Tab("🎙️ Ghi âm mới"):
        recorded_audio = gr.Audio(label="Ghi âm hoặc chọn file", type="filepath")

    generate_button = gr.Button("Generate Voice")
    generated_audio_output = gr.Audio(type="filepath", label="🔊 Kết quả sinh giọng nói")

    def process_inputs(prompt_text, file_choice, recorded_path):
        """Choose the reference audio (a recording wins over the dropdown)
        and hand off to generate_voice."""
        if recorded_path is not None:
            import shutil  # local import: only needed on the recording path

            filename = f"user_recording_{len(os.listdir(AUDIO_DIR))}.wav"
            saved_path = os.path.join(AUDIO_DIR, filename)
            # copy, not os.rename: Gradio's temp file may live on a different
            # filesystem (rename would raise EXDEV), and Gradio owns/cleans
            # its own temp files — don't move them out from under it.
            shutil.copy(recorded_path, saved_path)
            ref_file = filename
        elif file_choice:
            ref_file = file_choice
        else:
            raise gr.Error("Bạn cần chọn hoặc ghi âm một file giọng nói.")

        return generate_voice(prompt_text, ref_file)

    generate_button.click(
        fn=process_inputs,
        inputs=[prompt, file_dropdown, recorded_audio],
        outputs=generated_audio_output,
    )

demo.launch()
|
|