"""Gradio plugin that separates vocals with Medley-Vox models."""

import json
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime

import gradio as gr

from model_list import medley_vox_models
from utils.download_models import download_model
from assets.translations import MVSEPLESS_TRANSLATIONS as TRANSLATIONS

PRETRAIN_FILE = os.path.join(
    os.getcwd(), "separator", "medley_vox", "pretrained_models", "xlsr_53_56k.pt"
)
PRETRAIN_URL = (
    "https://huggingface.co/Sucial/MedleyVox-Inference-WebUI/resolve/main/"
    "pretrained/xlsr_53_56k.pt?download=true"
)

# Fetch the pretrained checkpoint once, at import time. An argument list is
# used instead of a shell string so the "?" in the URL and any spaces in the
# working directory are passed to wget verbatim.
if not os.path.exists(PRETRAIN_FILE):
    try:
        subprocess.run(["wget", "-O", PRETRAIN_FILE, PRETRAIN_URL], check=False)
    except OSError as err:
        # Keep the download best-effort: a missing wget must not break import.
        print(f"Could not download {PRETRAIN_FILE}: {err}")

CURRENT_LANG = "ru"
MODELS_CACHE_DIR = os.path.join(os.getcwd(), "separator", "models_cache")
OUTPUT_FORMATS = ["mp3", "wav", "flac", "ogg", "opus", "m4a", "aac", "aiff"]
OUTPUT_DIR = "/content/output"

# Number of audio slots shown by each tab of the UI.
_SINGLE_SLOTS = 2
_MULTI_SLOTS = 20

# Extra separation rounds applied to every primary stem, keyed by the
# requested stem count (2 stems need no extra round).
_EXTRA_ROUNDS = {4: 1, 8: 2, 16: 3}


def set_language(lang):
    """Select the language used by :func:`t` for subsequent lookups."""
    global CURRENT_LANG
    CURRENT_LANG = lang


def t(key, **kwargs):
    """Return the translation for *key*, formatted with *kwargs* if given.

    Falls back to *key* itself when the current language has no entry.
    """
    translation = TRANSLATIONS[CURRENT_LANG].get(key, key)
    return translation.format(**kwargs) if kwargs else translation


def medley_voxer(input, output, model_name, output_format, stereo_mode):
    """Run the Medley-Vox inference module and return its reported results.

    The model files for *model_name* are fetched through ``download_model``,
    then ``separator.medley_vox.svs.inference`` is executed as a subprocess.

    Returns:
        The parsed content of ``results.json`` in *output*, or ``[]`` when
        that file does not exist after the run.
    """
    model_info = medley_vox_models[model_name]
    medley_vox_model_dir = download_model(
        MODELS_CACHE_DIR,
        model_name,
        "medley_vox",
        model_info["checkpoint_url"],
        model_info["config_url"],
    )

    # An argument list (no shell) keeps quotes or spaces in uploaded file
    # names and model names from breaking or injecting into the command.
    command = [
        sys.executable or "python",
        "-m",
        "separator.medley_vox.svs.inference",
        "--inference_data_dir",
        str(input),
        "--results_save_dir",
        str(output),
        "--model_dir",
        str(medley_vox_model_dir),
        "--exp_name",
        str(model_name),
        "--use_overlapadd=ola",
        "--stereo",
        str(stereo_mode),
        "--output_format",
        str(output_format),
    ]
    subprocess.run(command, check=False)

    # NOTE(review): multi-stem jobs reuse the same output directory for every
    # run, so a failed run would re-read the previous run's results.json.
    # Confirm whether the inference script always rewrites it.
    results_path = os.path.join(output, "results.json")
    if os.path.exists(results_path):
        with open(results_path, encoding="utf-8") as f:
            return json.load(f)
    return []


def _audio_slot_updates(output_audio, slot_count):
    """Build exactly *slot_count* ``gr.update`` objects for audio outputs.

    The first entries show the ``(stem, file)`` pairs of *output_audio*; any
    remaining slots are hidden. Gradio needs one value per bound output.
    """
    updates = []
    if output_audio is not None:
        for stem, output_file in output_audio[:slot_count]:
            updates.append(gr.update(visible=True, label=stem, value=output_file))
    # Fill the remaining slots with invisible components.
    while len(updates) < slot_count:
        updates.append(gr.update(visible=False, label=None, value=None))
    return tuple(updates)


def medley_voxer_gradio(input, output, model_name, output_format, stereo_mode):
    """Separate one file and return updates for the two single-tab slots.

    Always returns two updates, hiding the slots that received no stem, so
    the result matches the two output components bound in the UI.
    """
    output_audio = medley_voxer(input, output, model_name, output_format, stereo_mode)
    return _audio_slot_updates(output_audio, _SINGLE_SLOTS)


def _split_recursively(file, output, model_name, output_format, stereo_mode, rounds):
    """Separate *file*, then separate each resulting stem, *rounds* deep.

    Works depth-first: a stem is fully split before its sibling is touched.
    Returns the ``(stem, file)`` entries of the deepest round, in order.
    """
    voxes = medley_voxer(file, output, model_name, output_format, stereo_mode)
    if rounds <= 1:
        return voxes
    leaves = []
    for _stem, sub_file in voxes:
        leaves.extend(
            _split_recursively(
                sub_file, output, model_name, output_format, stereo_mode, rounds - 1
            )
        )
    return leaves


def multi_voxer(input, output, model_name, output_format, stereo_mode, stems):
    """Split *input* repeatedly until *stems* voices have been requested.

    Every separation yields the entries reported by :func:`medley_voxer`;
    each of them is separated again for 4, 8 and 16 stems (one, two and
    three extra rounds respectively).

    Returns:
        The primary result for ``stems == 2``, the flattened results of the
        last round for 4, 8 or 16, and ``None`` for any other value (the
        primary separation still runs in that case).
    """
    output_audio = medley_voxer(input, output, model_name, output_format, stereo_mode)
    if stems == 2:
        return output_audio

    extra_rounds = _EXTRA_ROUNDS.get(stems)
    if extra_rounds is None:
        return None

    results = []
    for _stem, file in output_audio:
        results.extend(
            _split_recursively(
                file, output, model_name, output_format, stereo_mode, extra_rounds
            )
        )
    print(results)
    return results


def multi_voxer_gradio(input, output, model_name, output_format, stereo_mode, stems):
    """Run :func:`multi_voxer` and return updates for the 20 multi-tab slots."""
    output_audio = multi_voxer(input, output, model_name, output_format, stereo_mode, stems)
    return _audio_slot_updates(output_audio, _MULTI_SLOTS)


def medley_vox_plugin_name():
    """Return the display name of this plugin."""
    return "Medley-Vox"


def _new_output_dir():
    """Return a timestamped directory path under ``OUTPUT_DIR``."""
    return os.path.join(OUTPUT_DIR, datetime.now().strftime("%Y%m%d_%H%M%S"))


def medley_vox_plugin(lang):
    """Build the plugin UI: a single-separation tab and a multi-stem tab.

    Must be called inside a Gradio layout context. Each button first stores
    a fresh timestamped output directory in a hidden text component, then
    runs the matching separation function.
    """
    # NOTE(review): the source had lost its indentation; the nesting of the
    # layout contexts below was reconstructed from statement order. Confirm
    # against the intended layout.
    set_language(lang)
    output_dir = gr.Text(value="/content/output/", visible=False)

    model_names = list(medley_vox_models.keys())
    format_choices = [fmt for fmt in OUTPUT_FORMATS if fmt != "ogg"]

    with gr.Tab(t("inference")):
        with gr.Row(equal_height=True):
            with gr.Column():
                input_voice = gr.Audio(show_label=False, type="filepath", interactive=True)
            with gr.Column():
                vox_model_name = gr.Dropdown(
                    label=t("vox_model_name"),
                    choices=model_names,
                    value=model_names[0],
                    interactive=True,
                    filterable=False,
                )
                stereo_mode = gr.Dropdown(
                    label=t("vox_stereo_mode"),
                    choices=["mono", "full"],
                    value="mono",
                    interactive=True,
                    filterable=False,
                )
                output_vox_format = gr.Dropdown(
                    label=t("vox_output_format"),
                    choices=format_choices,
                    value="mp3",
                    interactive=True,
                    filterable=False,
                )
                separate_vox_btn = gr.Button(t("separate_vocals_btn"), variant="primary")
        output_voxes = [
            gr.Audio(
                visible=(i == 0),
                interactive=False,
                type="filepath",
                show_download_button=True,
            )
            for i in range(_SINGLE_SLOTS)
        ]

    with gr.Tab(t("vocal_multi_separation")):
        with gr.Row(equal_height=True):
            with gr.Column():
                input_vox = gr.Audio(show_label=False, type="filepath", interactive=True)
            with gr.Column():
                vox_m_model_name = gr.Dropdown(
                    label=t("vox_model_name"),
                    choices=model_names,
                    value=model_names[0],
                    interactive=True,
                    filterable=False,
                )
                with gr.Row():
                    stereo_m_mode = gr.Dropdown(
                        label=t("vox_stereo_mode"),
                        choices=["mono", "full"],
                        value="mono",
                        interactive=True,
                        filterable=False,
                    )
                    count_stems = gr.Dropdown(
                        label=t("vox_count_stems"),
                        choices=[2, 4, 8, 16],
                        value=2,
                        interactive=True,
                        filterable=False,
                    )
                output_m_vox_format = gr.Dropdown(
                    label=t("vox_output_format"),
                    choices=format_choices,
                    value="mp3",
                    interactive=True,
                    filterable=False,
                )
                separate_m_vox_btn = gr.Button(t("vox_multi_separate_btn"), variant="primary")
        output_m_voxes = [
            gr.Audio(
                visible=(i == 0),
                interactive=False,
                type="filepath",
                show_download_button=True,
            )
            for i in range(_MULTI_SLOTS)
        ]

    separate_m_vox_btn.click(
        fn=_new_output_dir,
        inputs=None,
        outputs=output_dir,
    ).then(
        fn=multi_voxer_gradio,
        inputs=[
            input_vox,
            output_dir,
            vox_m_model_name,
            output_m_vox_format,
            stereo_m_mode,
            count_stems,
        ],
        outputs=[*output_m_voxes],
    )

    separate_vox_btn.click(
        fn=_new_output_dir,
        inputs=None,
        outputs=output_dir,
    ).then(
        fn=medley_voxer_gradio,
        inputs=[input_voice, output_dir, vox_model_name, output_vox_format, stereo_mode],
        outputs=output_voxes,
    )