# File: mvsepless_plugins/medley_vox.py
# Repository page residue (not code): "noblebarkrr's picture", "Upload 2 files", "7108f39 verified"
import json
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime

import gradio as gr

from assets.translations import MVSEPLESS_TRANSLATIONS as TRANSLATIONS
from model_list import medley_vox_models
from utils.download_models import download_model
# Local path of the pretrained checkpoint fetched below.
PRETRAIN_FILE = os.path.join(
    os.getcwd(), "separator", "medley_vox", "pretrained_models", "xlsr_53_56k.pt"
)
_PRETRAIN_URL = (
    "https://huggingface.co/Sucial/MedleyVox-Inference-WebUI/resolve/main/"
    "pretrained/xlsr_53_56k.pt?download=true"
)

# Fetch the checkpoint once, at import time, if it is not on disk yet.
if not os.path.exists(PRETRAIN_FILE):
    try:
        # wget does not create missing parent directories for -O targets.
        os.makedirs(os.path.dirname(PRETRAIN_FILE), exist_ok=True)
        # Argument list instead of a shell string: a working directory that
        # contains spaces or shell metacharacters cannot break the command.
        _pretrain_ok = subprocess.run(
            ["wget", "-O", PRETRAIN_FILE, _PRETRAIN_URL], check=False
        ).returncode == 0
    except OSError as _pretrain_err:
        # Keep the download best-effort (e.g. wget is not installed): report
        # the problem instead of failing the module import.
        print(f"Could not download {PRETRAIN_FILE}: {_pretrain_err}")
        _pretrain_ok = False
    if not _pretrain_ok and os.path.exists(PRETRAIN_FILE):
        # "wget -O" creates the target file before the transfer succeeds; a
        # leftover stub would make the existence check above skip every
        # future download attempt.
        try:
            os.remove(PRETRAIN_FILE)
        except OSError:
            pass
# Language key used by t() to pick a translation table; changed via set_language().
CURRENT_LANG = "ru"

# Directory handed to download_model() as the model cache location.
MODELS_CACHE_DIR = os.path.join(os.getcwd(), "separator", "models_cache")

# Audio formats known to the plugin; the UI dropdowns filter "ogg" out of this list.
OUTPUT_FORMATS = [
    "mp3",
    "wav",
    "flac",
    "ogg",
    "opus",
    "m4a",
    "aac",
    "aiff",
]

# Base directory under which a timestamped folder is chosen for every run.
OUTPUT_DIR = "/content/output"
def set_language(lang):
    """Make ``lang`` the language key used by subsequent ``t()`` lookups.

    The value is stored in the module-level ``CURRENT_LANG`` variable; it is
    not validated here.
    """
    global CURRENT_LANG
    CURRENT_LANG = lang
def t(key, **kwargs):
    """Return the translation for ``key`` with optional value substitution.

    The text is looked up in ``TRANSLATIONS[CURRENT_LANG]``; when the key is
    missing, the key itself is used as the text. If keyword arguments are
    given they are substituted with ``str.format``; otherwise the text is
    returned untouched (so literal braces are safe in that case).
    """
    template = TRANSLATIONS[CURRENT_LANG].get(key, key)
    if not kwargs:
        return template
    return template.format(**kwargs)
def medley_voxer(input, output, model_name, output_format, stereo_mode):
    """Run the MedleyVox inference module on ``input`` and return its results.

    The model files are fetched through ``download_model`` and the module
    ``separator.medley_vox.svs.inference`` is run as a child process.

    Args:
        input: Value passed as ``--inference_data_dir``.
        output: Directory passed as ``--results_save_dir``; ``results.json``
            is read from it afterwards.
        model_name: Key of ``medley_vox_models``; also passed as ``--exp_name``.
        output_format: Value passed as ``--output_format``.
        stereo_mode: Value passed as ``--stereo``.

    Returns:
        The parsed content of ``<output>/results.json`` (callers in this file
        iterate it as ``(stem, file)`` pairs), or an empty list when the
        child process could not be started, exited with a non-zero status,
        or produced no ``results.json``.

    Raises:
        KeyError: If ``model_name`` is not a key of ``medley_vox_models``.
    """
    model_info = medley_vox_models[model_name]
    medley_vox_model_dir = download_model(
        MODELS_CACHE_DIR,
        model_name,
        "medley_vox",
        model_info["checkpoint_url"],
        model_info["config_url"],
    )

    # Argument list instead of a shell string: file paths and model names that
    # contain quotes, spaces or shell metacharacters are passed through
    # verbatim and cannot break (or inject into) the command line.
    command = [
        # Same interpreter as the running application.
        sys.executable or "python",
        "-m", "separator.medley_vox.svs.inference",
        "--inference_data_dir", str(input),
        "--results_save_dir", str(output),
        "--model_dir", str(medley_vox_model_dir),
        "--exp_name", str(model_name),
        "--use_overlapadd=ola",
        "--stereo", str(stereo_mode),
        "--output_format", str(output_format),
    ]
    try:
        completed = subprocess.run(command, check=False)
    except OSError as err:
        print(f"medley_vox inference could not be started: {err}")
        return []
    if completed.returncode != 0:
        # Do not fall through to results.json: the output directory may be
        # shared with an earlier run whose stale results would otherwise be
        # reported as the outcome of this failed one.
        print(f"medley_vox inference exited with code {completed.returncode}")
        return []

    results_path = os.path.join(output, "results.json")
    if not os.path.exists(results_path):
        return []
    with open(results_path, encoding="utf-8") as f:
        return json.load(f)
def medley_voxer_gradio(input, output, model_name, output_format, stereo_mode):
    """Gradio callback: separate ``input`` and fill the two audio slots.

    Arguments are forwarded unchanged to ``medley_voxer``.

    Returns:
        A tuple of exactly two ``gr.update`` objects. The first results (at
        most two) become visible players labelled with their stem name; any
        slot without a result is hidden. Always returning two updates keeps
        the return value in step with the two output components the click
        handler is wired to, even when separation produced fewer results.
    """
    slot_count = 2
    output_audio = medley_voxer(input, output, model_name, output_format, stereo_mode)

    results = [
        gr.update(visible=True, label=stem, value=output_file)
        for stem, output_file in (output_audio or [])[:slot_count]
    ]
    # Pad with hidden players, mirroring multi_voxer_gradio.
    while len(results) < slot_count:
        results.append(gr.update(visible=False, label=None, value=None))
    return tuple(results)
##############
def _split_vocals(input, output, model_name, output_format, stereo_mode, levels):
    """Separate ``input`` and re-separate every result ``levels - 1`` more times.

    Results are processed depth-first, in the order ``medley_voxer`` returns
    them; only the entries produced at the deepest level are returned.
    """
    voxes = medley_voxer(input, output, model_name, output_format, stereo_mode)
    if levels <= 1:
        return voxes
    results = []
    for _stem, file in voxes:
        results.extend(
            _split_vocals(file, output, model_name, output_format, stereo_mode, levels - 1)
        )
    return results


def multi_voxer(input, output, model_name, output_format, stereo_mode, stems):
    """Repeatedly split ``input`` until ``stems`` parts have been requested.

    Every ``medley_voxer`` result is fed back into ``medley_voxer``; one extra
    round of splitting is performed for each doubling of ``stems`` (2 -> one
    round, 4 -> two rounds, 8 -> three, 16 -> four, and so on).

    Args:
        input, output, model_name, output_format, stereo_mode: Forwarded to
            ``medley_voxer``; the same ``output`` directory is used for every
            round.
        stems: Requested number of parts; must be a power of two >= 2.

    Returns:
        The list of ``(stem, file)`` entries from the last round of
        splitting, or ``None`` when ``stems`` is not a supported value (in
        which case nothing is processed).
    """
    try:
        count = int(stems)
    except (TypeError, ValueError):
        return None
    # Reject non-integral values (e.g. "4", 2.5) and anything that is not a
    # power of two; such values were never handled and yield None.
    if count != stems or count < 2 or count & (count - 1):
        return None

    levels = count.bit_length() - 1  # log2(count)
    results = _split_vocals(input, output, model_name, output_format, stereo_mode, levels)
    if levels > 1:
        # Console trace of the final entries for multi-round separations.
        print(results)
    return results
##############
def multi_voxer_gradio(input, output, model_name, output_format, stereo_mode, stems):
    """Gradio callback: run ``multi_voxer`` and fill the 20 audio slots.

    Arguments are forwarded unchanged to ``multi_voxer``.

    Returns:
        A tuple of exactly 20 ``gr.update`` objects: one visible player,
        labelled with the stem name, for each of the first 20 results,
        followed by hidden players for the remaining slots.
    """
    slot_total = 20
    separated = multi_voxer(input, output, model_name, output_format, stereo_mode, stems)

    updates = []
    if separated is not None:
        updates = [
            gr.update(visible=True, label=stem, value=output_file)
            for stem, output_file in separated[:slot_total]
        ]

    # Fill the remaining slots with invisible elements.
    updates.extend(
        gr.update(visible=False, label=None, value=None)
        for _ in range(slot_total - len(updates))
    )
    return tuple(updates)
def medley_vox_plugin_name():
    """Return the name string of this plugin, ``"Medley-Vox"``."""
    plugin_title = "Medley-Vox"
    return plugin_title
def medley_vox_plugin(lang):
    """Build the plugin's Gradio components and wire up their click handlers.

    Creates two tabs: one that runs ``medley_voxer_gradio`` into 2 audio
    players, and one that runs ``multi_voxer_gradio`` into 20 audio players.
    Component labels are translated with ``t()`` after ``lang`` is made the
    current language.

    NOTE(review): the indentation of this function was reconstructed from a
    source whose leading whitespace had been lost; the nesting of the layout
    blocks below is a best guess and should be confirmed against the
    original file. Presumably this must be called inside an active Gradio
    Blocks context; verify against the caller.
    """
    set_language(lang)
    # Hidden text field used to hand the per-run output directory from the
    # first step of each click chain to the second one.
    output_dir = gr.Text(value="/content/output/", visible=False)
    # --- Tab 1: single separation ---
    with gr.Tab(t("inference")):
        with gr.Row(equal_height=True):
            with gr.Column():
                input_voice = gr.Audio(show_label=False, type="filepath", interactive=True)
            with gr.Column():
                vox_model_name = gr.Dropdown(label=t("vox_model_name"), choices=list(medley_vox_models.keys()), value=list(medley_vox_models.keys())[0], interactive=True, filterable=False)
                stereo_mode = gr.Dropdown(label=t("vox_stereo_mode"), choices=["mono", "full"], value="mono", interactive=True, filterable=False)
                # "ogg" is excluded from the formats offered in the UI.
                output_vox_format = gr.Dropdown(label=t("vox_output_format"), choices=list(filter(lambda fmt: fmt != "ogg", OUTPUT_FORMATS)), value="mp3", interactive=True, filterable=False)
        separate_vox_btn = gr.Button(t("separate_vocals_btn"), variant="primary")
        # Two result players; only the first one is visible initially.
        output_voxes = [gr.Audio(visible=(i == 0), interactive=False, type="filepath", show_download_button=True) for i in range(2)]
    # --- Tab 2: repeated (multi-stem) separation ---
    with gr.Tab(t("vocal_multi_separation")):
        with gr.Row(equal_height=True):
            with gr.Column():
                input_vox = gr.Audio(show_label=False, type="filepath", interactive=True)
            with gr.Column():
                vox_m_model_name = gr.Dropdown(label=t("vox_model_name"), choices=list(medley_vox_models.keys()), value=list(medley_vox_models.keys())[0], interactive=True, filterable=False)
                # NOTE(review): which of the following dropdowns sit inside
                # this row cannot be told from the flattened source — confirm.
                with gr.Row():
                    stereo_m_mode = gr.Dropdown(label=t("vox_stereo_mode"), choices=["mono", "full"], value="mono", interactive=True, filterable=False)
                    count_stems = gr.Dropdown(label=t("vox_count_stems"), choices=[2, 4, 8, 16], value=2, interactive=True, filterable=False)
                output_m_vox_format = gr.Dropdown(label=t("vox_output_format"), choices=list(filter(lambda fmt: fmt != "ogg", OUTPUT_FORMATS)), value="mp3", interactive=True, filterable=False)
        separate_m_vox_btn = gr.Button(t("vox_multi_separate_btn"), variant="primary")
        # Twenty result players, matching the 20 updates returned by
        # multi_voxer_gradio; only the first one is visible initially.
        output_m_voxes = [gr.Audio(visible=(i == 0), interactive=False, type="filepath", show_download_button=True) for i in range(20)]
    # Each click first stores a timestamped directory path under OUTPUT_DIR in
    # the hidden output_dir field, then runs the separation with that path.
    separate_m_vox_btn.click(fn=(lambda : os.path.join(OUTPUT_DIR, datetime.now().strftime("%Y%m%d_%H%M%S"))), inputs=None, outputs=output_dir).then(fn=multi_voxer_gradio, inputs=[input_vox, output_dir, vox_m_model_name, output_m_vox_format, stereo_m_mode, count_stems], outputs=[*output_m_voxes])
    separate_vox_btn.click(fn=(lambda : os.path.join(OUTPUT_DIR, datetime.now().strftime("%Y%m%d_%H%M%S"))), inputs=None, outputs=output_dir).then(fn=medley_voxer_gradio, inputs=[input_voice, output_dir, vox_model_name, output_vox_format, stereo_mode], outputs=output_voxes)