Upload folder using huggingface_hub

6ff2047 verified over 1 year ago

26.2 kB

	import os, sys

	# On macOS, switch on PyTorch's MPS fallback flag before torch is imported
	# (presumably so operators missing on MPS run on the CPU -- TODO confirm).
	if sys.platform == "darwin":
	    os.environ.update({"PYTORCH_ENABLE_MPS_FALLBACK": "1"})

	# Make the launch directory importable.
	now_dir = os.getcwd()
	sys.path.append(now_dir)
	import multiprocessing


	class Harvest(multiprocessing.Process):
	    """Worker process that runs ``pyworld.harvest`` on jobs taken from a queue.

	    Each job read from ``inp_q`` is a 5-tuple ``(idx, x, res_f0, n_cpu, ts)``.
	    The estimated F0 is stored in ``res_f0[idx]``; once ``res_f0`` holds at
	    least ``n_cpu`` entries, ``ts`` is put on ``opt_q``.
	    """

	    def __init__(self, inp_q, opt_q):
	        super().__init__()
	        self.inp_q, self.opt_q = inp_q, opt_q

	    def run(self):
	        """Process jobs forever; the loop has no exit condition."""
	        # Imported lazily so the modules are loaded when the worker runs.
	        import numpy as np
	        import pyworld

	        while True:
	            job = self.inp_q.get()
	            idx, x, res_f0, n_cpu, ts = job
	            f0, _ = pyworld.harvest(
	                x.astype(np.double),
	                fs=16000,
	                f0_ceil=1100,
	                f0_floor=50,
	                frame_period=10,
	            )
	            res_f0[idx] = f0
	            # Signal the consumer once enough slots have been filled.
	            if n_cpu <= len(res_f0.keys()):
	                self.opt_q.put(ts)


	if __name__ == "__main__":
	from multiprocessing import Queue
	from queue import Empty
	import numpy as np
	import multiprocessing
	import traceback, re
	import json
	import PySimpleGUI as sg
	import sounddevice as sd
	import noisereduce as nr
	from multiprocessing import cpu_count
	import librosa, torch, time, threading
	import torch.nn.functional as F
	import torchaudio.transforms as tat
	from i18n import I18nAuto

	# Translator used for every user-facing label.
	i18n = I18nAuto()
	# Prefer CUDA, then Apple MPS, then fall back to the CPU.
	device = torch.device(
	    "cuda"
	    if torch.cuda.is_available()
	    else ("mps" if torch.backends.mps.is_available() else "cpu")
	)
	current_dir = os.getcwd()
	# Job queue (inp_q) and completion queue (opt_q) shared with the Harvest workers.
	inp_q = Queue()
	opt_q = Queue()
	n_cpu = min(cpu_count(), 8)
	for _ in range(n_cpu):
	    worker = Harvest(inp_q, opt_q)
	    # Harvest.run() never returns; as non-daemon processes the workers would
	    # keep the interpreter alive after the window is closed.
	    worker.daemon = True
	    worker.start()
	# Imported only after the workers have been started (original ordering kept).
	from rvc_for_realtime import RVC

	class GUIConfig:
	    """Plain container for the settings edited in the window.

	    ``GUI.set_values`` overwrites most of these attributes from the window
	    values before a conversion is started.
	    """

	    def __init__(self) -> None:
	        # Model files.
	        self.pth_path = self.index_path = ""
	        # Pitch setting passed to RVC, and working sample rate
	        # (the rate is replaced by the model's tgt_sr in start_vc).
	        self.pitch: int = 12
	        self.samplerate: int = 40000
	        # Block length in seconds.
	        self.block_time: float = 1.0
	        self.buffer_num: int = 1
	        # Noise-gate level compared against amplitude_to_db output.
	        self.threhold: int = -30
	        # Crossfade and extra-context lengths in seconds.
	        self.crossfade_time: float = 0.08
	        self.extra_time: float = 0.04
	        # Input / output noise-reduction switches.
	        self.I_noise_reduce = self.O_noise_reduce = False
	        self.index_rate = 0.3
	        # Capped at 8, taken from the script-level n_cpu.
	        self.n_cpu = min(n_cpu, 8)
	        self.f0method = "harvest"

	class GUI:
	def __init__(self) -> None:
	    """Create the default configuration and open the window.

	    ``launcher`` is called last; it builds the window and enters the event
	    loop, so this constructor only returns if that loop does.
	    """
	    # flag_vc is True while a conversion stream is running.
	    self.config, self.flag_vc = GUIConfig(), False
	    self.launcher()

	def load(self):
	    """Return the saved window settings, recreating them when unusable.

	    ``values1.json`` is read from the working directory. If it is missing,
	    is not valid JSON, or has no ``"f0method"`` entry, a default settings
	    dict is built and written back to the file.

	    Returns:
	        dict: the settings, plus one boolean per f0 method (``"pm"``,
	        ``"harvest"``, ``"crepe"``, ``"rmvpe"``) that is True only for the
	        selected method. ``launcher`` uses these as radio-button defaults.
	    """
	    (
	        input_devices,
	        output_devices,
	        input_indices,
	        output_indices,
	    ) = self.get_devices()

	    def default_device_name(names, indices, default_index):
	        # sd.default.device holds an index into the full device list, while
	        # `names` only lists devices of one direction, so translate it
	        # through the matching `indices` list instead of using it directly.
	        if default_index in indices:
	            return names[indices.index(default_index)]
	        # get_devices falls back to names when device dicts carry no "index";
	        # keep the historical positional lookup for that case, but bounded.
	        if isinstance(default_index, int) and 0 <= default_index < len(names):
	            return names[default_index]
	        return ""

	    try:
	        with open("values1.json", "r") as j:
	            data = json.load(j)
	        selected = data["f0method"]
	    except (OSError, ValueError, KeyError, TypeError):
	        # ValueError covers json.JSONDecodeError; KeyError/TypeError cover a
	        # file that parses but is not a settings dict.
	        data = {
	            "pth_path": " ",
	            "index_path": " ",
	            "sg_input_device": default_device_name(
	                input_devices, input_indices, sd.default.device[0]
	            ),
	            "sg_output_device": default_device_name(
	                output_devices, output_indices, sd.default.device[1]
	            ),
	            "threhold": "-45",
	            "pitch": "0",
	            "index_rate": "0",
	            "block_time": "1",
	            "crossfade_length": "0.04",
	            "extra_time": "1",
	            "f0method": "rmvpe",
	        }
	        selected = data["f0method"]
	        # Persist the defaults; previously the file was opened for writing
	        # but nothing was dumped, which left an empty file behind.
	        with open("values1.json", "w") as j:
	            json.dump(data, j)
	    # Derived flags are added after saving so they never reach the file.
	    for name in ("pm", "harvest", "crepe", "rmvpe"):
	        data[name] = selected == name
	    return data

	def launcher(self):
	    """Build the settings window from the stored values and run the event loop.

	    The layout consists of a model-file frame, an audio-device frame, a row
	    with the general and performance frames, and the start/stop controls.
	    Widget defaults come from ``self.load()``.
	    """
	    data = self.load()
	    sg.theme("LightBlue3")
	    input_devices, output_devices, _, _ = self.get_devices()
	    cwd = os.getcwd()

	    def file_row(key, button_text, folder, file_types):
	        # Path text box plus a browse button that opens in <cwd>/<folder>.
	        return [
	            sg.Input(default_text=data.get(key, ""), key=key),
	            sg.FileBrowse(
	                button_text,
	                initial_folder=os.path.join(cwd, folder),
	                file_types=file_types,
	            ),
	        ]

	    def device_row(label, key, choices):
	        # Label plus a drop-down of device names.
	        return [
	            sg.Text(label),
	            sg.Combo(choices, key=key, default_value=data.get(key, "")),
	        ]

	    def slider_row(label, key, bounds, step, fallback=""):
	        # Label plus a horizontal slider whose default is the stored value.
	        return [
	            sg.Text(label),
	            sg.Slider(
	                range=bounds,
	                key=key,
	                resolution=step,
	                orientation="h",
	                default_value=data.get(key, fallback),
	            ),
	        ]

	    model_frame = sg.Frame(
	        title=i18n("加载模型"),
	        layout=[
	            file_row("pth_path", i18n("选择.pth文件"), "weights", ((". pth"),)),
	            file_row("index_path", i18n("选择.index文件"), "logs", ((". index"),)),
	        ],
	    )

	    device_frame = sg.Frame(
	        layout=[
	            device_row(i18n("输入设备"), "sg_input_device", input_devices),
	            device_row(i18n("输出设备"), "sg_output_device", output_devices),
	        ],
	        title=i18n("音频设备(请使用同种类驱动)"),
	    )

	    # One radio button per f0 method, all in the "f0method" group.
	    f0_radios = [
	        sg.Radio(name, "f0method", key=name, default=data.get(name, "") == True)
	        for name in ("pm", "harvest", "crepe", "rmvpe")
	    ]
	    general_frame = sg.Frame(
	        layout=[
	            slider_row(i18n("响应阈值"), "threhold", (-60, 0), 1),
	            slider_row(i18n("音调设置"), "pitch", (-24, 24), 1),
	            slider_row(i18n("Index Rate"), "index_rate", (0.0, 1.0), 0.01),
	            [sg.Text(i18n("音高算法")), *f0_radios],
	        ],
	        title=i18n("常规设置"),
	    )

	    performance_frame = sg.Frame(
	        layout=[
	            slider_row(i18n("采样长度"), "block_time", (0.12, 2.4), 0.03),
	            slider_row(
	                i18n("harvest进程数"),
	                "n_cpu",
	                (1, n_cpu),
	                1,
	                min(self.config.n_cpu, n_cpu),
	            ),
	            slider_row(i18n("淡入淡出长度"), "crossfade_length", (0.01, 0.15), 0.01),
	            slider_row(i18n("额外推理时长"), "extra_time", (0.05, 3.00), 0.01),
	            [
	                sg.Checkbox(i18n("输入降噪"), key="I_noise_reduce"),
	                sg.Checkbox(i18n("输出降噪"), key="O_noise_reduce"),
	            ],
	        ],
	        title=i18n("性能设置"),
	    )

	    control_row = [
	        sg.Button(i18n("开始音频转换"), key="start_vc"),
	        sg.Button(i18n("停止音频转换"), key="stop_vc"),
	        sg.Text(i18n("推理时间(ms):")),
	        sg.Text("0", key="infer_time"),
	    ]

	    layout = [
	        [model_frame],
	        [device_frame],
	        [general_frame, performance_frame],
	        control_row,
	    ]
	    self.window = sg.Window("RVC - GUI", layout=layout)
	    self.event_handler()

	def event_handler(self):
	    """Run the window's event loop until the window is closed.

	    * Window closed: clear ``flag_vc`` (which ends the loop in
	      ``soundinput``) and terminate the program.
	    * ``start_vc`` while idle: validate the values via ``set_values``,
	      start the conversion and save the settings to ``values1.json``.
	    * ``stop_vc`` while running: clear ``flag_vc``.

	    Raises:
	        SystemExit: when the window is closed.
	    """
	    persisted_keys = (
	        "pth_path",
	        "index_path",
	        "sg_input_device",
	        "sg_output_device",
	        "threhold",
	        "pitch",
	        "index_rate",
	        "block_time",
	        "crossfade_length",
	        "extra_time",
	        "n_cpu",
	    )
	    while True:
	        event, values = self.window.read()
	        if event == sg.WINDOW_CLOSED:
	            self.flag_vc = False
	            # sys.exit() rather than the site-provided exit(): the latter is
	            # meant for interactive use and is not always defined.
	            sys.exit()
	        if event == "start_vc" and not self.flag_vc:
	            if self.set_values(values):
	                print("using_cuda:" + str(torch.cuda.is_available()))
	                self.start_vc()
	                settings = {key: values[key] for key in persisted_keys}
	                # set_values has already resolved the selected radio button.
	                settings["f0method"] = self.config.f0method
	                with open("values1.json", "w") as j:
	                    json.dump(settings, j)
	        if event == "stop_vc" and self.flag_vc:
	            self.flag_vc = False

	def set_values(self, values):
	    """Validate the window values and copy them into ``self.config``.

	    All validation happens before anything is changed, so a rejected
	    request leaves the audio devices and the configuration untouched.

	    Args:
	        values: the values dict returned by ``window.read()``.

	    Returns:
	        bool: True when the values were accepted and applied; False after
	        a popup has told the user what is missing or invalid.
	    """
	    if not values["pth_path"].strip():
	        sg.popup(i18n("请选择pth文件"))
	        return False
	    if not values["index_path"].strip():
	        sg.popup(i18n("请选择index文件"))
	        return False
	    # Paths containing any non-ASCII character are rejected.
	    non_ascii = re.compile(r"[^\x00-\x7F]+")
	    if non_ascii.search(values["pth_path"]):
	        sg.popup(i18n("pth文件路径不可包含中文"))
	        return False
	    if non_ascii.search(values["index_path"]):
	        sg.popup(i18n("index文件路径不可包含中文"))
	        return False
	    f0_methods = ("pm", "harvest", "crepe", "rmvpe")
	    selected = [name for name in f0_methods if values[name]]
	    if not selected:
	        # No radio button chosen: previously `.index(True)` raised ValueError
	        # here, after the audio devices had already been switched.
	        sg.popup(i18n("音高算法") + ": " + " / ".join(f0_methods))
	        return False
	    self.set_devices(values["sg_input_device"], values["sg_output_device"])
	    self.config.pth_path = values["pth_path"]
	    self.config.index_path = values["index_path"]
	    self.config.threhold = values["threhold"]
	    self.config.pitch = values["pitch"]
	    self.config.block_time = values["block_time"]
	    self.config.crossfade_time = values["crossfade_length"]
	    self.config.extra_time = values["extra_time"]
	    self.config.I_noise_reduce = values["I_noise_reduce"]
	    self.config.O_noise_reduce = values["O_noise_reduce"]
	    self.config.index_rate = values["index_rate"]
	    self.config.n_cpu = values["n_cpu"]
	    self.config.f0method = selected[0]
	    return True

	def start_vc(self):
	    """Load the model, allocate the streaming buffers and start the audio thread.

	    Frame counts are derived from the configured times and the model's
	    target sample rate (``self.rvc.tgt_sr``). The rolling input and output
	    buffers are rounded up to a whole multiple of ``self.zc``.
	    """
	    torch.cuda.empty_cache()
	    self.flag_vc = True
	    cfg = self.config
	    self.rvc = RVC(
	        cfg.pitch,
	        cfg.pth_path,
	        cfg.index_path,
	        cfg.index_rate,
	        cfg.n_cpu,
	        inp_q,
	        opt_q,
	        device,
	    )
	    cfg.samplerate = self.rvc.tgt_sr
	    # The crossfade can never be longer than one block.
	    cfg.crossfade_time = min(cfg.crossfade_time, cfg.block_time)
	    self.block_frame = int(cfg.block_time * cfg.samplerate)
	    self.crossfade_frame = int(cfg.crossfade_time * cfg.samplerate)
	    self.sola_search_frame = int(0.01 * cfg.samplerate)
	    self.extra_frame = int(cfg.extra_time * cfg.samplerate)
	    self.zc = self.rvc.tgt_sr // 100

	    # Total context length, rounded up to a multiple of zc.
	    context_frames = (
	        self.extra_frame
	        + self.crossfade_frame
	        + self.sola_search_frame
	        + self.block_frame
	    )
	    padded_frames = int(np.ceil(context_frames / self.zc) * self.zc)

	    self.input_wav: np.ndarray = np.zeros(padded_frames, dtype="float32")
	    self.output_wav_cache: torch.Tensor = torch.zeros(
	        padded_frames, device=device, dtype=torch.float32
	    )
	    # One pitch slot per zc-sized step of the input buffer.
	    pitch_slots = self.input_wav.shape[0] // self.zc
	    self.pitch: np.ndarray = np.zeros(pitch_slots, dtype="int32")
	    self.pitchf: np.ndarray = np.zeros(pitch_slots, dtype="float64")
	    self.output_wav: torch.Tensor = torch.zeros(
	        self.block_frame, device=device, dtype=torch.float32
	    )
	    self.sola_buffer: torch.Tensor = torch.zeros(
	        self.crossfade_frame, device=device, dtype=torch.float32
	    )
	    # Linear fade-in ramp and its complement for the crossfade.
	    self.fade_in_window: torch.Tensor = torch.linspace(
	        0.0, 1.0, steps=self.crossfade_frame, device=device, dtype=torch.float32
	    )
	    self.fade_out_window: torch.Tensor = 1 - self.fade_in_window
	    resampler = tat.Resample(
	        orig_freq=cfg.samplerate, new_freq=16000, dtype=torch.float32
	    )
	    self.resampler = resampler.to(device)
	    threading.Thread(target=self.soundinput).start()

	def soundinput(self):
	    """
	    Receive audio input.

	    Opens a duplex stream whose callback is ``audio_callback`` and keeps it
	    open, sleeping one block at a time, until ``flag_vc`` becomes false.
	    """
	    # One channel on macOS, two elsewhere.
	    channels = 2
	    if sys.platform == "darwin":
	        channels = 1
	    stream = sd.Stream(
	        channels=channels,
	        callback=self.audio_callback,
	        blocksize=self.block_frame,
	        samplerate=self.config.samplerate,
	        dtype="float32",
	    )
	    with stream:
	        while self.flag_vc:
	            time.sleep(self.config.block_time)
	            print("Audio block passed.")
	    print("ENDing VC")

	def audio_callback(
	    self, indata: np.ndarray, outdata: np.ndarray, frames, times, status
	):
	    """
	    Audio processing.

	    Stream callback: gates and buffers the incoming block, runs RVC
	    inference on the rolling input buffer, aligns the new output with the
	    previous block (SOLA), crossfades, and writes the result to ``outdata``.
	    ``frames``, ``times`` and ``status`` are not used.
	    """
	    start_time = time.perf_counter()
	    indata = librosa.to_mono(indata.T)
	    if self.config.I_noise_reduce:
	        indata[:] = nr.reduce_noise(y=indata, sr=self.config.samplerate)

	    # noise gate: zero every hop whose RMS level is below the threshold
	    frame_length = 2048
	    hop_length = 1024
	    rms = librosa.feature.rms(
	        y=indata, frame_length=frame_length, hop_length=hop_length
	    )
	    if self.config.threhold > -60:
	        db_threhold = (
	            librosa.amplitude_to_db(rms, ref=1.0)[0] < self.config.threhold
	        )
	        for i in np.flatnonzero(db_threhold):
	            indata[i * hop_length : (i + 1) * hop_length] = 0

	    # Slide the rolling input buffer forward by one block.
	    self.input_wav[:] = np.concatenate(
	        (self.input_wav[self.block_frame :], indata)
	    )

	    # infer
	    inp = torch.from_numpy(self.input_wav).to(device)
	    res1 = self.resampler(inp)
	    tail_frames = (
	        self.crossfade_frame + self.sola_search_frame + self.block_frame
	    )
	    context_frames = (
	        self.extra_frame
	        + self.crossfade_frame
	        + self.sola_search_frame
	        + self.block_frame
	    )
	    rate1 = self.block_frame / context_frames
	    rate2 = tail_frames / context_frames
	    res2 = self.rvc.infer(
	        res1,
	        res1[-self.block_frame :].cpu().numpy(),
	        rate1,
	        rate2,
	        self.pitch,
	        self.pitchf,
	        self.config.f0method,
	    )
	    self.output_wav_cache[-res2.shape[0] :] = res2
	    infer_wav = self.output_wav_cache[-tail_frames:]

	    # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
	    # Normalised cross-correlation between the head of the new audio and
	    # the tail kept from the previous block; its peak is the offset.
	    search_span = infer_wav[
	        None, None, : self.crossfade_frame + self.sola_search_frame
	    ]
	    cor_nom = F.conv1d(search_span, self.sola_buffer[None, None, :])
	    cor_den = torch.sqrt(
	        F.conv1d(
	            search_span**2,
	            torch.ones(1, 1, self.crossfade_frame, device=device),
	        )
	        + 1e-8
	    )
	    if sys.platform == "darwin":
	        cor_nom = cor_nom.cpu()
	        cor_den = cor_den.cpu()
	    sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
	    print("sola offset: " + str(int(sola_offset)))
	    self.output_wav[:] = infer_wav[sola_offset : sola_offset + self.block_frame]
	    self.output_wav[: self.crossfade_frame] *= self.fade_in_window
	    self.output_wav[: self.crossfade_frame] += self.sola_buffer[:]

	    # crossfade: keep a faded-out tail for the next block
	    if sola_offset < self.sola_search_frame:
	        tail_start = -self.sola_search_frame - self.crossfade_frame + sola_offset
	        tail_stop = -self.sola_search_frame + sola_offset
	        next_tail = infer_wav[tail_start:tail_stop]
	    else:
	        next_tail = infer_wav[-self.crossfade_frame :]
	    self.sola_buffer[:] = next_tail * self.fade_out_window

	    # Write the block: one column on macOS, two identical columns elsewhere.
	    mono = self.output_wav[:].cpu().numpy()
	    if self.config.O_noise_reduce:
	        mono = nr.reduce_noise(y=mono, sr=self.config.samplerate)
	    if sys.platform == "darwin":
	        outdata[:] = mono[:, np.newaxis]
	    else:
	        outdata[:] = np.tile(mono, (2, 1)).T

	    total_time = time.perf_counter() - start_time
	    self.window["infer_time"].update(int(total_time * 1000))
	    print("infer time:" + str(total_time))

	def get_devices(self, update: bool = True):
	    """Get the device list.

	    Args:
	        update: when true, sounddevice is re-initialised first through its
	            private ``_terminate``/``_initialize`` functions.

	    Returns:
	        tuple: ``(input_labels, output_labels, input_ids, output_ids)``.
	        Labels are ``"<name> (<host API name>)"``; an id is the device's
	        ``"index"`` entry when present, otherwise its name.
	    """
	    if update:
	        sd._terminate()
	        sd._initialize()
	    devices = sd.query_devices()
	    # Tag every device with the name of the host API that owns it.
	    for hostapi in sd.query_hostapis():
	        for device_idx in hostapi["devices"]:
	            devices[device_idx]["hostapi_name"] = hostapi["name"]

	    def label(d):
	        return f"{d['name']} ({d['hostapi_name']})"

	    def ident(d):
	        return d["index"] if "index" in d else d["name"]

	    capture = [d for d in devices if d["max_input_channels"] > 0]
	    playback = [d for d in devices if d["max_output_channels"] > 0]
	    return (
	        [label(d) for d in capture],
	        [label(d) for d in playback],
	        [ident(d) for d in capture],
	        [ident(d) for d in playback],
	    )

	def set_devices(self, input_device, output_device):
	    """Set the default input and output devices.

	    Both arguments are labels as produced by ``get_devices``. Each is
	    mapped to its device id, which is stored in ``sd.default.device``.

	    Raises:
	        ValueError: if a label is not in the current device list.
	    """
	    in_labels, out_labels, in_ids, out_ids = self.get_devices()
	    sd.default.device[0] = in_ids[in_labels.index(input_device)]
	    sd.default.device[1] = out_ids[out_labels.index(output_device)]
	    print(f"input device:{sd.default.device[0]}:{input_device}")
	    print(f"output device:{sd.default.device[1]}:{output_device}")

	# Start the application: GUI.__init__ builds the window and runs its event loop.
	gui = GUI()