import os

import gradio as gr
import librosa
import numpy as np
import pyopenjtalk
import torch
from espnet2.bin.svs_inference import SingingGenerate
from espnet_model_zoo.downloader import ModelDownloader

from util import preprocess_input, postprocess_phn, get_tokenizer, load_pitch_dict, get_pinyin
|
|
|
|
# Singer table shared by both language variants of the multilingual model.
# Each value is the path of a .npy file that gen_song loads with np.load and
# passes to the model as `spembs`.
_MULTILINGUAL_SINGER_FILES = {
    "singer1 (male)": "resource/singer/singer_embedding_ace-1.npy",
    "singer2 (female)": "resource/singer/singer_embedding_ace-2.npy",
    "singer3 (male)": "resource/singer/singer_embedding_ace-3.npy",
    "singer4 (female)": "resource/singer/singer_embedding_ace-8.npy",
    "singer5 (male)": "resource/singer/singer_embedding_ace-7.npy",
    "singer6 (female)": "resource/singer/singer_embedding_itako.npy",
    "singer7 (male)": "resource/singer/singer_embedding_ofuton.npy",
    "singer8 (female)": "resource/singer/singer_embedding_kising_orange.npy",
    "singer9 (male)": "resource/singer/singer_embedding_m4singer_Tenor-1.npy",
    "singer10 (female)": "resource/singer/singer_embedding_m4singer_Alto-4.npy",
}

# Singer choices per "<model>-<language>" option, keyed by the label shown in
# the UI. For Model① the value is an integer id that gen_song passes as
# `sids`; for Model② it is an embedding file path (see above).
singer_embeddings = {
    "Model①(Chinese)-zh": {
        "singer1 (male)": 1,
        "singer2 (female)": 12,
        "singer3 (male)": 23,
        "singer4 (female)": 29,
        "singer5 (male)": 18,
        "singer6 (female)": 8,
        "singer7 (male)": 25,
        "singer8 (female)": 5,
        "singer9 (male)": 10,
        "singer10 (female)": 15,
    },
    # Separate copies so mutating one language's table never affects the other.
    "Model②(Multilingual)-zh": dict(_MULTILINGUAL_SINGER_FILES),
    "Model②(Multilingual)-jp": dict(_MULTILINGUAL_SINGER_FILES),
}
|
|
# Maps each "<model>-<language>" UI choice to the pretrained model identifier
# that gen_song hands to ModelDownloader.download_and_unpack. Both Model②
# entries point at the same checkpoint; only the language suffix differs.
model_dict = {
    "Model①(Chinese)-zh": "espnet/aceopencpop_svs_visinger2_40singer_pretrain",
    "Model②(Multilingual)-zh": "espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
    "Model②(Multilingual)-jp": "espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
}
|
|
# Singer labels offered in the "Singer" dropdown, taken from one model's table
# (every table in singer_embeddings uses the same ten labels).
total_singers = list(singer_embeddings["Model②(Multilingual)-zh"])
|
|
# Language suffix of the model choice -> integer id that gen_song passes to
# the multilingual model as `lids`.
langs = {
    "zh": 2,
    "jp": 1,
}
|
|
# Pseudo-MOS predictor used to score generated audio, fetched once at import
# time through torch.hub.
# NOTE(review): trust_repo=True executes the hub repository's code without
# prompting; the tag is pinned to v0.2.0, keep it pinned.
predictor = torch.hub.load("South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True)

# Process-wide cache of the synthesis model, managed by gen_song:
# `exist_model` identifies what is currently held in `svs` ("Null" until the
# first load) and `svs` is the loaded SingingGenerate instance.
exist_model = "Null"
svs = None
|
|
def gen_song(model_name, spk, texts, durs, pitchs):
    """Synthesize singing from a lyric/duration/pitch score and rate it.

    Args:
        model_name: A key of ``model_dict`` ("<model>-<lang>"). The part after
            the last "-" selects the language ("zh" or "jp").
        spk: Singer label, a key of ``singer_embeddings[model_name]``. When
            empty or None the first singer of the model is used, because the
            UI presents the singer choice as optional.
        texts: Lyrics as a string; tokens may include "AP", "SP" and the slur
            markers "-" / "——", which repeat the previous phoneme.
        durs: Whitespace-separated note lengths, one per lyric token, each
            parsed with ``float``.
        pitchs: Whitespace-separated pitches, one per lyric token; each must
            be a key of the table returned by ``load_pitch_dict()``.

    Returns:
        A 3-tuple ``((fs, waveform), status, score)`` matching the three
        outputs wired to the "Generate" button. On success ``status`` is
        "success!" and ``score`` is the predictor's output rounded to two
        decimals. On invalid input the waveform is a single zero sample,
        ``status`` carries the error message and ``score`` is None.
    """
    global exist_model, svs

    fs = 44100
    tempo = 120

    def fail(message):
        # Always return three values: Gradio rejects a handler result whose
        # length differs from the number of declared outputs.
        return (fs, np.array([0.0])), message, None

    # Empty textboxes arrive as "" rather than None, so test for blank too.
    if texts is None or not texts.strip():
        return fail("Error: No Text provided!")
    if durs is None or not durs.strip():
        return fail("Error: No Dur provided!")
    if pitchs is None or not pitchs.strip():
        return fail("Error: No Pitch provided!")
    if model_name not in model_dict:
        return fail("Error: No Model provided!")

    lang = model_name.split("-")[-1]
    pretrain_model = model_dict[model_name]

    singers = singer_embeddings[model_name]
    if not spk:
        spk = next(iter(singers))
    elif spk not in singers:
        return fail(f"Error: singer `{spk}` is invalid!")

    # Split the three inputs into per-note token lists.
    if lang == "zh":
        texts = preprocess_input(texts, "")
        text_list = get_pinyin(texts)
    elif lang == "jp":
        texts = preprocess_input(texts, " ")
        text_list = texts.strip().split()
    else:
        return fail(f"Error: language `{lang}` is not supported!")
    dur_list = preprocess_input(durs, " ").strip().split()
    pitch_list = preprocess_input(pitchs, " ").strip().split()

    if len(text_list) != len(dur_list):
        return fail(f"Error: len in text({len(text_list)}) mismatch with duration({len(dur_list)})!")
    if len(text_list) != len(pitch_list):
        return fail(f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!")

    # Convert every lyric token into an underscore-joined phoneme string.
    tokenizer = get_tokenizer(model_name, lang)
    sybs = []
    for text in text_list:
        if text in ("AP", "SP", "-", "——"):
            # Breath, silence and slur markers bypass the tokenizer.
            rev = [text]
        else:
            rev = tokenizer(text)
            # The tokenizer signals an unknown token by returning False.
            if rev == False:  # noqa: E712
                return fail(f"Error: text `{text}` is invalid!")
        rev = postprocess_phn(rev, model_name, lang)
        sybs.append("_".join(rev))

    pitch_dict = load_pitch_dict()

    # Build the note list (start, end, lyric, pitch, phonemes) and the flat
    # phoneme sequence the model consumes.
    labels = []
    notes = []
    st = 0
    pre_phn = ""
    for phns, dur_text, pitch in zip(sybs, dur_list, pitch_list):
        if phns in ("-", "——"):
            # A slur sustains the last phoneme of the previous note.
            phns = pre_phn
        if pitch not in pitch_dict:
            return fail(f"Error: pitch `{pitch}` is invalid!")
        pitch = pitch_dict[pitch]
        try:
            dur = float(dur_text)
        except ValueError:
            return fail(f"Error: duration `{dur_text}` is invalid!")
        if not np.isfinite(dur) or dur < 0:
            return fail(f"Error: duration `{dur_text}` is invalid!")
        phn_list = phns.split("_")
        lyric = "".join(phn_list)
        notes.append([st, st + dur, lyric, pitch, phns])
        st += dur
        labels.extend(phn_list)
        pre_phn = labels[-1]

    batch = {
        "score": (
            int(tempo),
            notes,
        ),
        "text": " ".join(labels),
    }
    print(batch)

    # Load the synthesis model only when the requested checkpoint differs from
    # the cached one. Keying on the checkpoint id (not the UI label) avoids
    # reloading when switching between the -zh and -jp variants of one model.
    if svs is None or exist_model != pretrain_model:
        device = "cpu"
        d = ModelDownloader()
        pretrain_downloaded = d.download_and_unpack(pretrain_model)
        svs = SingingGenerate(
            train_config=pretrain_downloaded["train_config"],
            model_file=pretrain_downloaded["model_file"],
            device=device,
        )
        exist_model = pretrain_model

    if model_name == "Model①(Chinese)-zh":
        # This model selects the singer by integer id.
        sid = np.array([singers[spk]])
        output_dict = svs(batch, sids=sid)
    else:
        # The multilingual model takes a language id plus a singer embedding.
        lid = np.array([langs[lang]])
        spk_embed = np.load(singers[spk])
        output_dict = svs(batch, lids=lid, spembs=spk_embed)
    wav_info = output_dict["wav"].cpu().numpy()

    # Score the result with the pseudo-MOS predictor on a 16 kHz copy.
    wav_mos = librosa.resample(wav_info, orig_sr=fs, target_sr=16000)
    wav_mos = torch.from_numpy(wav_mos).unsqueeze(0)
    len_mos = torch.tensor([wav_mos.shape[1]])
    with torch.no_grad():
        score = predictor(wav_mos, len_mos)
    return (fs, wav_info), "success!", round(score.item(), 2)
|
|
|
|
| |
# Preset rows for the "Examples" table. Each row supplies, in order:
# model-language choice, singer label, lyrics, durations, pitches -- the same
# order as the `inputs` list given to gr.Examples. Pitches appear both as
# numbers ("60") and as note names ("C4", "rest").
examples = [
    [
        "Model①(Chinese)-zh",
        "singer1 (male)",
        "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP",
        "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34",
        "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0",
    ],
    [
        "Model①(Chinese)-zh",
        "singer3 (male)",
        "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP",
        "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34",
        "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest",
    ],
    [
        "Model①(Chinese)-zh",
        "singer3 (male)",
        "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP",
        "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34",
        "C#4 D#4 D#4 D#4 rest D#4 B3 rest\nB3 B3 rest B3 B3 E4 rest",
    ],
    [
        "Model①(Chinese)-zh",
        "singer3 (male)",
        "雨 淋 湿 了 SP 大 地 AP\n毁 的 SP 很 讲 究 AP",
        "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34",
        "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest",
    ],
    [
        "Model②(Multilingual)-zh",
        "singer3 (male)",
        "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP",
        "0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44",
        "63 63 63 63 0 63\n62 62 62 63 65 63 62 0",
    ],
    [
        "Model②(Multilingual)-zh",
        "singer3 (male)",
        "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP",
        "0.23 0.66 0.58 0.27 0.3 0.97\n0.48 0.36 0.69 0.3 0.53 0.56 1.27 0.89",
        "63 63 63 63 0 63\n62 62 62 63 65 63 62 0",
    ],
    [
        "Model①(Chinese)-zh",
        "singer3 (male)",
        "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP\n你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP",
        "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34\n0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44",
        "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0\n63 63 63 63 0 63\n62 62 62 63 65 63 62 0",
    ],
    [
        "Model①(Chinese)-zh",
        "singer3 (male)",
        "修 炼 爱 情 的 心 酸 SP AP",
        "0.42 0.21 0.19 0.28 0.22 0.33 1.53 0.1 0.29",
        "68 70 68 66 63 68 68 0 0",
    ],
    [
        "Model①(Chinese)-zh",
        "singer3 (male)",
        "学 会 放 好 以 前 的 渴 望 SP AP",
        "0.3 0.22 0.29 0.27 0.25 0.44 0.54 0.29 1.03 0.08 0.39",
        "68 70 68 66 61 68 68 65 66 0 0",
    ],
    [
        "Model①(Chinese)-zh",
        "singer3 (male)",
        "SP 我 不 - 是 一 定 要 你 回 - 来 SP",
        "0.37 0.45 0.47 0.17 0.52 0.28 0.46 0.31 0.44 0.45 0.2 2.54 0.19",
        "0 51 60 61 59 59 57 57 59 60 61 59 0",
    ],
    [
        "Model①(Chinese)-zh",
        "singer4 (female)",
        "AP 我 多 想 再 见 你\n哪 怕 匆 - 匆 一 AP 眼 就 别 离 AP",
        "0.13 0.24 0.68 0.78 0.86 0.4 0.94 0.54 0.3 0.56 0.16 0.86 0.26 0.22 0.28 0.78 0.68 1.5 0.32",
        "0 57 66 63 63 63 63 60 61 61 63 66 66 0 61 61 59 58 0",
    ],
    [
        "Model②(Multilingual)-jp",
        "singer8 (female)",
        "い じ ん さ ん に つ れ ら れ て",
        "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23",
        "60 60 60 56 56 56 55 55 55 53 56",
    ],
    [
        "Model②(Multilingual)-jp",
        "singer8 (female)",
        "い じ ん さ ん に つ れ ら れ て",
        "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23",
        "62 62 62 58 58 58 57 57 57 55 58",
    ],
    [
        "Model②(Multilingual)-jp",
        "singer8 (female)",
        "い じ ん さ ん に つ れ ら れ て",
        "1.2 0.6 0.6 0.6 0.6 1.2 1.2 0.6 0.6 1.2 0.45",
        "60 60 60 56 56 56 55 55 55 53 56",
    ],
    [
        "Model②(Multilingual)-jp",
        "singer8 (female)",
        "い じ ん さ ん に つ れ ら れ て",
        "0.3 0.15 0.15 0.15 0.15 0.3 0.3 0.15 0.15 0.3 0.11",
        "60 60 60 56 56 56 55 55 55 53 56",
    ],
    [
        "Model②(Multilingual)-jp",
        "singer8 (female)",
        "きっ と と べ ば そ ら ま で と ど く AP",
        "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08",
        "64 71 68 69 71 71 69 68 66 68 69 68 0",
    ],
    [
        "Model②(Multilingual)-jp",
        "singer8 (female)",
        "じゃ の め で お む か え う れ し い な",
        "0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.65",
        "60 60 60 62 64 67 69 69 64 64 64 62 60",
    ],
    [
        "Model②(Multilingual)-jp",
        "singer10 (female)",
        "お と め わ ら い か ふぁ い や ら い か ん な い す ぶ ろ うぃ ん ぶ ろ うぃ ん い ん ざ うぃ ん",
        "0.15 0.15 0.15 0.15 0.3 0.15 0.3 0.15 0.15 0.3 0.07 0.07 0.15 0.15 0.15 0.15 0.15 0.15 0.45 0.07 0.07 0.07 0.38 0.07 0.07 0.15 0.15 0.3 0.15 0.15",
        "67 67 67 67 67 67 69 67 67 69 67 67 64 64 64 64 64 64 62 64 64 62 62 64 64 62 62 59 59 59",
    ],
]
|
|
# Gradio page: usage notes, model/singer selectors, score inputs, generated
# audio with status and pseudo-MOS, preset examples, and reference links.
# The Markdown bodies are kept flush-left so no HTML line is indented inside
# the rendered document.
with gr.Blocks() as demo:
    gr.Markdown(
        """
<h1 align="center"> Demo of Singing Voice Synthesis in Muskits-ESPnet </h1>

<div style="font-size: 20px;">
This is the demo page of our toolkit <a href="https://arxiv.org/abs/2409.07226"><b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b></a>.

Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.

Music score usually includes lyrics, as well as duration and pitch of each word in lyrics.

<h2>How to use:</h2>
<ol>
<li><b>Choose Model-Language</b>:
<ul>
<li>Choose "zh" for Chinese lyrics input or "jp" for Japanese lyrics input.</li>
<li>For example, "Model②(Multilingual)-zh" means model "Model②(Multilingual)" with lyrics input in Chinese.</li>
</ul>
</li>
<li><b>[Optional] Choose Singer</b>: Choose a singer from the drop-down menu.</li>
<li><b>Input lyrics</b>:
<ul>
<li>Input Chinese characters for "zh" and hiragana for "jp".</li>
<li>You may include special symbols: 'AP' for breath, 'SP' for silence, and '-' for slur (Chinese lyrics only).</li>
<li>Separate each lyric by either a space (' ') or a newline ('\\n') (no quotation marks needed).</li>
</ul>
</li>
<li><b>Input durations</b>:
<ul>
<li>Input durations as float numbers.</li>
<li>The durations sequence should <b>match the lyric sequence in length</b>, with each duration aligned to a lyric.</li>
<li>Separate each duration by a space (' ') or a newline ('\\n') (no quotation marks needed).</li>
</ul>
</li>
<li><b>Input pitches</b>:
<ul>
<li>Input MIDI note names or MIDI note numbers (e.g., MIDI note name "A4" corresponds to MIDI note number "69", and others follow accordingly).</li>
<li>The pitch sequence should <b>match the lyric sequence in length</b>, with each pitch corresponding to a lyric.</li>
<li>Separate each pitch by a space (' ') or a newline ('\\n') (no quotation marks needed).</li>
</ul>
</li>
<li><b>Hit "Generate" and listen</b>:
<ul>
<li>"Running Status" shows the status of singing generation. If any error exists, it will show the error information.</li>
<li>"Pseudo MOS" represents predicted mean opinion score for the generated song.</li>
</ul>
</li>
</ol>
</div>

<h2>Notice:</h2>
<ul>
<li> Plenty of examples are provided. </li>
<li> Extreme values may result in suboptimal generation quality! </li>
</ul>
"""
    )

    # Row 1: model/language choice next to the singer choice.
    with gr.Row():
        with gr.Column(variant="panel"):
            model_name = gr.Radio(
                label="Model-Language",
                choices=[
                    "Model①(Chinese)-zh",
                    "Model②(Multilingual)-zh",
                    "Model②(Multilingual)-jp",
                ],
            )

        with gr.Column(variant="panel"):
            singer = gr.Dropdown(
                label="Singer",
                choices=total_singers,
            )

    # Row 2: score inputs on the left, generation results on the right.
    with gr.Row():
        with gr.Column(variant="panel"):
            lyrics = gr.Textbox(label="Lyrics")
            duration = gr.Textbox(label="Duration")
            pitch = gr.Textbox(label="Pitch")
            generate = gr.Button("Generate")
        with gr.Column(variant="panel"):
            gened_song = gr.Audio(label="Generated Song", type="numpy")
            run_status = gr.Textbox(label="Running Status")
            pred_mos = gr.Textbox(label=" Pseudo MOS")

    # Clicking an example row fills the five input components.
    gr.Examples(
        examples=examples,
        inputs=[model_name, singer, lyrics, duration, pitch],
        outputs=[singer],
        label="Examples",
        examples_per_page=20,
    )

    gr.Markdown(
        """
<div style='margin:20px auto;'>

<p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
<a href="https://github.com/espnet/espnet">espnet</a> |
<a href="https://huggingface.co/espnet/aceopencpop_svs_visinger2_40singer_pretrain">Model①(Chinese)</a> |
<a href="https://huggingface.co/espnet/mixdata_svs_visinger2_spkembed_lang_pretrained">Model②(Multilingual)</a> |
<a href="https://github.com/South-Twilight/SingMOS">SingMOS</a></p>

</div>
"""
    )

    # gen_song's three return values map onto these three outputs in order.
    generate.click(
        fn=gen_song,
        inputs=[model_name, singer, lyrics, duration, pitch],
        outputs=[gened_song, run_status, pred_mos],
    )

demo.launch()
|
|