| ''' |
| pip install datasets soundfile huggingface_hub librosa |
| |
| |
| from datasets import load_dataset |
| import soundfile as sf |
| import os |
| from collections import defaultdict |
| import io |
| |
| def download_voices_with_dynamic_counting(output_folder='genshin_voices_sample_5', max_files_per_speaker=5): |
| """动态统计并下载所有speaker的音频和转录文件(兼容bytes/path格式)""" |
| # 加载数据集(流式模式) |
| dataset = load_dataset('simon3000/genshin-voice', split='train', streaming=True) |
| |
| # 过滤条件:中文、有转录、类型为对话 |
| filtered_data = dataset.filter( |
| lambda x: ( |
| x['language'] == 'Chinese' and |
| x['transcription'] != '' and |
| x['type'] == 'Dialog' |
| ) |
| ) |
| |
| # 动态统计speaker计数和文件下载 |
| speaker_counts = defaultdict(int) |
| speaker_file_indices = defaultdict(int) |
| |
| os.makedirs(output_folder, exist_ok=True) |
| |
| for voice in filtered_data: |
| speaker = voice['speaker'] |
| |
| # 如果该speaker已下载足够文件,跳过 |
| if speaker_counts[speaker] >= max_files_per_speaker: |
| continue |
| |
| # 更新speaker计数 |
| speaker_counts[speaker] += 1 |
| file_num = str(speaker_file_indices[speaker] + 1).zfill(5) # 从00001开始 |
| |
| # 创建speaker子文件夹 |
| speaker_folder = os.path.join(output_folder, speaker) |
| os.makedirs(speaker_folder, exist_ok=True) |
| |
| # 构建文件路径 |
| audio_path = os.path.join(speaker_folder, f'{speaker}_{file_num}.wav') |
| transcription_path = os.path.join(speaker_folder, f'{speaker}_{file_num}.txt') |
| |
| # 处理音频数据(兼容bytes或path格式) |
| audio_data = voice['audio'] |
| try: |
| if 'bytes' in audio_data and audio_data['bytes'] is not None: |
| # 从bytes直接读取音频 |
| with io.BytesIO(audio_data['bytes']) as audio_bytes: |
| data, samplerate = sf.read(audio_bytes) |
| sf.write(audio_path, data, samplerate) |
| |
| #elif 'path' in audio_data and os.path.exists(audio_data['path']): |
| # 如果提供path且文件存在,直接复制 |
| #data, samplerate = sf.read(audio_data['path']) |
| #sf.write(audio_path, data, samplerate) |
| |
| else: |
| print(f"警告: {speaker}的音频数据格式不支持,跳过") |
| speaker_counts[speaker] -= 1 # 回滚计数 |
| continue |
| except Exception as e: |
| print(f"处理{speaker}的音频时出错: {str(e)}") |
| speaker_counts[speaker] -= 1 |
| continue |
| |
| # 保存转录文件 |
| with open(transcription_path, 'w', encoding='utf-8') as f: |
| f.write(voice['transcription']) |
| |
| speaker_file_indices[speaker] += 1 |
| print( |
| f"[下载进度] {speaker}_{file_num} | " |
| f"进度: {speaker_counts[speaker]}/{max_files_per_speaker}" |
| ) |
| |
| # 打印最终统计 |
| print("\n=== 下载结果 ===") |
| for speaker, count in speaker_counts.items(): |
| print(f"{speaker}: {count}个文件") |
| |
| if __name__ == '__main__': |
| download_voices_with_dynamic_counting() |
| |
| |
| from gradio_client import Client, handle_file |
| |
| client = Client("http://localhost:7860") |
| result = client.predict( |
| prompt=handle_file('genshin_voices_sample_5/Ahangar/Ahangar_00001.wav'), |
| text="偷窃者没有好下场", |
| api_name="/gen_single" |
| ) |
| print(result) |
| from shutil import copy2 |
| copy2(result["value"], result["value"].split("/")[-1]) |
| |
| ''' |
|
|
| import spaces |
| import os |
| import shutil |
| import threading |
| import time |
| import sys |
|
|
| from huggingface_hub import snapshot_download |
|
|
# Make the repo root and the bundled `indextts` package importable regardless
# of the working directory the script is launched from.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
sys.path.append(os.path.join(current_dir, "indextts"))


# These imports must come AFTER the sys.path mutation above, since `indextts`
# and `tools` are resolved relative to this file's directory.
import gradio as gr
from indextts.infer import IndexTTS
from tools.i18n.i18n import I18nAuto


# UI strings are localized to Simplified Chinese.
i18n = I18nAuto(language="zh_CN")
MODE = 'local'
# Fetch the model weights into ./checkpoints (cached on subsequent runs),
# then load the TTS model once at import time; `infer()` below reuses it.
snapshot_download("IndexTeam/IndexTTS-1.5",local_dir="checkpoints",)
tts = IndexTTS(model_dir="checkpoints", cfg_path="checkpoints/config.yaml")


# Output and prompt directories used by infer() and the Gradio UI.
os.makedirs("outputs/tasks",exist_ok=True)
os.makedirs("prompts",exist_ok=True)
|
|
@spaces.GPU
def infer(voice, text, output_path=None):
    """Synthesize *text* spoken in the voice of the reference audio *voice*.

    Args:
        voice: Filesystem path to the reference (prompt) audio clip.
        text: Target text to synthesize.
        output_path: Destination wav path; when omitted, a timestamped file
            under ``outputs/`` is used. NOTE: two calls within the same
            second share a default name — callers wanting uniqueness should
            pass an explicit path.

    Returns:
        The path of the written wav file.

    Raises:
        RuntimeError: If the module-level TTS model failed to load.
    """
    if not tts:
        # Model is created at import time; guard against a failed init.
        raise RuntimeError("Model not loaded")
    if not output_path:
        output_path = os.path.join("outputs", f"spk_{int(time.time())}.wav")
    # Ensure the target directory exists for caller-supplied paths too
    # (module setup only pre-creates "outputs/").
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    tts.infer(voice, text, output_path)
    return output_path
|
|
def tts_api(voice, text):
    """HTTP-style wrapper around :func:`infer`.

    Returns an ``(status, headers, body)`` tuple: ``(200, {}, wav_bytes)``
    on success, or ``(500, {"error": message}, None)`` on any failure.
    """
    try:
        wav_path = infer(voice, text)
        with open(wav_path, "rb") as wav_file:
            payload = wav_file.read()
    except Exception as exc:
        # API boundary: report every failure as a 500 instead of raising.
        return (500, {"error": str(exc)}, None)
    return (200, {}, payload)
|
|
def gen_single(prompt, text):
    """Run TTS for the single-sentence tab and reveal the result player."""
    wav_path = infer(prompt, text)
    return gr.update(value=wav_path, visible=True)
|
|
def update_prompt_audio():
    """Return a Gradio update that re-enables the generate button."""
    return gr.update(interactive=True)
|
|
with gr.Blocks() as demo:
    # NOTE(review): removed dead code — an unused threading.Lock, an unused
    # prompts listing (prompt_list/default), and a duplicate
    # os.makedirs("prompts") already done at module level.
    # Page header: title plus a badge linking to the IndexTTS paper.
    gr.HTML('''
    <h2><center>IndexTTS: An Industrial-Level Controllable and Efficient Zero-Shot Text-To-Speech System</h2>

<p align="center">
<a href='https://arxiv.org/abs/2502.05512'><img src='https://img.shields.io/badge/ArXiv-2502.05512-red'></a>
    ''')
    with gr.Tab("音频生成"):
        with gr.Row():
            # Reference (prompt) audio: file upload or microphone recording;
            # "filepath" type hands infer() a path rather than raw samples.
            prompt_audio = gr.Audio(label="请上传参考音频",key="prompt_audio",
                                    sources=["upload","microphone"],type="filepath")
            input_text_single = gr.Textbox(label="请输入目标文本",key="input_text_single")
            gen_button = gr.Button("生成语音",key="gen_button",interactive=True)
            # Hidden until the first generation succeeds (gen_single flips it visible).
            output_audio = gr.Audio(label="生成结果", visible=False,key="output_audio")


    # Re-enable the generate button whenever a new prompt audio is uploaded.
    prompt_audio.upload(update_prompt_audio,
                        inputs=[],
                        outputs=[gen_button])


    # Run TTS on click and show the result player.
    gen_button.click(gen_single,
                     inputs=[prompt_audio, input_text_single],
                     outputs=[output_audio])
|
|
| |
| |
| |
| |
|
|
| |
|
|
def main():
    """Finish model setup and serve the Gradio UI on all interfaces."""
    tts.load_normalizer()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)


if __name__ == "__main__":
    main()
|
|
|
|