yujuanqin
/

TestTranslator

Model card Files Files and versions

TestTranslator / scripts /wenet_utils.py

yujuanqin's picture

add asr test

db0d138 4 months ago

history blame contribute delete

2.57 kB

	import json
	from pathlib import Path
	from lib.utils import cmd
	from environment import TEST_DATA

	def read_json(
	    source='/Users/jeqin/work/code/TestTranslator/test_data/wenet/WenetSpeech.json',
	    target='/Users/jeqin/work/code/TestTranslator/test_data/wenet/wenetWenetSpeech_SMALL.json',
	    subset="S",
	):
	    """Extract one subset of a WenetSpeech manifest into its own JSON file.

	    Loads ``source``, keeps every entry of its ``"audios"`` list that has at
	    least one segment whose ``"subsets"`` value contains ``subset``, and
	    writes ``{"audios": [...]}`` to ``target`` with 4-space indentation.
	    Each kept entry is copied whole (all of its segments, not only the
	    matching ones) and appears exactly once, in manifest order.

	    Args:
	        source: Path of the full manifest to read.
	        target: Path of the filtered manifest to write; overwritten if it
	            already exists.
	        subset: Subset tag to select, e.g. ``"S"``, ``"DEV"``,
	            ``"TEST_NET"`` or ``"TEST_MEETING"``.
	    """
	    with open(source, 'r', encoding="utf-8") as f:
	        data = json.load(f)

	    # any() stops at the first matching segment, so an audio is kept once
	    # no matter how many of its segments carry the tag.
	    selected = {
	        "audios": [
	            audio
	            for audio in data["audios"]
	            if any(subset in seg["subsets"] for seg in audio["segments"])
	        ]
	    }

	    with open(target, 'w', encoding="utf-8") as f:
	        json.dump(selected, f, indent=4)

	def move_wenet(folder: Path = TEST_DATA / "wenet", json_file="WenetSpeech_TEST_NET.json", count_limit=None):
	    """Print the audio paths and a segment tally for a wenet manifest.

	    Loads ``folder / json_file``, prints how many entries its ``"audios"``
	    list holds, prints the ``"path"`` of each examined entry, and finally
	    prints the total number of segments over the examined entries that
	    have fewer than 100 segments. The step that copied each audio file
	    from the remote host is currently disabled (kept as comments inside
	    the loop). Nothing is returned.

	    Args:
	        folder: Directory containing the manifest.
	        json_file: Manifest file name inside ``folder``.
	        count_limit: Maximum number of audio entries to examine;
	            ``None`` (the default) examines all of them.
	    """
	    with open(folder / json_file, encoding="utf-8") as f:
	        data = json.load(f)
	    audios = data["audios"]
	    # Report the full manifest size before any limit is applied.
	    print(f"Total {len(audios)} samples in {json_file}")

	    if count_limit is not None:
	        # Clamp at 0 so a negative limit means "none" rather than
	        # "all but the last n", which is what a raw slice would do.
	        audios = audios[:max(count_limit, 0)]

	    segment_sum = 0
	    for a in audios:
	        print(a["path"])
	        segs = len(a["segments"])
	        # Only audios with fewer than 100 segments count toward the tally.
	        if segs < 100:
	            segment_sum += segs
	        # Disabled: fetch the audio file from the remote machine via scp.
	        # remote_path = f"/home/ubuntu/data_1/yujuan/dataset_untar/{a['path']}"
	        # local_dir = Path(f"/Users/jeqin/work/code/TestTranslator/test_data/wenet/{a['path']}").parent
	        # if not local_dir.exists():
	        # local_dir.mkdir(parents=True, exist_ok=True)
	        # command = f"scp ubuntu@192.168.110.49:{remote_path} {local_dir}/"
	        # cmd(command)
	    print("segments number:", segment_sum)

	# Script entry point: when executed directly (not imported), run
	# move_wenet() with its default folder and manifest.
	if __name__ == "__main__":
	    move_wenet()