| import json |
| from pathlib import Path |
| from lib.utils import cmd |
| from environment import TEST_DATA |
|
|
def read_json(
    source_path='/Users/jeqin/work/code/TestTranslator/test_data/wenet/WenetSpeech.json',
    # NOTE(review): the "wenetWenetSpeech" file-name prefix looks like a typo;
    # kept unchanged so existing runs keep writing to the same location.
    output_path='/Users/jeqin/work/code/TestTranslator/test_data/wenet/wenetWenetSpeech_SMALL.json',
):
    """Write the audios that belong to the "S" subset to a separate JSON file.

    Loads the manifest at ``source_path``, selects every entry of its
    ``"audios"`` list that has at least one segment whose ``"subsets"``
    contains ``"S"``, and dumps ``{"audios": [...]}`` to ``output_path``
    with 4-space indentation.

    A selected audio is copied whole (all of its segments, not only the
    matching ones) and appears exactly once in the output, in manifest order.

    Args:
        source_path: Path of the manifest to read. It must hold a JSON object
            with an ``"audios"`` list whose items each have a ``"segments"``
            list, and each segment must have a ``"subsets"`` entry.
        output_path: Path of the JSON file to create or overwrite.

    Raises:
        OSError: If either file cannot be opened.
        KeyError: If an expected key is missing from the manifest.
    """
    with open(source_path, 'r', encoding="utf-8") as f:
        data = json.load(f)

    # any() stops at the first matching segment, so an audio with several
    # "S" segments is added once instead of once per matching segment.
    small = {
        "audios": [
            audio
            for audio in data["audios"]
            if any("S" in seg["subsets"] for seg in audio["segments"])
        ]
    }

    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(small, f, indent=4)
|
|
def move_wenet(folder: Path=TEST_DATA/"wenet", json_file="WenetSpeech_TEST_NET.json", count_limit=None):
    """Print segment statistics for a wenet JSON manifest.

    Loads ``folder/json_file``, prints how many entries its ``"audios"``
    list has, prints each audio's ``"path"``, and finally prints the total
    number of segments summed over the audios that have fewer than 100
    segments. Nothing is returned and no files are moved or modified.

    Args:
        folder: Directory that contains the manifest.
        json_file: File name of the manifest inside ``folder``.
        count_limit: Currently unused; kept so existing callers keep working.
    """
    with open(folder/json_file, encoding="utf-8") as f:
        data = json.load(f)
    audios = data["audios"]
    print(f"Total {len(audios)} samples in {json_file}")

    segment_sum = 0
    for a in audios:
        print(a["path"])
        segs = len(a["segments"])
        # Audios with 100 or more segments are left out of the total.
        if segs < 100:
            segment_sum += segs
    # NOTE(review): printed once as a summary after the loop — confirm this
    # matches the intended placement.
    print("segments number:", segment_sum)
| |
| |
| |
| |
| |
| |
|
|
# Script entry point: when run directly, print the segment statistics using
# move_wenet's default folder and manifest file.
if __name__ == "__main__":
    move_wenet()
| |