| import os |
| import csv |
| import wave |
| import re |
| import json |
| from pathlib import Path |
| import subprocess |
| from subprocess import CompletedProcess |
|
|
def add_text_index(text_file='../test_data/recordings/text/test_asr_zh.txt',
                   output_json='../test_data/recordings/data.json'):
    """Build a ``{"<n>.wav": text}`` mapping from a text file and save it as JSON.

    Every line is stripped of surrounding whitespace. Lines that are then
    empty, or that start with ``#``, are skipped and do not consume an index.
    Each remaining line receives the next 1-based index, in file order, and is
    stored under the key ``"<index>.wav"``.

    Args:
        text_file: Path of the UTF-8 text file to read. Defaults to the
            location this function originally had hard-coded.
        output_json: Path of the JSON file to write; an existing file is
            overwritten. Defaults to the location this function originally
            had hard-coded.

    Returns:
        None. The result is only written to ``output_json``.
    """
    text_dict = {}
    with open(text_file, encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        # Filter before numbering so skipped lines leave no gaps in the indices.
        kept = (line for line in stripped if line and not line.startswith('#'))
        for index, line in enumerate(kept, start=1):
            text_dict[f"{index}.wav"] = line
    with open(output_json, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(text_dict, f, ensure_ascii=False, indent=2)
|
|
def get_lines_with_index(filepath):
    """Yield ``(index, text)`` string pairs for lines shaped like ``"12. text"``.

    The file is read as UTF-8 and each line is stripped before matching. A
    line qualifies when it starts with one or more digits followed by a dot;
    whitespace after the dot is dropped and the remainder of the line is the
    text. Lines that do not match are silently ignored.

    Args:
        filepath: Path of the text file to scan.

    Yields:
        Tuples ``(index, text)`` where ``index`` is the digit string (not
        converted to ``int``) and ``text`` is the rest of the line, possibly
        empty.
    """
    numbered = re.compile(r'^(\d+)\.\s*(.*)')
    with open(filepath, encoding='utf-8') as handle:
        for raw in handle:
            found = numbered.match(raw.strip())
            if found is None:
                continue
            yield found.group(1), found.group(2)
|
|
def get_wav_length(wav_path):
    """Return the duration of a WAV file in seconds, or 0 when it cannot be read.

    The duration is the frame count divided by the frame rate as reported by
    the ``wave`` module.

    Args:
        wav_path: Path of the WAV file to inspect.

    Returns:
        The duration as a float. Any exception raised while opening or
        reading the file is caught, reported on stdout, and turned into the
        integer 0, so callers never see an exception from this function.
    """
    try:
        with wave.open(wav_path, 'rb') as wav_file:
            return wav_file.getnframes() / float(wav_file.getframerate())
    except Exception as err:
        # Best-effort: a missing or malformed file is reported, not raised.
        print(f"Error reading {wav_path}: {err}")
    return 0
|
|
def write_csv(rows, output_csv):
    """Write a fixed three-column header followed by ``rows`` to a CSV file.

    The file is opened as UTF-8 and overwritten if it already exists.

    Args:
        rows: Iterable of row sequences, written in order after the header.
        output_csv: Path of the CSV file to create.
    """
    header = ['序号', '文本', '音频长度(秒)']
    # newline='' lets the csv module control line endings itself.
    with open(output_csv, 'w', newline='', encoding='utf-8') as out:
        sink = csv.writer(out)
        sink.writerow(header)
        for row in rows:
            sink.writerow(row)
|
|
def print_text_and_audio_length(
        text_file='../test_data/recordings/text/test_asr_zh_with_index.txt',
        audio_folder='../test_data/recordings',
        output_csv='csv/text_audio_length.csv'):
    """Export each indexed text line with the duration of its recording to CSV.

    For every ``(idx, text)`` pair produced by ``get_lines_with_index``, the
    file ``<audio_folder>/<idx>.wav`` is measured with ``get_wav_length`` and
    a row ``[idx, text, duration]`` is collected. The rows are then written
    with ``write_csv``. Despite the name, nothing is printed by this function
    itself.

    Args:
        text_file: Indexed text file to read. Defaults to the originally
            hard-coded path.
        audio_folder: Directory holding the ``<idx>.wav`` recordings.
            Defaults to the originally hard-coded path.
        output_csv: Destination CSV path. Its parent directory is created
            when missing. Defaults to the originally hard-coded path.
    """
    rows = []
    for idx, text in get_lines_with_index(text_file):
        audio_path = os.path.join(audio_folder, f"{idx}.wav")
        audio_length = get_wav_length(audio_path)
        # Round exactly once, under the None guard. The earlier code rounded
        # a second time without the guard, which would raise TypeError if a
        # length were ever None.
        if audio_length is not None:
            audio_length = round(audio_length, 2)
        rows.append([idx, text, audio_length])

    # open() does not create intermediate directories, so make sure the
    # target folder exists before writing.
    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    write_csv(rows, output_csv)
|
|
def get_text_distance(text1, text2):
    """Compare two texts after cleaning them with the project's ``lib.utils`` helpers.

    Both inputs are passed through ``clean_text_for_comparison_zh``; the
    cleaned strings are then given to ``run_textdistance`` and
    ``highlight_diff`` (the latter with ``spliter=""``).

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A 3-tuple: the two values returned by ``run_textdistance``
        (presumably a distance and a normalized distance; confirm against
        ``lib.utils``) followed by the result of ``highlight_diff``.
    """
    # Imported lazily so the rest of this module works without lib.utils.
    from lib.utils import run_textdistance, clean_text_for_comparison_zh, highlight_diff

    cleaned_first, cleaned_second = (
        clean_text_for_comparison_zh(raw) for raw in (text1, text2)
    )
    dist, norm_dist = run_textdistance(cleaned_first, cleaned_second)
    marked = highlight_diff(cleaned_first, cleaned_second, spliter="")
    return dist, norm_dist, marked
|
|
def get_origin_text_dict():
    """Return a dict mapping each index string to its text from the indexed text file.

    The pairs come from ``get_lines_with_index`` applied to a fixed path.

    Returns:
        ``{index: text}`` with both keys and values as strings.
    """
    source_path = '../test_data/recordings/text/test_asr_zh_with_index.txt'
    # dict() consumes the (index, text) pairs; a repeated index keeps the last text.
    return dict(get_lines_with_index(source_path))
|
|
|
|
| |
| |
|
|
if __name__ == "__main__":
    # Script entry point: build the JSON index of recording texts.
    add_text_index()
| |
| |