"""Benchmark pywhispercpp (whisper.cpp) transcription over several Chinese ASR datasets.

Each ``run_*`` entry point loads the model once, transcribes a collection of
audio files, prints per-file timings/transcripts, and writes the results to a
CSV or JSON file under ``csv/``.
"""
import csv
import json
import time
from pathlib import Path

from pywhispercpp.model import Model

from scripts.asr_utils import get_origin_text_dict, get_text_distance


def save_csv(file_path, rows):
    """Write ``rows`` (a list of row lists) to ``file_path`` as UTF-8 CSV.

    ``newline=""`` is required by the csv module to prevent extra blank rows
    on platforms that translate line endings (see csv.writer docs).
    """
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(rows)
    print(f"write csv to {file_path}")


def load_model():
    """Load the whisper.cpp model (large-v3-turbo, q5_0) and print load time."""
    models_dir = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models")
    whisper_model = 'large-v3-turbo-q5_0'
    t0 = time.time()
    model = Model(
        model=whisper_model,
        models_dir=models_dir,
        print_realtime=False,
        print_progress=False,
        print_timestamps=False,
        translate=False,
        # beam_search=1,
        temperature=0.,
        no_context=True,
    )
    print("load model time: ", time.time() - t0)
    return model


def _transcribe_timed(model, audio_path, **kwargs):
    """Transcribe ``audio_path`` and return ``(elapsed_seconds, joined_text)``.

    Prints the inference time and the transcript, matching the original
    per-file console output of every ``run_*`` function.
    """
    t1 = time.time()
    segments = model.transcribe(str(audio_path), **kwargs)
    elapsed = time.time() - t1
    print("inference time:", elapsed)
    text = " ".join(seg.text for seg in segments)
    print(text)
    return elapsed, text


def _save_json(file_path, result_list):
    """Dump ``result_list`` to ``file_path`` as pretty-printed UTF-8 JSON."""
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(result_list, f, ensure_ascii=False, indent=2)


def run_recordings():
    """Transcribe numbered ``recordings/*.wav`` files and score them against
    the reference transcripts from ``get_origin_text_dict``."""
    model = load_model()
    audios = Path("../test_data/recordings/")
    # Header fixed to match the 6-column data rows appended below
    # (the original header only listed the first 3 columns).
    rows = [["file_name", "time", "inference_result", "distance", "norm_distance", "diff"]]
    original = get_origin_text_dict()
    # File stems are numeric; sort numerically rather than lexically.
    for audio in sorted(audios.glob("*.wav"), key=lambda x: int(x.stem)):
        print(audio)
        t, text = _transcribe_timed(
            model, audio, language="zh",
            initial_prompt="以下是普通话句子,这是一段会议内容。",
        )  # initial_prompt="这是一段中文的会议内容。"
        d, nd, diff = get_text_distance(original[audio.stem], text)
        rows.append([audio.name, round(t, 3), text, d, round(nd, 3), diff])
    save_csv("csv/pywhisper.csv", rows)


def run_test_audios():
    """Transcribe every wav under ``audio_clips`` directories matching the language tag."""
    model = load_model()
    lang = "zh"
    audios = Path("../test_data/audio_clips/")
    rows = [["file_name", "time", "inference_result"]]
    for audio in sorted(audios.glob(f"*{lang}*/*.wav")):
        print(audio)
        t, text = _transcribe_timed(
            model, audio, language=lang,
            initial_prompt="以下是普通话句子,这是一段会议内容。",
        )  # initial_prompt="这是一段中文的会议内容。"
        rows.append([f"{audio.parent.name}/{audio.name}", round(t, 3), text])
    save_csv("csv/whisper.csv", rows)


def run_test_dataset():
    """Transcribe the AIShell dataset listed in ``dataset.txt``.

    Partial results are saved even when the loop is interrupted (Ctrl-C)
    or an unexpected error occurs.
    """
    from test_data.audios import read_dataset
    model = load_model()
    test_data = Path("../test_data/AIShell/dataset/dataset.txt")
    audio_parent = Path("../test_data/")
    result_list = []
    count = 0
    try:
        for audio_path, sentence, duration in read_dataset(test_data):
            count += 1
            print(f"processing {count}: {audio_path}")
            t, text = _transcribe_timed(model, audio_parent / audio_path, language="zh")
            # , initial_prompt="以下是普通话句子,这是一段会议内容。"
            result_list.append({
                "index": count,
                "audio_path": audio_path,
                "reference": sentence,
                "duration": duration,
                "inference_time": round(t, 3),
                "inference_result": text,
            })
    except Exception as e:
        # Best-effort: log the failure but still write the partial results.
        print(e)
    except KeyboardInterrupt as e:
        # KeyboardInterrupt is not an Exception subclass, so it needs its own
        # clause; catching it lets Ctrl-C stop the run yet keep results.
        print(e)
    _save_json("csv/whisper_dataset_results.json", result_list)


def run_test_emilia():
    """Transcribe up to 5000 clips from the Emilia ZH-B000000 shard."""
    from test_data.audios import read_emilia
    model = load_model()
    parent = Path("../test_data/ZH-B000000")
    result_list = []
    count = 0
    try:
        for audio_path, sentence, duration in read_emilia(parent, count_limit=5000):
            count += 1
            print(f"processing {count}: {audio_path.name}")
            t, text = _transcribe_timed(model, audio_path, language="zh")
            # , initial_prompt="以下是普通话句子,这是一段会议内容。"
            result_list.append({
                "index": count,
                "audio_path": audio_path.name,
                "reference": sentence,
                "duration": duration,
                "inference_time": round(t, 3),
                "inference_result": text,
            })
    except Exception as e:
        print(e)
    except KeyboardInterrupt as e:
        # See run_test_dataset: save partial results on Ctrl-C.
        print(e)
    _save_json("csv/whisper_emilia_results.json", result_list)


def _run_simple_dataset(read_fn, out_file, count_limit=5000):
    """Shared driver for readers yielding ``(audio_path, sentence)`` pairs.

    Loads the model, transcribes up to ``count_limit`` items from ``read_fn``,
    and writes the (possibly partial) results to ``out_file`` as JSON.
    """
    model = load_model()
    result_list = []
    count = 0
    try:
        for audio_path, sentence in read_fn(count_limit=count_limit):
            count += 1
            print(f"processing {count}: {audio_path}")
            t, text = _transcribe_timed(model, audio_path, language="zh")
            result_list.append({
                "index": count,
                "audio_path": audio_path.name,
                "reference": sentence,
                # "duration": duration,
                "inference_time": round(t, 3),
                "inference_result": text,
            })
    except Exception as e:
        print(e)
    except KeyboardInterrupt as e:
        # Save partial results when interrupted with Ctrl-C.
        print(e)
    _save_json(out_file, result_list)


def run_test_st():
    """Transcribe up to 5000 clips from the ST-CMDS dataset."""
    from test_data.audios import read_st
    # parent = Path("../test_data/ST-CMDS-20170001_1-OS")
    _run_simple_dataset(read_st, "csv/whisper_st_results.json")


def run_test_wenet():
    """Transcribe up to 5000 clips from the WenetSpeech dataset."""
    from test_data.audios import read_wenet
    _run_simple_dataset(read_wenet, "csv/whisper_wenet_results.json")


if __name__ == '__main__':
    # run_test_emilia()
    # run_recordings()
    run_test_wenet()