| from pywhispercpp.model import Model |
| from pathlib import Path |
| import time |
| import csv |
|
|
| from silero_vad.utils_vad import languages |
|
|
|
|
def save_csv(file_path, rows):
    """Write *rows* to *file_path* as UTF-8 CSV.

    Args:
        file_path: Destination path (str or Path); parent directory must exist.
        rows: Iterable of row sequences, passed to ``csv.writer.writerows``.
    """
    # newline="" is required by the csv module when passing a text file to
    # csv.writer; without it, platforms that translate "\n" emit blank rows.
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(rows)
    print(f"write csv to {file_path}")
|
|
def run_audios_after_vad(models_dir, audio_dir, model_name,
                         langs=("es", "fr", "hi", "it", "ja", "pt"),
                         out_csv="csv/compare_whisper_intel.csv"):
    """Transcribe per-language wav files and compare against Intel reference
    transcripts, appending all results to a CSV.

    Args:
        models_dir: Directory holding the whisper.cpp model files.
        audio_dir: Path with one sub-directory per language code, each
            containing ``*.wav`` files (and sibling ``*.txt`` references).
        model_name: whisper.cpp model identifier to load.
        langs: Language sub-directories to process (defaults to the six
            evaluation languages, preserving the original behavior).
        out_csv: Destination path for the comparison CSV.
    """
    t0 = time.time()
    model = Model(
        model=model_name,
        models_dir=models_dir,
        print_realtime=False,
        print_progress=False,
        print_timestamps=False,
        translate=False,
        temperature=0.,
        no_context=True,
    )
    print("load model time: ", time.time() - t0)
    rows = [["lang", "file_name", "inference_time", "python_res", "intel_res"]]
    for lang in langs:
        print("*" * 10, lang, "*" * 10)
        # sorted() consumes the glob generator directly; no list() needed.
        for audio in sorted((audio_dir / lang).glob("*.wav")):
            print("Audio name:", audio.name)
            t1 = time.time()
            output = model.transcribe(str(audio), language=lang)
            t = time.time() - t1
            print("Inference time:", t)
            text = " ".join(seg.text for seg in output)
            print("Text from Python:", text)
            try:
                # Reference transcript sits next to the wav with a .txt suffix.
                intel_text = audio.with_suffix(".txt").read_text(encoding="utf-8").strip()
            except OSError as e:
                # Missing/unreadable reference: record empty text and continue
                # (best effort — keep the comparison run going).
                intel_text = ""
                print(f"Error reading Intel text for {audio.name}: {e}")
            print("Text from Intel :", intel_text)
            rows.append([lang, audio.name, t, text, intel_text])
    save_csv(out_csv, rows)
|
|
def run_long_audios(models_dir, audios_list, model_name,
                    langs=("es", "fr", "hi", "it", "ja", "pt"),
                    out_csv="csv/compare_whisper.csv"):
    """Transcribe every audio file listed (one path per line) in
    *audios_list* and save timings plus text to a CSV.

    Args:
        models_dir: Directory holding the whisper.cpp model files.
        audios_list: Path to a text file with one audio path per line;
            blank lines become empty separator rows in the output CSV.
        model_name: whisper.cpp model identifier to load.
        langs: Language codes recognized from the filename prefix; any
            other prefix falls back to "en" (defaults preserve the
            original behavior).
        out_csv: Destination path for the results CSV.
    """
    t0 = time.time()
    model = Model(
        model=model_name,
        models_dir=models_dir,
        print_realtime=False,
        print_progress=False,
        print_timestamps=False,
        translate=False,
        temperature=0.,
        no_context=True,
    )
    print("load model time: ", time.time() - t0)
    rows = [["file_name", "inference_time", "res_text"]]
    for audio in audios_list.read_text().splitlines():
        if not audio:
            # Preserve blank separator lines from the input list in the CSV.
            rows.append([])
            continue
        # Language code is encoded as the filename prefix before the first '-'.
        lang = Path(audio).name.split('-')[0]
        if lang not in langs:
            lang = "en"
        print(f"Audio file: {audio}, lang: {lang}")
        t1 = time.time()
        output = model.transcribe(str(audio), language=lang)
        t = time.time() - t1
        print("Inference time:", t)
        text = " ".join(seg.text for seg in output)
        print("Text:", text)
        rows.append([audio, t, text])
    save_csv(out_csv, rows)
if __name__ == '__main__':
    # Local paths: whisper.cpp model directory and the list of audio files.
    models_dir = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models")
    model_name = "large-v3-turbo-q8_0"
    audios_list = Path("/Users/jeqin/work/code/TestTranslator/scripts/audios.txt")

    # Run the long-audio benchmark with the quantized turbo model.
    run_long_audios(models_dir, audios_list, model_name)