| import argparse |
| import os |
| import time |
| from pathlib import Path |
| import csv |
| import json |
| import yaml |
| from typing import List, Dict, Optional |
|
|
|
|
| import librosa |
| import openvino_genai |
| import numpy as np |
| from scripts.funasr_ct.ct_transformer import CT_Transformer |
| from scripts.asr_utils import get_origin_text_dict, get_text_distance |
|
|
def save_csv(file_path, rows):
    """Write ``rows`` (an iterable of row sequences) to ``file_path`` as CSV.

    Creates the parent directory if it does not exist yet, so callers can
    safely target paths like ``csv/results.csv`` on a fresh checkout.
    """
    parent = os.path.dirname(file_path)
    if parent:
        # Without this, open(..., "w") raises FileNotFoundError when the
        # output directory (e.g. "csv/") has not been created.
        os.makedirs(parent, exist_ok=True)
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(rows)
    print(f"write csv to {file_path}")
|
|
|
|
def load_audio(audio_path: str, sr: int = 16000):
    """Load an audio file as a mono waveform resampled to ``sr`` Hz."""
    waveform, _rate = librosa.load(audio_path, sr=sr, mono=True)
    return waveform
|
|
def read_wav(filepath):
    """Read ``filepath`` at 16 kHz and return the samples as a plain Python list."""
    samples, _rate = librosa.load(filepath, sr=16000)
    # tolist() converts numpy scalars to native floats for the pipeline API.
    return samples.tolist()
|
|
def transcribe_file(
    audio_path: str,
    model,
    lang="en"
):
    """Run the ASR pipeline on one audio file.

    Loads the file at 16 kHz, feeds it to ``model.generate`` with the given
    language tag, and returns the result stringified.
    """
    speech = read_wav(audio_path)
    result = model.generate(speech, language=lang)
    return str(result)
def load_model(device):
    """Load the Whisper ASR pipeline and the CT-Transformer punctuation model.

    Args:
        device: OpenVINO device string (e.g. ``"GPU"`` or ``"CPU"``) used for
            both models. Previously this parameter was silently overwritten
            with ``"GPU"``, making it dead; it is now honored.

    Returns:
        Tuple ``(asr, punc)`` — the WhisperPipeline and the punctuation model.
    """
    # NOTE(review): hard-coded local model paths — consider making these
    # configurable (env var / CLI argument).
    model_path = r"D:\yujuan\models\whisper-turbo-39000-int8p\whisper-turbo-39000-int8p"
    punc_model = r"D:\yujuan\models\funasr_ct\ct-punc"

    t0 = time.time()
    asr = openvino_genai.WhisperPipeline(model_path, device)
    punc = CT_Transformer(punc_model, device=device)
    print("load model time: ", time.time() - t0)
    return asr, punc
|
|
def inference(audio: Path, asr, punc, lang):
    """Transcribe one audio file and (for Chinese) restore punctuation.

    Args:
        audio: Path to the wav file.
        asr: Loaded WhisperPipeline.
        punc: Loaded CT_Transformer punctuation model.
        lang: Whisper language tag, e.g. ``"<|zh|>"`` or ``"<|en|>"``.

    Returns:
        ``(text, elapsed_seconds)``. On failure returns ``(None, 0.0)`` so
        callers that unpack the result (``text, t = inference(...)``) no
        longer crash with TypeError when a file fails.
    """
    try:
        t0 = time.time()
        asr_text = transcribe_file(
            str(audio), asr, lang
        )
        t1 = time.time()
        # The punctuation model is Chinese-only; other languages pass through.
        if lang == "<|zh|>":
            punc_text = punc(asr_text)[0]
        else:
            punc_text = asr_text
        t2 = time.time()
        print(f"{audio.name} -> {asr_text} -> {punc_text}; \n asr cost: {t1-t0}; punc cost: {t2-t1}")
        return punc_text, t2 - t0
    except Exception as e:
        print(f"{audio.name} -> 失败: {e}")
        return None, 0.0
def run_test_audios():
    """Transcribe every English test wav and dump the results to CSV."""
    target_device = "GPU"
    language = "<|en|>"
    asr_model, punc_model = load_model(target_device)

    root = Path(r"D:\yujuan\TestTranslator\tests\test_data\test_audios")
    table = [["file_name", "time", "inference_result"]]
    for wav in sorted(root.glob("*en*/*.wav")):
        result, elapsed = inference(wav, asr_model, punc_model, language)
        table.append([f"{wav.parent.name}/{wav.name}", elapsed, result])
    save_csv("csv/finetune_whisper_with_punc.csv", table)
|
|
def run_recordings():
    """Transcribe the Chinese recordings, score them against reference text,
    and write per-file edit-distance metrics to CSV."""
    target_device = "GPU"
    language = "<|zh|>"
    asr_model, punc_model = load_model(target_device)

    root = Path(r"D:\yujuan\TestTranslator\tests\test_data\recordings")
    table = [["file_name", "time", "inference_result"]]
    references = get_origin_text_dict()
    # Filenames are numeric stems; sort them numerically, not lexically.
    for wav in sorted(root.glob("*.wav"), key=lambda p: int(p.stem)):
        result, elapsed = inference(wav, asr_model, punc_model, language)
        dist, norm_dist, diff = get_text_distance(references[wav.stem], result)
        table.append([wav.name, round(elapsed, 3), result, dist, round(norm_dist, 3), diff])
    save_csv("csv/finetune_whisper_with_punc.csv", table)
|
|
def run_test_dataset():
    """Run inference over the dataset listed in ``dataset.txt`` and save
    per-sample results (reference, duration, timing, hypothesis) as JSON.

    Partial results are still written if the loop is interrupted
    (KeyboardInterrupt) or fails part-way through.
    """
    from test_data.audios import read_dataset
    device = "GPU"
    lang = "<|zh|>"
    asr, punc = load_model(device)

    test_data = Path("../tests/test_data/dataset.txt")
    audio_parent = Path("../tests/test_data/")
    result_list = []
    count = 0
    try:
        for audio_path, sentence, duration in read_dataset(test_data):
            count += 1
            print(f"processing {count}: {audio_path}")

            text, t = inference(audio_parent / audio_path, asr, punc, lang)
            print("inference time:", t)
            print(text)
            result_list.append({
                "index": count,
                "audio_path": audio_path,
                "reference": sentence,
                "duration": duration,
                "inference_time": round(t, 3),
                "inference_result": text
            })
    except Exception as e:
        print(e)
    except KeyboardInterrupt as e:
        # Ctrl-C stops the loop but still lets the results below be saved.
        print(e)
    # json is already imported at module level; no local re-import needed.
    with open("csv/whisper_finetune_ov_results.json", "w", encoding="utf-8") as f:
        json.dump(result_list, f, ensure_ascii=False, indent=2)
|
|
|
|
def run_test_emilia():
    """Run inference over the Emilia ZH subset (up to 5000 clips) and save
    per-sample results as JSON.

    Partial results are still written if the loop is interrupted
    (KeyboardInterrupt) or fails part-way through.
    """
    from test_data.audios import read_emilia
    device = "GPU"
    lang = "<|zh|>"
    asr, punc = load_model(device)
    parent = Path("../tests/test_data/ZH-B000000")
    result_list = []
    count = 0
    try:
        for audio_path, sentence, duration in read_emilia(parent, count_limit=5000):
            count += 1
            print(f"processing {count}: {audio_path.name}")

            text, t = inference(audio_path, asr, punc, lang)
            print("inference time:", t)
            print(text)
            result_list.append({
                "index": count,
                "audio_path": audio_path.name,
                "reference": sentence,
                "duration": duration,
                "inference_time": round(t, 3),
                "inference_result": text
            })
    except Exception as e:
        print(e)
    except KeyboardInterrupt as e:
        # Ctrl-C stops the loop but still lets the results below be saved.
        print(e)
    # json is already imported at module level; no local re-import needed.
    with open("csv/whisper_finetune_emilia_ov_results.json", "w", encoding="utf-8") as f:
        json.dump(result_list, f, ensure_ascii=False, indent=2)
|
|
|
|
if __name__ == "__main__":
    # Entry point: currently runs the Emilia benchmark; swap in one of the
    # other run_* helpers (run_test_audios / run_recordings / run_test_dataset)
    # to benchmark a different test set.
    run_test_emilia()
|
|