import argparse
import os
import time
from pathlib import Path
import csv
import json
import yaml
from typing import List, Dict, Optional

import librosa
import openvino_genai
import numpy as np

from scripts.funasr_ct.ct_transformer import CT_Transformer
from scripts.asr_utils import get_origin_text_dict, get_text_distance


def save_csv(file_path, rows):
    """Write *rows* (an iterable of row lists) to *file_path* as UTF-8 CSV."""
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(rows)
    print(f"write csv to {file_path}")


def load_audio(audio_path: str, sr: int = 16000):
    """Load the audio file and convert it to mono float32 samples at *sr* Hz."""
    audio, _ = librosa.load(audio_path, sr=sr, mono=True)
    return audio


def read_wav(filepath):
    """Load a wav file resampled to 16 kHz mono and return the samples as a list."""
    raw_speech, samplerate = librosa.load(filepath, sr=16000)
    return raw_speech.tolist()


def transcribe_file(audio_path: str, model, lang="en"):
    """Run ASR on one audio file and return the decoded result as a string.

    *model* is an openvino_genai.WhisperPipeline; *lang* is passed through
    as the Whisper language token (e.g. "<|zh|>").
    """
    raw_speech = read_wav(audio_path)
    res = model.generate(raw_speech, language=lang)
    # NOTE(review): str(res) stringifies the whole WhisperDecodedResults;
    # res.texts[0] may be what is actually wanted — confirm before changing.
    return str(res)


def load_model(device):
    """Load the Whisper ASR pipeline and the CT-Transformer punctuation model.

    Returns the (asr, punc) pair. Both models are placed on *device*
    (e.g. "GPU" or "CPU").
    """
    # BUG FIX: the original unconditionally overwrote *device* with "GPU",
    # silently ignoring the caller's choice. Honor the parameter instead.
    # model_path = r"D:\yujuan\yoyo-translator-win\models\whisper-large-v3-turbo-int8"
    model_path = r"D:\yujuan\models\whisper-turbo-39000-int8p\whisper-turbo-39000-int8p"
    punc_model = r"D:\yujuan\models\funasr_ct\ct-punc"
    t0 = time.time()
    asr = openvino_genai.WhisperPipeline(model_path, device)
    punc = CT_Transformer(punc_model, device=device)
    print("load model time: ", time.time() - t0)
    return asr, punc


def inference(audio: Path, asr, punc, lang):
    """Transcribe one file, add punctuation for Chinese, and time both stages.

    Returns (text, elapsed_seconds). On failure logs the error and returns
    ("", 0.0) so callers that unpack the tuple keep running.
    """
    try:
        t0 = time.time()
        asr_text = transcribe_file(str(audio), asr, lang)
        t1 = time.time()
        # Punctuation restoration is only trained for Chinese.
        if lang == "<|zh|>":
            punc_text = punc(asr_text)[0]
        else:
            punc_text = asr_text
        t2 = time.time()
        print(f"{audio.name} -> {asr_text} -> {punc_text}; \n asr cost: {t1-t0}; punc cost: {t2-t1}")
        return punc_text, t2 - t0
    except Exception as e:
        # BUG FIX: the original fell through returning None, which crashed
        # every caller at "text, t = inference(...)". Return a sentinel pair.
        print(f"{audio.name} -> 失败: {e}")
        return "", 0.0


def run_test_audios():
    """Benchmark English test audios and dump results to a CSV."""
    device = "GPU"  # GPU can be used as well
    lang = "<|en|>"
    asr, punc = load_model(device)
    audios = Path(r"D:\yujuan\TestTranslator\tests\test_data\test_audios")
    rows = [["file_name", "time", "inference_result"]]
    for audio in sorted(audios.glob("*en*/*.wav")):  # *s/randomforest*.wav"
        text, t = inference(audio, asr, punc, lang)
        rows.append([f"{audio.parent.name}/{audio.name}", t, text])
    save_csv("csv/finetune_whisper_with_punc.csv", rows)


def run_recordings():
    """Benchmark Chinese recordings against reference transcripts (edit distance)."""
    device = "GPU"  # GPU can be used as well
    lang = "<|zh|>"
    asr, punc = load_model(device)
    audios = Path(r"D:\yujuan\TestTranslator\tests\test_data\recordings")
    rows = [["file_name", "time", "inference_result"]]
    original = get_origin_text_dict()
    # File stems are numeric; sort numerically, not lexically.
    for audio in sorted(audios.glob("*.wav"), key=lambda x: int(x.stem)):
        text, t = inference(audio, asr, punc, lang)
        d, nd, diff = get_text_distance(original[audio.stem], text)
        rows.append([audio.name, round(t, 3), text, d, round(nd, 3), diff])
    save_csv("csv/finetune_whisper_with_punc.csv", rows)


def run_test_dataset():
    """Run inference over the dataset.txt manifest and save results as JSON."""
    from test_data.audios import read_dataset
    device = "GPU"
    lang = "<|zh|>"
    asr, punc = load_model(device)
    test_data = Path("../tests/test_data/dataset.txt")
    audio_parent = Path("../tests/test_data/")
    rows = [["file_name", "time", "inference_result"]]
    result_list = []
    count = 0
    # Broad except so a partial run (or Ctrl-C) still writes what we have.
    try:
        for audio_path, sentence, duration in read_dataset(test_data):
            count += 1
            print(f"processing {count}: {audio_path}")
            text, t = inference(audio_parent / audio_path, asr, punc, lang)
            print("inference time:", t)
            print(text)
            result_list.append({
                "index": count,
                "audio_path": audio_path,
                "reference": sentence,
                "duration": duration,
                "inference_time": round(t, 3),
                "inference_result": text
            })
    except Exception as e:
        print(e)
    except KeyboardInterrupt as e:
        print(e)
    with open("csv/whisper_finetune_ov_results.json", "w", encoding="utf-8") as f:
        json.dump(result_list, f, ensure_ascii=False, indent=2)


def run_test_emilia():
    """Run inference over the Emilia ZH subset (capped at 5000 clips), save JSON."""
    from test_data.audios import read_emilia
    device = "GPU"
    lang = "<|zh|>"
    asr, punc = load_model(device)
    parent = Path("../tests/test_data/ZH-B000000")
    result_list = []
    count = 0
    # Broad except so a partial run (or Ctrl-C) still writes what we have.
    try:
        for audio_path, sentence, duration in read_emilia(parent, count_limit=5000):
            count += 1
            print(f"processing {count}: {audio_path.name}")
            text, t = inference(audio_path, asr, punc, lang)
            print("inference time:", t)
            print(text)
            result_list.append({
                "index": count,
                "audio_path": audio_path.name,
                "reference": sentence,
                "duration": duration,
                "inference_time": round(t, 3),
                "inference_result": text
            })
    except Exception as e:
        print(e)
    except KeyboardInterrupt as e:
        print(e)
    with open("csv/whisper_finetune_emilia_ov_results.json", "w", encoding="utf-8") as f:
        json.dump(result_list, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    # main()
    run_test_emilia()