# https://modelscope.cn/models/iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary
# https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2
import os
import re
import sys
import time
import traceback
from typing import Any, Dict, List, Tuple

import librosa
import numpy as np
from funasr import AutoModel

# Grid parameters of the optional f0 sidecar files ("<name>_f0.npy"):
# one f0 value per hop of 480 samples at 24 kHz (20 ms per frame).
_F0_SR = 24000
_F0_HOP_LENGTH = 480

# Number of consecutive unvoiced frames that marks the onset of real silence.
_SIL_TOLERANCE = 5
# Minimum voiced frames at the head of a gap worth folding into the previous word.
_EXT_TOLERANCE = 5
# A gap voiced above this ratio is merged entirely into the previous word.
_VOICED_MERGE_RATIO = 0.9


def _f0_sidecar_path(wav_fn: str) -> str:
    """Return the path of the optional f0 sidecar file next to *wav_fn*."""
    return wav_fn.replace(".wav", "_f0.npy")


def _maybe_refine_with_f0(
    words: List[str], word_durs: List[float], wav_fn: str
) -> Tuple[List[str], List[float]]:
    """Refine (words, word_durs) with the f0 sidecar file when one exists.

    Shared by both ASR wrappers so the sidecar convention lives in one place.
    """
    f0_fn = _f0_sidecar_path(wav_fn)
    if os.path.exists(f0_fn):
        return _word_dur_post_process(words, word_durs, np.load(f0_fn))
    return words, word_durs


def _build_words_with_gaps(
    raw_words: List[str], raw_timestamps: List[List[float]], wav_fn: str
) -> Tuple[List[str], List[float]]:
    """Interleave recognized words with "" placeholders for silent gaps.

    Args:
        raw_words: Recognized words, in temporal order.
        raw_timestamps: Matching [start, end] times in seconds.
        wav_fn: Audio file path, used to pad a trailing gap so the durations
            cover the whole recording.

    Returns:
        Tuple ``(words, word_durs)`` where "" entries mark gaps and the
        durations sum to the audio length.
    """
    words: List[str] = []
    word_durs: List[float] = []
    prev = 0.0
    for w, t in zip(raw_words, raw_timestamps):
        s, e = float(t[0]), float(t[1])
        if s > prev:
            # Silence between the previous word's end and this word's start.
            words.append("")
            word_durs.append(s - prev)
        words.append(w)
        word_durs.append(e - s)
        prev = e

    # Pad (or extend) a trailing gap so durations cover the full file length.
    # NOTE: librosa renamed `filename=` to `path=` in 0.10 and later removed
    # the old keyword, so `path=` is the supported spelling.
    wav_len = librosa.get_duration(path=wav_fn)
    if wav_len > prev:
        if not words:
            # No words recognized at all: the whole file is one gap.
            words.append("")
            word_durs.append(wav_len)
        elif words[-1] != "":
            words.append("")
            word_durs.append(wav_len - prev)
        else:
            word_durs[-1] += wav_len - prev
    return words, word_durs


def _word_dur_post_process(
    words: List[str], word_durs: List[float], f0: np.ndarray
) -> Tuple[List[str], List[float]]:
    """Post-process word durations using f0 to better place silences.

    ASR gap boundaries are often early: the singer keeps voicing past a
    word's reported end.  For each "" gap this scans its f0 frames for the
    onset of real silence and moves the voiced head of the gap into the
    previous word.

    Args:
        words: Word list with "" entries marking gaps.
        word_durs: Per-entry durations in seconds.
        f0: Frame-level f0 array on the (_F0_SR, _F0_HOP_LENGTH) grid;
            values <= 1 are treated as unvoiced.

    Returns:
        Tuple ``(words, word_durs)`` with gaps re-anchored.
    """
    # Convert word durations (seconds) to frame boundaries on the f0 grid.
    # Per-word flooring (slightly drifting) preserves the original mapping.
    boundaries = np.cumsum(
        [0, *(int(dur * _F0_SR / _F0_HOP_LENGTH) for dur in word_durs)]
    ).tolist()

    new_words: List[str] = []
    new_word_durs: List[float] = []
    if words:
        new_words.append(words[0])
        new_word_durs.append(word_durs[0])

    for i in range(1, len(words)):
        word = words[i]
        if word != "":
            new_words.append(word)
            new_word_durs.append(word_durs[i])
            continue

        start_frame = boundaries[i]
        # Nominal gap length in frames (used for the voiced-ratio test).
        num_frames = boundaries[i + 1] - start_frame
        # Clamp the scan end to the f0 length: rounding in the boundary
        # computation can push it one or more frames past the array end.
        end_frame = min(boundaries[i + 1], len(f0))

        # Find the first run of _SIL_TOLERANCE consecutive unvoiced frames;
        # everything before it is still being sung.
        frame_idx = start_frame
        unvoiced_run = 0
        while frame_idx < end_frame:
            if f0[frame_idx] <= 1:  # unvoiced frame
                unvoiced_run += 1
                if unvoiced_run >= _SIL_TOLERANCE:
                    # Rewind to the first unvoiced frame of the silent run.
                    frame_idx -= _SIL_TOLERANCE - 1
                    break
            else:
                unvoiced_run = 0
            frame_idx += 1

        voiced_frames = frame_idx - start_frame
        if voiced_frames >= int(num_frames * _VOICED_MERGE_RATIO):
            # Gap is essentially fully voiced: fold it into the previous word.
            new_word_durs[-1] += word_durs[i]
        elif voiced_frames >= _EXT_TOLERANCE:
            # Voiced head + silent tail: extend the previous word by the
            # voiced part and keep the remainder as the gap.
            voiced_dur = voiced_frames * _F0_HOP_LENGTH / _F0_SR
            new_word_durs[-1] += voiced_dur
            new_words.append("")
            new_word_durs.append(word_durs[i] - voiced_dur)
        else:
            # Too short to re-anchor reliably; keep the gap unchanged.
            new_words.append(word)
            new_word_durs.append(word_durs[i])

    return new_words, new_word_durs


class _ASRZhModel:
    """Mandarin/Cantonese ASR wrapper around a FunASR Paraformer model."""

    def __init__(self, model_path: str, device: str):
        """Load the FunASR model.

        Args:
            model_path: Local path (or model id) of the Paraformer model.
            device: Torch device string, e.g. "cuda" or "cpu".
        """
        self.model = AutoModel(
            model=model_path,
            disable_update=True,
            device=device,
        )

    def process(self, wav_fn: str) -> Tuple[List[str], List[float]]:
        """Transcribe *wav_fn* and return gap-aware (words, durations)."""
        out = self.model.generate(wav_fn, output_timestamp=True)[0]
        # FunASR joins tokens with spaces and marks subword pieces with "@".
        raw_words = out["text"].replace("@", "").split(" ")
        # Timestamps arrive in milliseconds; convert to seconds.
        raw_timestamps = [[t[0] / 1000, t[1] / 1000] for t in out["timestamp"]]
        words, word_durs = _build_words_with_gaps(raw_words, raw_timestamps, wav_fn)
        return _maybe_refine_with_f0(words, word_durs, wav_fn)


class _ASREnModel:
    """English ASR wrapper for NeMo Parakeet-TDT."""

    def __init__(self, model_path: str, device: str):
        """Restore the Parakeet model from a .nemo checkpoint.

        Args:
            model_path: Path to the .nemo checkpoint file.
            device: Torch device string, e.g. "cuda" or "cpu".

        Raises:
            ImportError: If nemo_toolkit cannot be imported.
        """
        try:
            import nemo.collections.asr as nemo_asr  # type: ignore
        except Exception as e:
            # Print the actual error causing the import failure.
            print(
                f"[lyric transcription] Failed to import nemo.collections.asr: {e}",
                file=sys.stderr,
            )
            traceback.print_exc()
            raise ImportError(
                "NeMo (nemo_toolkit) is required for ASR English but could not be imported. "
                "See the log above for details."
            ) from e

        self.model = nemo_asr.models.ASRModel.restore_from(
            restore_path=model_path,
            map_location=device,
        )
        self.model.eval()
        # Disable CUDA Graphs via the decoding config to avoid
        # "CUDA failure! 35" (cudaErrorInsufficientDriver) on
        # CUDA 12.8 + ZeroGPU where the driver is too old for graph capture.
        # This must be set in the config (not on the decoding_computer) because
        # transcribe(timestamps=True) calls change_decoding_strategy() which
        # rebuilds the decoder from cfg.
        from omegaconf import open_dict

        with open_dict(self.model.cfg.decoding):
            self.model.cfg.decoding.greedy.use_cuda_graph_decoder = False

    @staticmethod
    def _clean_word(word: str) -> str:
        """Strip the sentence punctuation NeMo attaches to word tokens."""
        return re.sub(r"[\?\.,:]", "", word).strip()

    @staticmethod
    def _extract_word_segments(output: Any) -> List[Dict[str, Any]]:
        """Return word-level timestamp dicts from a NeMo transcribe output.

        Defensive against missing/odd-shaped `timestamp` attributes; returns
        an empty list rather than raising.
        """
        ts = getattr(output, "timestamp", None)
        if not ts or not isinstance(ts, dict):
            return []
        word_ts = ts.get("word")
        return word_ts if isinstance(word_ts, list) else []

    def process(self, wav_fn: str) -> Tuple[List[str], List[float]]:
        """Transcribe *wav_fn* and return gap-aware (words, durations)."""
        outputs = self.model.transcribe(
            [wav_fn],
            timestamps=True,
            batch_size=1,
            num_workers=0,
        )
        output = outputs[0] if outputs else None

        raw_words: List[str] = []
        raw_timestamps: List[List[float]] = []
        if output is not None:
            for w in self._extract_word_segments(output):
                s, e = float(w.get("start", 0.0)), float(w.get("end", 0.0))
                word = self._clean_word(str(w.get("word", "")))
                if word:  # drop tokens that were pure punctuation
                    raw_words.append(word)
                    raw_timestamps.append([s, e])

        words, durs = _build_words_with_gaps(raw_words, raw_timestamps, wav_fn)
        return _maybe_refine_with_f0(words, durs, wav_fn)


class LyricTranscriber:
    """Transcribe lyrics from a singing-voice segment.

    Wraps a Mandarin/Cantonese FunASR model and an English NeMo model behind
    a single language-switched `process` entry point.
    """

    def __init__(
        self,
        zh_model_path: str,
        en_model_path: str,
        device: str = "cuda",
        *,
        verbose: bool = True,
    ):
        """Initialize lyric transcriber.

        Args:
            zh_model_path (str): Path to the Chinese model file.
            en_model_path (str): Path to the English model file.
            device (str): Device to use for tensor operations.
            verbose (bool): Whether to print verbose logs.
        """
        self.verbose = verbose
        if self.verbose:
            print(
                "[lyric transcription] init: start:",
                f"device={device}",
                f"model_path={zh_model_path}",
            )
        # Always initialize Chinese ASR.
        self.zh_model = _ASRZhModel(device=device, model_path=zh_model_path)
        # Initialize English ASR eagerly so the model is loaded at global
        # scope where ZeroGPU can hijack CUDA calls properly.
        self.en_model = _ASREnModel(model_path=en_model_path, device=device)
        if self.verbose:
            print("[lyric transcription] init: success")

    def process(
        self,
        wav_fn,
        language: str | None = "Mandarin",
        *,
        verbose: bool | None = None,
    ):
        """Lyric transcriber process.

        Args:
            wav_fn (str): Path to the audio file.
            language (str | None): Language of the audio. Defaults to
                "Mandarin". Supports "Mandarin", "Cantonese" and "English".
            verbose (bool | None): Per-call verbosity override; falls back to
                the instance setting when None.

        Returns:
            Tuple of (words, word_durs) produced by the language-specific
            ASR wrapper.

        Raises:
            ValueError: If *language* is not one of the supported values.
        """
        v = self.verbose if verbose is None else verbose
        if language not in {"Mandarin", "Cantonese", "English"}:
            raise ValueError(
                f"Unsupported language: {language}, should be one of "
                "['Mandarin', 'Cantonese', 'English']"
            )
        if v:
            print(f"[lyric transcription] process: start: wav_fn={wav_fn} language={language}")
        t0 = time.time()

        # Validation above guarantees `language` is one of the three strings,
        # so no further normalization is needed.  Cantonese shares the
        # Mandarin (Paraformer) model.
        if language == "English":
            out = self.en_model.process(wav_fn)
        else:
            out = self.zh_model.process(wav_fn)

        if v:
            words, durs = out
            n_words = len(words) if isinstance(words, list) else 0
            dur_sum = float(sum(durs)) if isinstance(durs, list) else 0.0
            dt = time.time() - t0
            print(
                "[lyric transcription] process: done:",
                f"n_words={n_words}",
                f"dur_sum={dur_sum:.3f}s",
                f"time={dt:.3f}s",
            )
        return out


if __name__ == "__main__":
    m = LyricTranscriber(
        zh_model_path="pretrained_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
        en_model_path="pretrained_models/parakeet-tdt-0.6b-v2/parakeet-tdt-0.6b-v2.nemo",
        device="cuda",
    )
    print(m.process("example/test/asr_zh.wav", language="Mandarin"))
    print(m.process("example/test/asr_en.wav", language="English"))