import asyncio
import os
import tempfile

# On Windows, ctranslate2 and torch can load separate OpenMP runtimes.
# Allowing duplicates avoids process aborts during model initialization.
# setdefault() leaves any value already present in the environment untouched.
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")

# Loaded models keyed by "<model_size>|<device>|<compute_type>" (see _get_whisper_model).
_WHISPER_MODEL_CACHE = {}
# Serializes the cache lookup and model loading done in _get_whisper_model.
_WHISPER_MODEL_LOCK = asyncio.Lock()
# Set to True by _force_whisper_cpu_mode; while set, _resolve_device always returns "cpu".
_WHISPER_RUNTIME_FORCE_CPU = False
# str() of the most recent reason passed to _force_whisper_cpu_mode (None until one is recorded).
_WHISPER_LAST_ERROR: str | None = None


def _is_cuda_runtime_error(error: Exception) -> bool:
    """Return True when the error text looks like a CUDA / GPU runtime failure.

    The check is a case-insensitive substring search of ``str(error)`` against
    a fixed list of library names and loader messages. A falsy ``error`` or an
    empty message never matches.
    """
    text = str(error or "").strip().lower()
    cuda_markers = (
        "cublas64_12.dll",
        "cublas",
        "cudnn",
        "libcudart",
        "cuda",
        "ctranslate2",
        "failed to load library",
        "cannot be loaded",
    )
    # Every marker is non-empty, so an empty message falls through to False.
    for marker in cuda_markers:
        if marker in text:
            return True
    return False


def _force_whisper_cpu_mode(reason: Exception | None = None) -> None:
    global _WHISPER_RUNTIME_FORCE_CPU, _WHISPER_LAST_ERROR
    _WHISPER_RUNTIME_FORCE_CPU = True
    if reason is not None:
        _WHISPER_LAST_ERROR = str(reason)

    # Drop cached CUDA models so all future requests resolve to CPU safely.
    for key in list(_WHISPER_MODEL_CACHE.keys()):
        if "|cuda|" in key:
            _WHISPER_MODEL_CACHE.pop(key, None)


def _has_cuda_device_via_ctranslate2() -> bool:
    """Report whether ctranslate2 sees at least one CUDA device.

    Any failure (ctranslate2 not importable, probe raising) counts as
    "no CUDA device" and yields False.
    """
    try:
        import ctranslate2

        cuda_visible = ctranslate2.get_cuda_device_count() > 0
    except Exception:
        return False
    else:
        return cuda_visible


def _resolve_device() -> str:
    """Pick the inference device: ``"cpu"`` or ``"cuda"``.

    Precedence:
      1. CPU when the runtime CPU override flag is set.
      2. An explicit ``WHISPER_DEVICE`` of ``cpu`` or ``cuda``.
      3. Auto-detection: the ctranslate2 probe, then ``torch.cuda``.
    Any other ``WHISPER_DEVICE`` value is treated like ``auto``.
    """
    if _WHISPER_RUNTIME_FORCE_CPU:
        return "cpu"

    requested = os.getenv("WHISPER_DEVICE", "auto").strip().lower()
    if requested == "cpu" or requested == "cuda":
        return requested

    # Prefer ctranslate2 probe first because faster-whisper relies on it.
    if _has_cuda_device_via_ctranslate2():
        return "cuda"

    # Second opinion from torch; a missing or broken torch means CPU.
    try:
        import torch

        detected = "cuda" if torch.cuda.is_available() else "cpu"
    except Exception:
        detected = "cpu"
    return detected


def _resolve_compute_type(device: str) -> str:
    """Return the compute type to load the model with on ``device``.

    A non-empty ``WHISPER_COMPUTE_TYPE`` other than ``auto`` is returned
    stripped and lower-cased. Otherwise the default is ``float16`` for
    ``cuda`` and ``int8`` for any other device.
    """
    requested = os.getenv("WHISPER_COMPUTE_TYPE", "auto").strip().lower()
    if requested in ("", "auto"):
        if device == "cuda":
            return "float16"
        return "int8"
    return requested


def _resolve_model_size() -> str:
    """Return the model name from ``WHISPER_MODEL_SIZE``, defaulting to ``small.en``.

    The value is stripped; an unset or blank variable yields the default.
    """
    # Fast default for real-time interview UX; can be overridden in env.
    configured = os.getenv("WHISPER_MODEL_SIZE", "small.en").strip()
    if configured:
        return configured
    return "small.en"


def _resolve_beam_size() -> int:
    """Return the beam size from ``WHISPER_BEAM_SIZE`` as an integer >= 1.

    Unset, unparsable, or below-one values all resolve to 1.
    """
    raw_value = os.getenv("WHISPER_BEAM_SIZE", "1")
    try:
        parsed = int(raw_value)
    except Exception:
        return 1
    # Clamp to the minimum meaningful beam width.
    return parsed if parsed > 1 else 1


def _resolve_best_of() -> int:
    """Return the ``best_of`` setting from ``WHISPER_BEST_OF`` as an integer >= 1.

    Unset, unparsable, or below-one values all resolve to 1.
    """
    raw_value = os.getenv("WHISPER_BEST_OF", "1")
    try:
        parsed = int(raw_value)
    except Exception:
        return 1
    # Clamp to the minimum meaningful candidate count.
    return parsed if parsed > 1 else 1


def _resolve_vad_filter() -> bool:
    """Return True when ``WHISPER_VAD_FILTER`` is set to an affirmative value.

    Accepted values (case-insensitive, surrounding whitespace ignored) are
    ``1``, ``true``, ``yes`` and ``on``. Anything else, or unset, is False.
    """
    raw_flag = os.getenv("WHISPER_VAD_FILTER", "0")
    normalized = raw_flag.strip().lower()
    return normalized in ("1", "true", "yes", "on")


async def _get_whisper_model():
    """Return the Whisper model for the current configuration, loading it once.

    The model size, device and compute type are resolved from the environment
    and the runtime CPU override, then looked up in ``_WHISPER_MODEL_CACHE``.
    On a miss the model is constructed in a worker thread. If construction
    with the requested settings fails, a CPU/int8 model is used instead; when
    the failure is a CUDA runtime error the module is also switched to
    CPU-only mode.

    Returns:
        The loaded ``faster_whisper.WhisperModel`` instance.

    Raises:
        RuntimeError: If ``faster_whisper`` cannot be imported.
        Exception: Whatever the CPU/int8 fallback construction raises when it
            fails as well.
    """
    async with _WHISPER_MODEL_LOCK:
        # Resolve under the lock so a task that waited here observes a CPU
        # override set by the task that held the lock before it, instead of
        # retrying a CUDA load that is already known to fail.
        model_size = _resolve_model_size()
        device = _resolve_device()
        compute_type = _resolve_compute_type(device)
        cache_key = f"{model_size}|{device}|{compute_type}"
        cpu_fallback_key = f"{model_size}|cpu|int8"

        cached = _WHISPER_MODEL_CACHE.get(cache_key)
        if cached is not None:
            return cached

        def _load_model():
            """Build the model; return ``(model, key describing what was built)``."""
            try:
                from faster_whisper import WhisperModel
            except Exception as exc:
                raise RuntimeError(
                    "faster-whisper is not installed in the active Python environment"
                ) from exc

            try:
                return (
                    WhisperModel(model_size, device=device, compute_type=compute_type),
                    cache_key,
                )
            except Exception as exc:
                if device == "cuda" and _is_cuda_runtime_error(exc):
                    _force_whisper_cpu_mode(exc)

                # Keep service resilient if GPU config mismatches runtime.
                # Reuse an already-loaded CPU model rather than building a
                # second copy of the same weights.
                existing_cpu_model = _WHISPER_MODEL_CACHE.get(cpu_fallback_key)
                if existing_cpu_model is not None:
                    return existing_cpu_model, cpu_fallback_key
                return (
                    WhisperModel(model_size, device="cpu", compute_type="int8"),
                    cpu_fallback_key,
                )

        model, loaded_key = await asyncio.to_thread(_load_model)

        # Always file the model under the configuration it was really built
        # with, so later CPU-resolved requests hit the cache.
        _WHISPER_MODEL_CACHE[loaded_key] = model

        # Also alias the requested key so a configuration that cannot load is
        # not retried on every call. Skip this for a CUDA key once CPU mode is
        # forced: that key can no longer be resolved and CUDA entries were
        # deliberately evicted.
        cuda_key_retired = device == "cuda" and _WHISPER_RUNTIME_FORCE_CPU
        if loaded_key != cache_key and not cuda_key_retired:
            _WHISPER_MODEL_CACHE[cache_key] = model

        return model


async def warmup_whisper_model() -> None:
    """Load and cache the Whisper model ahead of the first transcription.

    This is best-effort: a failure is logged as a warning and suppressed, so
    callers never see an exception from warmup.
    """
    try:
        await _get_whisper_model()
    except Exception:
        # Best-effort warmup only, but leave a trace: a silent failure here
        # would otherwise only surface on the first real request.
        import logging

        logging.getLogger(__name__).warning(
            "Whisper model warmup failed; the model will be loaded on first use",
            exc_info=True,
        )


async def transcribe_audio_bytes(audio_bytes: bytes, filename: str = "speech.webm", language: str = "en") -> str:
    """Transcribe an in-memory audio clip to text with the cached Whisper model.

    The bytes are written to a temporary file, transcribed in a worker thread,
    and the temporary file is removed afterwards. If transcription fails with
    a CUDA runtime error, the module is switched to CPU-only mode and the
    transcription is retried once with the CPU model.

    Args:
        audio_bytes: Raw audio file content; must be non-empty.
        filename: Original file name. Only its extension is used, as the
            temporary file suffix. Extensions that are not short ASCII
            alphanumerics fall back to ``.webm``.
        language: Language code passed to the model; blank values become ``en``.

    Returns:
        The non-empty segment texts joined by single spaces (may be ``""``).

    Raises:
        ValueError: If ``audio_bytes`` is empty.
        RuntimeError: If transcription fails, including after the CPU retry.
    """
    if not audio_bytes:
        raise ValueError("audio file is required")

    model = await _get_whisper_model()

    # The file name is caller-supplied. Only let a short, purely alphanumeric
    # extension reach mkstemp, so NUL bytes, overlong suffixes or characters
    # that are invalid on some filesystems cannot make temp-file creation fail.
    ext = os.path.splitext(filename or "speech.webm")[1]
    ext_body = ext[1:]
    if not (ext_body.isascii() and ext_body.isalnum() and len(ext_body) <= 16):
        ext = ".webm"

    target_language = (language or "en").strip().lower() or "en"
    beam_size = _resolve_beam_size()
    best_of = _resolve_best_of()
    vad_filter = _resolve_vad_filter()

    fd, tmp_path = tempfile.mkstemp(suffix=ext)

    try:
        # Write through the descriptor mkstemp already opened. The file is
        # closed when the block exits, before the decoder opens the path.
        with os.fdopen(fd, "wb") as f:
            f.write(audio_bytes)

        def _transcribe(model_instance) -> str:
            segments, _ = model_instance.transcribe(
                tmp_path,
                language=target_language,
                beam_size=beam_size,
                best_of=best_of,
                vad_filter=vad_filter,
                condition_on_previous_text=False,
                temperature=0.0,
                without_timestamps=True,
            )
            # Segments are consumed here, inside the worker thread.
            stripped = ((seg.text or "").strip() for seg in segments)
            return " ".join(text for text in stripped if text).strip()

        try:
            text = await asyncio.to_thread(_transcribe, model)
        except Exception as exc:
            if not _is_cuda_runtime_error(exc):
                raise RuntimeError(f"Whisper transcription failed: {str(exc)}") from exc

            # Runtime CUDA failures can occur even after successful model construction.
            _force_whisper_cpu_mode(exc)
            cpu_model = await _get_whisper_model()
            try:
                text = await asyncio.to_thread(_transcribe, cpu_model)
            except Exception as retry_exc:
                raise RuntimeError(
                    f"Whisper transcription failed after CPU fallback: {str(retry_exc)}"
                ) from retry_exc

        return text
    finally:
        # Remove directly instead of checking exists() first, which would
        # leave a window between the check and the delete.
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass