"""Multimodal Vision-Language Model wrapper med konfigurerbar modell."""
import os
import torch
from PIL import Image

from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor as WhisperProcessor,
    pipeline,
)
from qwen_vl_utils import process_vision_info


# Model map: friendly name -> config dict with the keys
#   "id"              -- Hugging Face model id passed to from_pretrained
#   "min_vram"        -- minimum VRAM (in GB) required to select this model
#   "class"           -- transformers model class used to load the model
#   "supports_pixels" -- whether min_pixels/max_pixels are added to the image element
MODEL_REGISTRY = {
    "qwen2.5-vl-3b": {
        "id": "Qwen/Qwen2.5-VL-3B-Instruct",
        "min_vram": 7,
        "class": Qwen2_5_VLForConditionalGeneration,
        "supports_pixels": True,  # Qwen2.5 supports min/max_pixels
    },
    "qwen2.5-vl-7b": {
        "id": "Qwen/Qwen2.5-VL-7B-Instruct",
        "min_vram": 16,
        "class": Qwen2_5_VLForConditionalGeneration,
        "supports_pixels": True,
    },
    "qwen2-vl-2b": {
        "id": "Qwen/Qwen2-VL-2B-Instruct",
        "min_vram": 5,
        "class": Qwen2VLForConditionalGeneration,
        "supports_pixels": False,  # Qwen2 uses a different syntax
    },
}


def _detect_gpu_vram() -> int:
    """Return the total memory of CUDA device 0 in whole GiB.

    The value is floored to an integer. Returns 0 when CUDA is not
    available.
    """
    if torch.cuda.is_available():
        bytes_per_gib = 1024 ** 3
        total_bytes = torch.cuda.get_device_properties(0).total_memory
        return total_bytes // bytes_per_gib
    return 0


def _pick_model(preferred: str = None) -> dict:
    """Select a MODEL_REGISTRY entry from detected VRAM and an optional preference.

    Args:
        preferred: Friendly model name (a key of MODEL_REGISTRY), or a falsy
            value to auto-select.

    Returns:
        The registry entry (dict) of the chosen model. The preferred model is
        returned when it is known and fits in the detected VRAM. Otherwise the
        entry with the highest ``min_vram`` that still fits is returned. If
        nothing fits (e.g. no GPU), the entry with the lowest ``min_vram`` is
        returned as a fallback.
    """
    vram = _detect_gpu_vram()
    print("[config] Oppdaget VRAM: %d GB" % vram)

    if preferred:
        model = MODEL_REGISTRY.get(preferred)
        if model is None:
            # A typo in the name / BUDDY_VLM_MODEL used to be ignored silently.
            print(
                "[config] Advarsel: ukjent modell '%s', gyldige valg: %s"
                % (preferred, ", ".join(MODEL_REGISTRY))
            )
        elif vram >= model["min_vram"]:
            print("[config] Bruker foretrukken modell: %s" % preferred)
            return model
        else:
            print("[config] Advarsel: %s trenger %dGB, har bare %dGB" % (preferred, model["min_vram"], vram))

    # Derive the candidate order from the registry (highest VRAM requirement
    # first) so that newly registered models take part in auto-selection.
    candidates = sorted(
        MODEL_REGISTRY.items(),
        key=lambda item: item[1]["min_vram"],
        reverse=True,
    )
    for name, model in candidates:
        if vram >= model["min_vram"]:
            print("[config] Auto-valgte modell: %s (%s)" % (name, model["id"]))
            return model

    # Nothing fits: fall back to the entry with the smallest VRAM requirement.
    fallback_name, fallback_model = candidates[-1]
    print("[config] Fallback: %s (minst VRAM-krav)" % fallback_name)
    return fallback_model


class MultimodalAssistant:
    """
    Configurable multimodal assistant combining a vision-language model (VLM)
    with a speech-to-text (STT) pipeline.

    Environment variables (used when the matching constructor argument is
    not given):
      BUDDY_VLM_MODEL  -- qwen2.5-vl-3b | qwen2.5-vl-7b | qwen2-vl-2b
      BUDDY_STT_MODEL  -- openai/whisper-large-v3 | openai/whisper-medium | openai/whisper-small
      BUDDY_DEVICE     -- auto | cuda | cpu
    """

    def __init__(
        self,
        vlm_model: str = None,
        whisper_model: str = None,
        device: str = None,
    ):
        """Load the VLM and the STT pipeline.

        Args:
            vlm_model: Friendly VLM name; falls back to BUDDY_VLM_MODEL and
                then to automatic selection by ``_pick_model``.
            whisper_model: Whisper model id; falls back to BUDDY_STT_MODEL and
                then to "openai/whisper-large-v3".
            device: "auto", or a device string such as "cuda" or "cpu"; falls
                back to BUDDY_DEVICE and then to "auto".
        """
        # --- Configuration (empty strings are treated as "not set") ---
        vlm_cfg = _pick_model(vlm_model or os.environ.get("BUDDY_VLM_MODEL"))
        whisper_id = (
            whisper_model
            or os.environ.get("BUDDY_STT_MODEL")
            or "openai/whisper-large-v3"
        )
        device = device or os.environ.get("BUDDY_DEVICE") or "auto"

        # "auto" is only meaningful as a device_map value. The STT model needs
        # one concrete device, resolved once so that dtype, model placement
        # and the pipeline all agree with the configured device.
        if device == "auto":
            stt_device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            stt_device = device

        self._vlm_model_id = vlm_cfg["id"]
        self._vlm_class = vlm_cfg["class"]
        self._supports_pixels = vlm_cfg["supports_pixels"]
        self.device = device
        # Half precision only when STT actually runs on a CUDA device.
        self.torch_dtype = (
            torch.float16 if str(stt_device).startswith("cuda") else torch.float32
        )

        # --- Load VLM ---
        print("[assistant] Laster VLM: %s ..." % self._vlm_model_id)
        # NOTE(review): trust_remote_code=True permits executing code shipped
        # with the model repo -- confirm it is really required for these models.
        self.vlm = self._vlm_class.from_pretrained(
            self._vlm_model_id,
            torch_dtype="auto",
            device_map=device,
            trust_remote_code=True,
        )
        self.processor = AutoProcessor.from_pretrained(
            self._vlm_model_id,
            trust_remote_code=True,
        )
        print("[assistant] VLM lastet.")

        # --- Load STT ---
        print("[assistant] Laster STT: %s ..." % whisper_id)
        stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
            whisper_id,
            torch_dtype=self.torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
        stt_model.to(stt_device)
        stt_processor = WhisperProcessor.from_pretrained(whisper_id)
        self.stt_pipe = pipeline(
            "automatic-speech-recognition",
            model=stt_model,
            tokenizer=stt_processor.tokenizer,
            feature_extractor=stt_processor.feature_extractor,
            torch_dtype=self.torch_dtype,
            # Same device the model was moved to (previously always GPU 0
            # whenever CUDA was available, regardless of the configuration).
            device=stt_device,
        )
        print("[assistant] STT lastet.")

    def transcribe_audio(self, audio_bytes: bytes) -> str:
        """Transcribe WAV bytes to text.

        The bytes are written to a temporary ``.wav`` file which is passed to
        the STT pipeline with language "no" and task "transcribe". The
        temporary file is removed even if writing or transcription fails.

        Args:
            audio_bytes: Raw contents of a WAV file.

        Returns:
            The transcribed text with surrounding whitespace stripped.
        """
        import tempfile

        # delete=False: the file is closed before the pipeline opens it by
        # path, so removal has to be done manually afterwards.
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        try:
            with tmp:
                tmp.write(audio_bytes)
            result = self.stt_pipe(
                tmp.name,
                generate_kwargs={"language": "no", "task": "transcribe"},
            )
        finally:
            os.remove(tmp.name)

        text = result["text"].strip()
        print("[stt] Transkribert: %s" % text)
        return text

    def ask_with_image(
        self,
        image: "Image.Image",
        text: str,
        max_new_tokens: int = 512,
    ) -> str:
        """Send a screenshot plus a text prompt to the VLM and return its answer.

        Args:
            image: The image (screenshot) shown to the model.
            text: The user's question or instruction.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The decoded model response with surrounding whitespace stripped.
        """
        system_prompt = (
            "Du er en hjelpsom, norsk AI-assistent som ser brukerens skjermbilde. "
            "Svar konsist, presist og på norsk. Hvis spørsmålet er på engelsk, svar på engelsk."
        )

        # Build the image element; pixel bounds are only attached for models
        # whose registry entry sets supports_pixels.
        image_elem = {"type": "image", "image": image}
        if self._supports_pixels:
            image_elem["min_pixels"] = 50176  # 224 * 224
            image_elem["max_pixels"] = 501760  # 10 * 224 * 224

        messages = [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    image_elem,
                    {"type": "text", "text": text},
                ],
            },
        ]

        text_input = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text_input],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.vlm.device)

        generated_ids = self.vlm.generate(**inputs, max_new_tokens=max_new_tokens)
        # Drop the prompt tokens so that only the newly generated part is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        response = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]

        print("[vlm] Svar: %s..." % response[:120])
        return response.strip()