| """Multimodal Vision-Language Model wrapper med konfigurerbar modell.""" |
import os
from typing import Optional

import torch
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor as WhisperProcessor,
    pipeline,
)
|
|
|
|
| |
# Vision-language checkpoints this module can load, keyed by the short name
# accepted in BUDDY_VLM_MODEL.  Each entry carries:
#   id              -- model identifier handed to from_pretrained
#   min_vram        -- VRAM threshold (GB) that _pick_model compares against
#   class           -- transformers class used to load the checkpoint
#   supports_pixels -- whether min_pixels/max_pixels are attached to image inputs
MODEL_REGISTRY = {
    short_name: dict(zip(("id", "min_vram", "class", "supports_pixels"), fields))
    for short_name, *fields in (
        (
            "qwen2.5-vl-3b",
            "Qwen/Qwen2.5-VL-3B-Instruct",
            7,
            Qwen2_5_VLForConditionalGeneration,
            True,
        ),
        (
            "qwen2.5-vl-7b",
            "Qwen/Qwen2.5-VL-7B-Instruct",
            16,
            Qwen2_5_VLForConditionalGeneration,
            True,
        ),
        (
            "qwen2-vl-2b",
            "Qwen/Qwen2-VL-2B-Instruct",
            5,
            Qwen2VLForConditionalGeneration,
            False,
        ),
    )
}
|
|
|
|
def _detect_gpu_vram() -> int:
    """Return the total memory of CUDA device 0 in whole GiB.

    Returns 0 when CUDA is not available.  The byte count is floor-divided,
    so a device reporting slightly less than a whole number of GiB yields
    the next lower integer.
    """
    if torch.cuda.is_available():
        total_bytes = torch.cuda.get_device_properties(0).total_memory
        return total_bytes // (1 << 30)
    return 0
|
|
|
|
def _pick_model(preferred: Optional[str] = None) -> dict:
    """Select a registry entry from the detected VRAM and an optional preference.

    Selection order:

    1. ``preferred``, when it names a registry entry whose ``min_vram`` fits
       within the detected VRAM.
    2. The first entry of the built-in priority list that fits.
    3. ``qwen2-vl-2b`` as a last resort, even when it does not fit.

    Progress and warnings are printed to stdout.

    Args:
        preferred: Short registry key (for example ``"qwen2.5-vl-3b"``), or
            ``None``/empty to rely on automatic selection.

    Returns:
        The chosen ``MODEL_REGISTRY`` entry (the registry's own dict object,
        not a copy).
    """
    vram = _detect_gpu_vram()
    print("[config] Oppdaget VRAM: %d GB" % vram)

    if preferred:
        model = MODEL_REGISTRY.get(preferred)
        if model is None:
            # An unknown (e.g. misspelled) name would otherwise be dropped
            # without any hint, so say so before auto-selecting.
            print(
                "[config] Advarsel: ukjent modell '%s' (gyldige valg: %s)"
                % (preferred, ", ".join(MODEL_REGISTRY))
            )
        elif vram >= model["min_vram"]:
            print("[config] Bruker foretrukken modell: %s" % preferred)
            return model
        else:
            print(
                "[config] Advarsel: %s trenger %dGB, har bare %dGB"
                % (preferred, model["min_vram"], vram)
            )

    # Highest VRAM requirement first; take the first one that fits.
    for name in ("qwen2.5-vl-7b", "qwen2.5-vl-3b", "qwen2-vl-2b"):
        model = MODEL_REGISTRY[name]
        if vram >= model["min_vram"]:
            print("[config] Auto-valgte modell: %s (%s)" % (name, model["id"]))
            return model

    # Nothing fits (for example no CUDA device, VRAM reported as 0).
    print("[config] Fallback: qwen2-vl-2b (minst VRAM-krav)")
    return MODEL_REGISTRY["qwen2-vl-2b"]
|
|
|
|
class MultimodalAssistant:
    """Configurable multimodal assistant: a vision-language model plus speech-to-text.

    Constructor arguments take precedence over these environment variables:

        BUDDY_VLM_MODEL -- qwen2.5-vl-3b | qwen2.5-vl-7b | qwen2-vl-2b
        BUDDY_STT_MODEL -- openai/whisper-large-v3 | openai/whisper-medium | openai/whisper-small
        BUDDY_DEVICE    -- auto | cuda | cpu
    """

    def __init__(
        self,
        vlm_model: Optional[str] = None,
        whisper_model: Optional[str] = None,
        device: Optional[str] = None,
    ):
        """Load the vision-language model and build the speech-to-text pipeline.

        Args:
            vlm_model: Short registry key for the VLM; falls back to
                ``BUDDY_VLM_MODEL`` and then to automatic selection.
            whisper_model: Speech model identifier; falls back to
                ``BUDDY_STT_MODEL`` and then ``openai/whisper-large-v3``.
            device: ``"auto"``, ``"cuda"`` or ``"cpu"``; falls back to
                ``BUDDY_DEVICE`` and then ``"auto"``.  The value is passed
                as ``device_map`` for the VLM.  For speech-to-text,
                ``"auto"`` resolves to CUDA when available, otherwise CPU.
        """
        vlm_cfg = _pick_model(vlm_model or os.environ.get("BUDDY_VLM_MODEL"))
        whisper_id = whisper_model or os.environ.get("BUDDY_STT_MODEL", "openai/whisper-large-v3")
        device = device or os.environ.get("BUDDY_DEVICE", "auto")

        # Resolve the speech-to-text device once so that the model placement,
        # the dtype and the pipeline all agree.  Deriving these from CUDA
        # availability alone would override an explicit device="cpu".
        if device == "auto":
            stt_device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            stt_device = device

        self._vlm_model_id = vlm_cfg["id"]
        self._vlm_class = vlm_cfg["class"]
        self._supports_pixels = vlm_cfg["supports_pixels"]
        self.device = device
        # Half precision only when the speech model actually runs on CUDA.
        self.torch_dtype = (
            torch.float16 if torch.device(stt_device).type == "cuda" else torch.float32
        )

        print("[assistant] Laster VLM: %s ..." % self._vlm_model_id)
        self.vlm = self._vlm_class.from_pretrained(
            self._vlm_model_id,
            torch_dtype="auto",
            device_map=device,
            trust_remote_code=True,
        )
        self.processor = AutoProcessor.from_pretrained(
            self._vlm_model_id,
            trust_remote_code=True,
        )
        print("[assistant] VLM lastet.")

        print("[assistant] Laster STT: %s ..." % whisper_id)
        stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
            whisper_id,
            torch_dtype=self.torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
        stt_model.to(stt_device)
        stt_processor = WhisperProcessor.from_pretrained(whisper_id)
        self.stt_pipe = pipeline(
            "automatic-speech-recognition",
            model=stt_model,
            tokenizer=stt_processor.tokenizer,
            feature_extractor=stt_processor.feature_extractor,
            torch_dtype=self.torch_dtype,
            device=stt_device,
        )
        print("[assistant] STT lastet.")

    def transcribe_audio(self, audio_bytes: bytes) -> str:
        """Transcribe WAV bytes to Norwegian text.

        The bytes are written to a temporary ``.wav`` file whose path is
        handed to the speech-to-text pipeline.  The file is removed
        afterwards, including when writing or transcription raises.

        Args:
            audio_bytes: Raw contents of a WAV file.

        Returns:
            The transcribed text with surrounding whitespace stripped.
        """
        import tempfile

        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp_path = tmp.name
        try:
            # Close the file before the pipeline opens it by path.
            with tmp:
                tmp.write(audio_bytes)
            result = self.stt_pipe(
                tmp_path,
                generate_kwargs={"language": "no", "task": "transcribe"},
            )
        finally:
            os.remove(tmp_path)

        text = result["text"].strip()
        print("[stt] Transkribert: %s" % text)
        return text

    def ask_with_image(
        self,
        image: "Image.Image",
        text: str,
        max_new_tokens: int = 512,
    ) -> str:
        """Send a screenshot and a question to the VLM and return its answer.

        Args:
            image: The screenshot to show the model.
            text: The user's question or instruction.
            max_new_tokens: Upper bound on generated tokens.

        Returns:
            The decoded model response with surrounding whitespace stripped.
            The first 120 characters are also printed to stdout.
        """
        system_prompt = (
            "Du er en hjelpsom, norsk AI-assistent som ser brukerens skjermbilde. "
            "Svar konsist, presist og på norsk. Hvis spørsmålet er på engelsk, svar på engelsk."
        )

        image_elem = {"type": "image", "image": image}
        if self._supports_pixels:
            # Pixel bounds are only attached for registry entries that are
            # flagged as supporting them.
            image_elem["min_pixels"] = 50176
            image_elem["max_pixels"] = 501760

        messages = [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    image_elem,
                    {"type": "text", "text": text},
                ],
            },
        ]

        text_input = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text_input],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.vlm.device)

        generated_ids = self.vlm.generate(**inputs, max_new_tokens=max_new_tokens)
        # Drop the prompt tokens so that only the newly generated part is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        response = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]

        print("[vlm] Svar: %s..." % response[:120])
        return response.strip()
|
|