carbonx
/

buddy-desktop

Model card Files Files and versions

xet

Community

carbonx committed 10 days ago

Commit

b28ee4c

verified ·

1 Parent(s): 45ee2cb

Add configurable models with auto VRAM detection

Browse files

Files changed (1) hide show

vision_llm.py +106 -28

vision_llm.py CHANGED Viewed

@@ -1,11 +1,11 @@
-"""Multimodal Vision-Language Model (Qwen2.5-VL) wrapper."""
 import os
 import torch
 from PIL import Image
-from typing import Optional, Union, List
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
     AutoModelForSpeechSeq2Seq,
     AutoProcessor as WhisperProcessor,
@@ -14,44 +14,112 @@ from transformers import (
 from qwen_vl_utils import process_vision_info
 class MultimodalAssistant:
     """
-    Combines:
-      - Qwen2.5-VL-7B for vision+language understanding
-      - Whisper for STT
     """
     def __init__(
         self,
-        vlm_model_id: str = "Qwen/Qwen2.5-VL-7B-Instruct",
-        whisper_model_id: str = "openai/whisper-large-v3",
-        device: str = "auto",
     ):
         self.device = device
         self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-        print("[assistant] Loading VLM: %s ..." % vlm_model_id)
-        self.vlm = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-            vlm_model_id,
             torch_dtype="auto",
             device_map=device,
             trust_remote_code=True,
         )
         self.processor = AutoProcessor.from_pretrained(
-            vlm_model_id,
             trust_remote_code=True,
         )
-        print("[assistant] VLM loaded.")
-        print("[assistant] Loading STT: %s ..." % whisper_model_id)
         stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
-            whisper_model_id,
             torch_dtype=self.torch_dtype,
             low_cpu_mem_usage=True,
             use_safetensors=True,
         )
-        stt_model.to(self.device if self.device != "auto" else ("cuda" if torch.cuda.is_available() else "cpu"))
-        stt_processor = WhisperProcessor.from_pretrained(whisper_model_id)
         self.stt_pipe = pipeline(
             "automatic-speech-recognition",
             model=stt_model,
@@ -60,10 +128,10 @@ class MultimodalAssistant:
             torch_dtype=self.torch_dtype,
             device=0 if torch.cuda.is_available() else -1,
         )
-        print("[assistant] STT loaded.")
     def transcribe_audio(self, audio_bytes: bytes) -> str:
-        """Transcribe WAV bytes to Norwegian text."""
         import tempfile
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
             f.write(audio_bytes)
@@ -75,7 +143,7 @@ class MultimodalAssistant:
         )
         os.remove(tmp_path)
         text = result["text"].strip()
-        print("[stt] Transcribed: %s" % text)
         return text
     def ask_with_image(
@@ -84,24 +152,34 @@ class MultimodalAssistant:
         text: str,
         max_new_tokens: int = 512,
     ) -> str:
-        """Send a screenshot + text prompt to Qwen2.5-VL and return response."""
         system_prompt = (
             "Du er en hjelpsom, norsk AI-assistent som ser brukerens skjermbilde. "
             "Svar konsist, presist og på norsk. Hvis spørsmålet er på engelsk, svar på engelsk."
         )
         messages = [
             {"role": "system", "content": system_prompt},
             {
                 "role": "user",
                 "content": [
-                    {
-                        "type": "image",
-                        "image": image,
-                        "min_pixels": 50176,
-                        "max_pixels": 501760,
-                    },
                     {"type": "text", "text": text},
                 ],
             },
@@ -131,5 +209,5 @@ class MultimodalAssistant:
             clean_up_tokenization_spaces=False,
         )[0]
-        print("[vlm] Response: %s..." % response[:120])
         return response.strip()

+"""Multimodal Vision-Language Model wrapper med konfigurerbar modell."""
 import os
 import torch
 from PIL import Image
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
+    Qwen2VLForConditionalGeneration,
     AutoProcessor,
     AutoModelForSpeechSeq2Seq,
     AutoProcessor as WhisperProcessor,
 from qwen_vl_utils import process_vision_info
+# Model registry: friendly name -> configuration dict with the keys
+#   "id"              -- model identifier passed to from_pretrained()
+#   "min_vram"        -- VRAM in GB that must be detected for the model to be selected
+#   "class"           -- model class whose from_pretrained() loads the weights
+#   "supports_pixels" -- whether min_pixels/max_pixels are added to the image entry
+MODEL_REGISTRY = {
+    "qwen2.5-vl-3b": {
+        "id": "Qwen/Qwen2.5-VL-3B-Instruct",
+        "min_vram": 7,
+        "class": Qwen2_5_VLForConditionalGeneration,
+        "supports_pixels": True,  # Qwen2.5 supports min/max_pixels
+    },
+    "qwen2.5-vl-7b": {
+        "id": "Qwen/Qwen2.5-VL-7B-Instruct",
+        "min_vram": 16,
+        "class": Qwen2_5_VLForConditionalGeneration,
+        "supports_pixels": True,
+    },
+    "qwen2-vl-2b": {
+        "id": "Qwen/Qwen2-VL-2B-Instruct",
+        "min_vram": 5,
+        "class": Qwen2VLForConditionalGeneration,
+        "supports_pixels": False,  # Qwen2 uses a different syntax (per the original note -- TODO confirm)
+    },
+}
+def _detect_gpu_vram() -> int:
+    """Return the total memory of CUDA device 0, rounded to the nearest GB.
+
+    Returns:
+        0 when CUDA is unavailable, otherwise the device's ``total_memory``
+        converted to GB (1024 ** 3 bytes) and rounded to the nearest integer.
+    """
+    if not torch.cuda.is_available():
+        return 0
+    total_bytes = torch.cuda.get_device_properties(0).total_memory
+    # Round instead of flooring: a device whose reported total is just under
+    # a whole number of GB (e.g. 15.99) would otherwise count as 15 and fail
+    # a 16 GB ``min_vram`` threshold.
+    return int(round(total_bytes / (1024 ** 3)))
+def _pick_model(preferred: str = None) -> dict:
+    """Choose a MODEL_REGISTRY entry from the detected VRAM and a preference.
+
+    Args:
+        preferred: Registry key of the desired model, or None/empty to
+            select automatically.
+
+    Returns:
+        The registry entry of the chosen model: the preferred one when it is
+        a known key and the detected VRAM meets its ``min_vram``; otherwise
+        the entry with the highest ``min_vram`` that still fits; otherwise
+        the entry with the lowest ``min_vram``.
+    """
+    vram = _detect_gpu_vram()
+    print("[config] Oppdaget VRAM: %d GB" % vram)
+    if preferred:
+        model = MODEL_REGISTRY.get(preferred)
+        if model is None:
+            # An unknown name (e.g. a typo in BUDDY_VLM_MODEL) was previously
+            # ignored without notice; report it before auto-selecting.
+            print(
+                "[config] Advarsel: ukjent modell '%s', gyldige valg: %s"
+                % (preferred, ", ".join(MODEL_REGISTRY))
+            )
+        elif vram >= model["min_vram"]:
+            print("[config] Bruker foretrukken modell: %s" % preferred)
+            return model
+        else:
+            print("[config] Advarsel: %s trenger %dGB, har bare %dGB" % (preferred, model["min_vram"], vram))
+    # Derive the candidate order from the registry itself (highest VRAM
+    # requirement first) so newly registered models take part in
+    # auto-selection without editing a second hard-coded list.
+    ranked = sorted(
+        MODEL_REGISTRY.items(),
+        key=lambda item: item[1]["min_vram"],
+        reverse=True,
+    )
+    for name, model in ranked:
+        if vram >= model["min_vram"]:
+            print("[config] Auto-valgte modell: %s (%s)" % (name, model["id"]))
+            return model
+    # Nothing fits: fall back to the entry with the smallest requirement.
+    name, model = ranked[-1]
+    print("[config] Fallback: %s (minst VRAM-krav)" % name)
+    return model
 class MultimodalAssistant:
     """
+    Konfigurerbar multimodal assistent.
+    Miljøvariabler:
+      BUDDY_VLM_MODEL  -- qwen2.5-vl-3b | qwen2.5-vl-7b | qwen2-vl-2b
+      BUDDY_STT_MODEL  -- openai/whisper-large-v3 | openai/whisper-medium | openai/whisper-small
+      BUDDY_DEVICE     -- auto | cuda | cpu
     """
     def __init__(
         self,
+        vlm_model: str = None,
+        whisper_model: str = None,
+        device: str = None,
     ):
+        # --- Konfigurer ---
+        vlm_cfg = _pick_model(vlm_model or os.environ.get("BUDDY_VLM_MODEL"))
+        whisper_id = whisper_model or os.environ.get("BUDDY_STT_MODEL", "openai/whisper-large-v3")
+        device = device or os.environ.get("BUDDY_DEVICE", "auto")
+        self._vlm_model_id = vlm_cfg["id"]
+        self._vlm_class = vlm_cfg["class"]
+        self._supports_pixels = vlm_cfg["supports_pixels"]
         self.device = device
         self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+        # --- Last VLM ---
+        print("[assistant] Laster VLM: %s ..." % self._vlm_model_id)
+        self.vlm = self._vlm_class.from_pretrained(
+            self._vlm_model_id,
             torch_dtype="auto",
             device_map=device,
             trust_remote_code=True,
         )
         self.processor = AutoProcessor.from_pretrained(
+            self._vlm_model_id,
             trust_remote_code=True,
         )
+        print("[assistant] VLM lastet.")
+        # --- Last STT ---
+        print("[assistant] Laster STT: %s ..." % whisper_id)
         stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            whisper_id,
             torch_dtype=self.torch_dtype,
             low_cpu_mem_usage=True,
             use_safetensors=True,
         )
+        stt_model.to(
+            device if device != "auto" else ("cuda" if torch.cuda.is_available() else "cpu")
+        )
+        stt_processor = WhisperProcessor.from_pretrained(whisper_id)
         self.stt_pipe = pipeline(
             "automatic-speech-recognition",
             model=stt_model,
             torch_dtype=self.torch_dtype,
             device=0 if torch.cuda.is_available() else -1,
         )
+        print("[assistant] STT lastet.")
     def transcribe_audio(self, audio_bytes: bytes) -> str:
+        """Transkriber WAV bytes til norsk tekst."""
         import tempfile
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
             f.write(audio_bytes)
         )
         os.remove(tmp_path)
         text = result["text"].strip()
+        print("[stt] Transkribert: %s" % text)
         return text
     def ask_with_image(
         text: str,
         max_new_tokens: int = 512,
     ) -> str:
+        """Send screenshot + tekst til VLM og returner svar."""
         system_prompt = (
             "Du er en hjelpsom, norsk AI-assistent som ser brukerens skjermbilde. "
             "Svar konsist, presist og på norsk. Hvis spørsmålet er på engelsk, svar på engelsk."
         )
+        # Bygg bilde-element
+        if self._supports_pixels:
+            image_elem = {
+                "type": "image",
+                "image": image,
+                "min_pixels": 50176,
+                "max_pixels": 501760,
+            }
+        else:
+            # Qwen2-VL bruker annen syntaks
+            image_elem = {
+                "type": "image",
+                "image": image,
+            }
         messages = [
             {"role": "system", "content": system_prompt},
             {
                 "role": "user",
                 "content": [
+                    image_elem,
                     {"type": "text", "text": text},
                 ],
             },
             clean_up_tokenization_spaces=False,
         )[0]
+        print("[vlm] Svar: %s..." % response[:120])
         return response.strip()