carbonx committed on
Commit
b28ee4c
·
verified ·
1 Parent(s): 45ee2cb

Add configurable models with auto VRAM detection

Browse files
Files changed (1) hide show
  1. vision_llm.py +106 -28
vision_llm.py CHANGED
@@ -1,11 +1,11 @@
1
- """Multimodal Vision-Language Model (Qwen2.5-VL) wrapper."""
2
  import os
3
  import torch
4
  from PIL import Image
5
- from typing import Optional, Union, List
6
 
7
  from transformers import (
8
  Qwen2_5_VLForConditionalGeneration,
 
9
  AutoProcessor,
10
  AutoModelForSpeechSeq2Seq,
11
  AutoProcessor as WhisperProcessor,
@@ -14,44 +14,112 @@ from transformers import (
14
  from qwen_vl_utils import process_vision_info
15
 
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  class MultimodalAssistant:
18
  """
19
- Combines:
20
- - Qwen2.5-VL-7B for vision+language understanding
21
- - Whisper for STT
 
 
 
22
  """
23
 
24
  def __init__(
25
  self,
26
- vlm_model_id: str = "Qwen/Qwen2.5-VL-7B-Instruct",
27
- whisper_model_id: str = "openai/whisper-large-v3",
28
- device: str = "auto",
29
  ):
 
 
 
 
 
 
 
 
30
  self.device = device
31
  self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
32
 
33
- print("[assistant] Loading VLM: %s ..." % vlm_model_id)
34
- self.vlm = Qwen2_5_VLForConditionalGeneration.from_pretrained(
35
- vlm_model_id,
 
36
  torch_dtype="auto",
37
  device_map=device,
38
  trust_remote_code=True,
39
  )
40
  self.processor = AutoProcessor.from_pretrained(
41
- vlm_model_id,
42
  trust_remote_code=True,
43
  )
44
- print("[assistant] VLM loaded.")
45
 
46
- print("[assistant] Loading STT: %s ..." % whisper_model_id)
 
47
  stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
48
- whisper_model_id,
49
  torch_dtype=self.torch_dtype,
50
  low_cpu_mem_usage=True,
51
  use_safetensors=True,
52
  )
53
- stt_model.to(self.device if self.device != "auto" else ("cuda" if torch.cuda.is_available() else "cpu"))
54
- stt_processor = WhisperProcessor.from_pretrained(whisper_model_id)
 
 
55
  self.stt_pipe = pipeline(
56
  "automatic-speech-recognition",
57
  model=stt_model,
@@ -60,10 +128,10 @@ class MultimodalAssistant:
60
  torch_dtype=self.torch_dtype,
61
  device=0 if torch.cuda.is_available() else -1,
62
  )
63
- print("[assistant] STT loaded.")
64
 
65
  def transcribe_audio(self, audio_bytes: bytes) -> str:
66
- """Transcribe WAV bytes to Norwegian text."""
67
  import tempfile
68
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
69
  f.write(audio_bytes)
@@ -75,7 +143,7 @@ class MultimodalAssistant:
75
  )
76
  os.remove(tmp_path)
77
  text = result["text"].strip()
78
- print("[stt] Transcribed: %s" % text)
79
  return text
80
 
81
  def ask_with_image(
@@ -84,24 +152,34 @@ class MultimodalAssistant:
84
  text: str,
85
  max_new_tokens: int = 512,
86
  ) -> str:
87
- """Send a screenshot + text prompt to Qwen2.5-VL and return response."""
88
 
89
  system_prompt = (
90
  "Du er en hjelpsom, norsk AI-assistent som ser brukerens skjermbilde. "
91
  "Svar konsist, presist og på norsk. Hvis spørsmålet er på engelsk, svar på engelsk."
92
  )
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  messages = [
95
  {"role": "system", "content": system_prompt},
96
  {
97
  "role": "user",
98
  "content": [
99
- {
100
- "type": "image",
101
- "image": image,
102
- "min_pixels": 50176,
103
- "max_pixels": 501760,
104
- },
105
  {"type": "text", "text": text},
106
  ],
107
  },
@@ -131,5 +209,5 @@ class MultimodalAssistant:
131
  clean_up_tokenization_spaces=False,
132
  )[0]
133
 
134
- print("[vlm] Response: %s..." % response[:120])
135
  return response.strip()
 
1
+ """Multimodal Vision-Language Model wrapper med konfigurerbar modell."""
2
  import os
3
  import torch
4
  from PIL import Image
 
5
 
6
  from transformers import (
7
  Qwen2_5_VLForConditionalGeneration,
8
+ Qwen2VLForConditionalGeneration,
9
  AutoProcessor,
10
  AutoModelForSpeechSeq2Seq,
11
  AutoProcessor as WhisperProcessor,
 
14
  from qwen_vl_utils import process_vision_info
15
 
16
 
17
+ # Modellkart: vennlig navn -> konfig-dict med nøklene id, min_vram (GB), class og supports_pixels
18
+ MODEL_REGISTRY = {
19
+ "qwen2.5-vl-3b": {
20
+ "id": "Qwen/Qwen2.5-VL-3B-Instruct",
21
+ "min_vram": 7,
22
+ "class": Qwen2_5_VLForConditionalGeneration,
23
+ "supports_pixels": True, # Qwen2.5 støtter min/max_pixels
24
+ },
25
+ "qwen2.5-vl-7b": {
26
+ "id": "Qwen/Qwen2.5-VL-7B-Instruct",
27
+ "min_vram": 16,
28
+ "class": Qwen2_5_VLForConditionalGeneration,
29
+ "supports_pixels": True,
30
+ },
31
+ "qwen2-vl-2b": {
32
+ "id": "Qwen/Qwen2-VL-2B-Instruct",
33
+ "min_vram": 5,
34
+ "class": Qwen2VLForConditionalGeneration,
35
+ "supports_pixels": False, # Qwen2 bruker annen syntaks
36
+ },
37
+ }
38
+
39
+
40
def _detect_gpu_vram() -> int:
    """Return the total memory of CUDA device 0 in whole GiB.

    Returns 0 when CUDA is not available. The byte count reported by
    torch is floor-divided by 2**30, so any fractional GiB is dropped.
    """
    if torch.cuda.is_available():
        total_bytes = torch.cuda.get_device_properties(0).total_memory
        return total_bytes // (1 << 30)
    return 0
45
+
46
+
47
def _pick_model(preferred: str = None) -> dict:
    """Choose a VLM entry from ``MODEL_REGISTRY`` using detected VRAM.

    Selection order:
      1. ``preferred`` if it names a registry entry whose ``min_vram``
         fits in the detected VRAM.
      2. Otherwise the entry with the highest ``min_vram`` that still fits.
      3. Otherwise the entry with the lowest ``min_vram`` (fallback).

    Args:
        preferred: Key into ``MODEL_REGISTRY`` (e.g. ``"qwen2.5-vl-3b"``),
            or ``None``/empty string to select automatically.

    Returns:
        The selected ``MODEL_REGISTRY`` entry (a dict).
    """
    vram = _detect_gpu_vram()
    print("[config] Oppdaget VRAM: %d GB" % vram)

    if preferred:
        model = MODEL_REGISTRY.get(preferred)
        if model is None:
            # An unknown name (e.g. a typo in the environment variable) used to
            # be ignored without any message; tell the user before falling back.
            print(
                "[config] Advarsel: ukjent modell '%s', velger automatisk. Gyldige valg: %s"
                % (preferred, ", ".join(MODEL_REGISTRY))
            )
        elif vram >= model["min_vram"]:
            print("[config] Bruker foretrukken modell: %s" % preferred)
            return model
        else:
            print(
                "[config] Advarsel: %s trenger %dGB, har bare %dGB"
                % (preferred, model["min_vram"], vram)
            )

    # Derive the candidate order from the registry itself (largest VRAM
    # requirement first) so new registry entries are considered automatically.
    by_vram_desc = sorted(
        MODEL_REGISTRY.items(),
        key=lambda item: item[1]["min_vram"],
        reverse=True,
    )
    for name, model in by_vram_desc:
        if vram >= model["min_vram"]:
            print("[config] Auto-valgte modell: %s (%s)" % (name, model["id"]))
            return model

    # Nothing fits: fall back to the entry with the smallest requirement.
    fallback_name, fallback_model = by_vram_desc[-1]
    print("[config] Fallback: %s (minst VRAM-krav)" % fallback_name)
    return fallback_model
68
+
69
+
70
  class MultimodalAssistant:
71
  """
72
+ Konfigurerbar multimodal assistent.
73
+
74
+ Miljøvariabler:
75
+ BUDDY_VLM_MODEL -- qwen2.5-vl-3b | qwen2.5-vl-7b | qwen2-vl-2b
76
+ BUDDY_STT_MODEL -- openai/whisper-large-v3 | openai/whisper-medium | openai/whisper-small
77
+ BUDDY_DEVICE -- auto | cuda | cpu
78
  """
79
 
80
  def __init__(
81
  self,
82
+ vlm_model: str = None,
83
+ whisper_model: str = None,
84
+ device: str = None,
85
  ):
86
+ # --- Konfigurer ---
87
+ vlm_cfg = _pick_model(vlm_model or os.environ.get("BUDDY_VLM_MODEL"))
88
+ whisper_id = whisper_model or os.environ.get("BUDDY_STT_MODEL", "openai/whisper-large-v3")
89
+ device = device or os.environ.get("BUDDY_DEVICE", "auto")
90
+
91
+ self._vlm_model_id = vlm_cfg["id"]
92
+ self._vlm_class = vlm_cfg["class"]
93
+ self._supports_pixels = vlm_cfg["supports_pixels"]
94
  self.device = device
95
  self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
96
 
97
+ # --- Last VLM ---
98
+ print("[assistant] Laster VLM: %s ..." % self._vlm_model_id)
99
+ self.vlm = self._vlm_class.from_pretrained(
100
+ self._vlm_model_id,
101
  torch_dtype="auto",
102
  device_map=device,
103
  trust_remote_code=True,
104
  )
105
  self.processor = AutoProcessor.from_pretrained(
106
+ self._vlm_model_id,
107
  trust_remote_code=True,
108
  )
109
+ print("[assistant] VLM lastet.")
110
 
111
+ # --- Last STT ---
112
+ print("[assistant] Laster STT: %s ..." % whisper_id)
113
  stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
114
+ whisper_id,
115
  torch_dtype=self.torch_dtype,
116
  low_cpu_mem_usage=True,
117
  use_safetensors=True,
118
  )
119
+ stt_model.to(
120
+ device if device != "auto" else ("cuda" if torch.cuda.is_available() else "cpu")
121
+ )
122
+ stt_processor = WhisperProcessor.from_pretrained(whisper_id)
123
  self.stt_pipe = pipeline(
124
  "automatic-speech-recognition",
125
  model=stt_model,
 
128
  torch_dtype=self.torch_dtype,
129
  device=0 if torch.cuda.is_available() else -1,
130
  )
131
+ print("[assistant] STT lastet.")
132
 
133
  def transcribe_audio(self, audio_bytes: bytes) -> str:
134
+ """Transkriber WAV bytes til norsk tekst."""
135
  import tempfile
136
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
137
  f.write(audio_bytes)
 
143
  )
144
  os.remove(tmp_path)
145
  text = result["text"].strip()
146
+ print("[stt] Transkribert: %s" % text)
147
  return text
148
 
149
  def ask_with_image(
 
152
  text: str,
153
  max_new_tokens: int = 512,
154
  ) -> str:
155
+ """Send screenshot + tekst til VLM og returner svar."""
156
 
157
  system_prompt = (
158
  "Du er en hjelpsom, norsk AI-assistent som ser brukerens skjermbilde. "
159
  "Svar konsist, presist og på norsk. Hvis spørsmålet er på engelsk, svar på engelsk."
160
  )
161
 
162
+ # Bygg bilde-element
163
+ if self._supports_pixels:
164
+ image_elem = {
165
+ "type": "image",
166
+ "image": image,
167
+ "min_pixels": 50176,
168
+ "max_pixels": 501760,
169
+ }
170
+ else:
171
+ # Qwen2-VL bruker annen syntaks
172
+ image_elem = {
173
+ "type": "image",
174
+ "image": image,
175
+ }
176
+
177
  messages = [
178
  {"role": "system", "content": system_prompt},
179
  {
180
  "role": "user",
181
  "content": [
182
+ image_elem,
 
 
 
 
 
183
  {"type": "text", "text": text},
184
  ],
185
  },
 
209
  clean_up_tokenization_spaces=False,
210
  )[0]
211
 
212
+ print("[vlm] Svar: %s..." % response[:120])
213
  return response.strip()