carbonx
/

buddy-desktop

Model card Files Files and versions

xet

Community

carbonx committed 11 days ago

Commit

9f45f5e

verified ·

1 Parent(s): 2e7e0eb

Add vision_llm.py

Browse files

Files changed (1) hide show

vision_llm.py +135 -0

vision_llm.py ADDED Viewed

	@@ -0,0 +1,135 @@

+"""Multimodal Vision-Language Model (Qwen2.5-VL) wrapper."""
+import os
+import torch
+from PIL import Image
+from typing import Optional, Union, List
+from transformers import (
+    Qwen2_5_VLForConditionalGeneration,
+    AutoProcessor,
+    AutoModelForSpeechSeq2Seq,
+    AutoProcessor as WhisperProcessor,
+    pipeline,
+)
+from qwen_vl_utils import process_vision_info
+class MultimodalAssistant:
+    """
+    Combines:
+      - Qwen2.5-VL-7B for vision+language understanding
+      - Whisper for STT
+    """
+    def __init__(
+        self,
+        vlm_model_id: str = "Qwen/Qwen2.5-VL-7B-Instruct",
+        whisper_model_id: str = "openai/whisper-large-v3",
+        device: str = "auto",
+    ):
+        self.device = device
+        self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+        print("[assistant] Loading VLM: %s ..." % vlm_model_id)
+        self.vlm = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            vlm_model_id,
+            torch_dtype="auto",
+            device_map=device,
+            trust_remote_code=True,
+        )
+        self.processor = AutoProcessor.from_pretrained(
+            vlm_model_id,
+            trust_remote_code=True,
+        )
+        print("[assistant] VLM loaded.")
+        print("[assistant] Loading STT: %s ..." % whisper_model_id)
+        stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            whisper_model_id,
+            torch_dtype=self.torch_dtype,
+            low_cpu_mem_usage=True,
+            use_safetensors=True,
+        )
+        stt_model.to(self.device if self.device != "auto" else ("cuda" if torch.cuda.is_available() else "cpu"))
+        stt_processor = WhisperProcessor.from_pretrained(whisper_model_id)
+        self.stt_pipe = pipeline(
+            "automatic-speech-recognition",
+            model=stt_model,
+            tokenizer=stt_processor.tokenizer,
+            feature_extractor=stt_processor.feature_extractor,
+            torch_dtype=self.torch_dtype,
+            device=0 if torch.cuda.is_available() else -1,
+        )
+        print("[assistant] STT loaded.")
+    def transcribe_audio(self, audio_bytes: bytes) -> str:
+        """Transcribe WAV bytes to Norwegian text."""
+        import tempfile
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+            f.write(audio_bytes)
+            tmp_path = f.name
+        result = self.stt_pipe(
+            tmp_path,
+            generate_kwargs={"language": "no", "task": "transcribe"},
+        )
+        os.remove(tmp_path)
+        text = result["text"].strip()
+        print("[stt] Transcribed: %s" % text)
+        return text
+    def ask_with_image(
+        self,
+        image: Image.Image,
+        text: str,
+        max_new_tokens: int = 512,
+    ) -> str:
+        """Send a screenshot + text prompt to Qwen2.5-VL and return response."""
+        system_prompt = (
+            "Du er en hjelpsom, norsk AI-assistent som ser brukerens skjermbilde. "
+            "Svar konsist, presist og på norsk. Hvis spørsmålet er på engelsk, svar på engelsk."
+        )
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": image,
+                        "min_pixels": 50176,
+                        "max_pixels": 501760,
+                    },
+                    {"type": "text", "text": text},
+                ],
+            },
+        ]
+        text_input = self.processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = self.processor(
+            text=[text_input],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(self.vlm.device)
+        generated_ids = self.vlm.generate(**inputs, max_new_tokens=max_new_tokens)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):]
+            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        response = self.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )[0]
+        print("[vlm] Response: %s..." % response[:120])
+        return response.strip()