russellyq
/

ObsDrive

Model card Files Files and versions

russellyq committed on 17 days ago

Commit

3243c68

·

verified ·

1 Parent(s): ea86e6c

Create README.md

Files changed (1) hide show

README.md +120 -0

README.md ADDED Viewed

	@@ -0,0 +1,120 @@

+# ObsDrive
+ObsDrive is a vision-language model designed for multimodal autonomous driving understanding, supporting camera, LiDAR BEV, and RADAR BEV inputs.
+---
+## 📦 Requirements
+```bash
+pip install torch transformers accelerate qwen-vl-utils flash-attn
+```
+## 🚀 Inference
+```python
+import torch
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+from qwen_vl_utils import process_vision_info
+MODEL_PATH = "russellyq/ObsDrive/XXX"
+class ObsDrive:
+    def __init__(self, model_path=MODEL_PATH):
+        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_path,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            attn_implementation="flash_attention_2",
+        )
+        self.MAX_NEW_TOKENS = 4096
+        self.processor = AutoProcessor.from_pretrained(
+            model_path,
+        )
+    def chat(self, question, image=None, system_prompt=None):
+        message_content = []
+        # Handle image input
+        if isinstance(image, str):
+            message_content.append({"type": "image", "image": f"file://{image}"})
+        elif isinstance(image, list):
+            for img_path in image:
+                message_content.append({"type": "image", "image": f"file://{img_path}"})
+        # Add text input
+        text = system_prompt + "\n" + question if system_prompt is not None else question
+        message_content.append({"type": "text", "text": text})
+        messages = [
+            {
+                "role": "user",
+                "content": message_content,
+            }
+        ]
+        # Apply chat template
+        text_prompt = self.processor.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = self.processor(
+            text=[text_prompt],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to("cuda")
+        # Generate output
+        generated_ids = self.model.generate(
+            **inputs,
+            max_new_tokens=self.MAX_NEW_TOKENS,
+            do_sample=False,
+        )
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):]
+            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = self.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )[0]
+        return output_text
+# Minimal single-image example: load the model, ask one question, print the reply.
+if __name__ == "__main__":
+    # Uses the default MODEL_PATH; loading happens inside ObsDrive.__init__.
+    model = ObsDrive()
+    question = "Please describe the driving scene."
+    # Local file path; chat() turns it into a file:// URI.
+    image = "/path/to/image.png"
+    answer = model.chat(question, image=image)
+    print(answer)
+```
+## 🖼️ Multi-image Inference
+```python
+# Assumes `model` is an ObsDrive instance, e.g. the one created in the
+# Inference example.
+question = "Analyze the scene using all modalities."
+# Passing a list sends every image in the same user message, in list order.
+images = [
+    "/path/to/camera.png",
+    "/path/to/lidar_bev.png",
+    "/path/to/radar_bev.png",
+]
+answer = model.chat(question, image=images)
+print(answer)
+```