russellyq commited on
Commit
3243c68
·
verified ·
1 Parent(s): ea86e6c

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +120 -0
README.md ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ObsDrive
2
+
3
+ ObsDrive is a vision-language model designed for multimodal autonomous driving understanding, supporting camera, LiDAR BEV, and RADAR BEV inputs.
4
+
5
+ ---
6
+
7
+
8
 + ## 📦 Requirements
9
+
10
+ ```bash
11
+ pip install torch transformers accelerate qwen-vl-utils flash-attn
12
+ ```
13
+
14
+ ## 🚀 Inference
15
+
16
 + ```python
17
+ import torch
18
+ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
19
+ from qwen_vl_utils import process_vision_info
20
+
21
+ MODEL_PATH = "russellyq/ObsDrive/XXX"
22
+
23
+ class ObsDrive:
24
+ def __init__(self, model_path=MODEL_PATH):
25
+ self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
26
+ model_path,
27
+ torch_dtype=torch.bfloat16,
28
+ device_map="auto",
29
+ attn_implementation="flash_attention_2",
30
+ )
31
+ self.MAX_NEW_TOKENS = 4096
32
+ self.processor = AutoProcessor.from_pretrained(
33
+ model_path,
34
+ )
35
+
36
+ def chat(self, question, image=None, system_prompt=None):
37
+ message_content = []
38
+
39
+ # Handle image input
40
+ if isinstance(image, str):
41
+ message_content.append({"type": "image", "image": f"file://{image}"})
42
+ elif isinstance(image, list):
43
+ for img_path in image:
44
+ message_content.append({"type": "image", "image": f"file://{img_path}"})
45
+
46
+ # Add text input
47
+ text = system_prompt + "\n" + question if system_prompt is not None else question
48
+ message_content.append({"type": "text", "text": text})
49
+
50
+ messages = [
51
+ {
52
+ "role": "user",
53
+ "content": message_content,
54
+ }
55
+ ]
56
+
57
+ # Apply chat template
58
+ text_prompt = self.processor.apply_chat_template(
59
+ messages,
60
+ tokenize=False,
61
+ add_generation_prompt=True,
62
+ )
63
+
64
+ image_inputs, video_inputs = process_vision_info(messages)
65
+
66
+ inputs = self.processor(
67
+ text=[text_prompt],
68
+ images=image_inputs,
69
+ videos=video_inputs,
70
+ padding=True,
71
+ return_tensors="pt",
72
+ )
73
+
74
+ inputs = inputs.to("cuda")
75
+
76
+ # Generate output
77
+ generated_ids = self.model.generate(
78
+ **inputs,
79
+ max_new_tokens=self.MAX_NEW_TOKENS,
80
+ do_sample=False,
81
+ )
82
+
83
+ generated_ids_trimmed = [
84
+ out_ids[len(in_ids):]
85
+ for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
86
+ ]
87
+
88
+ output_text = self.processor.batch_decode(
89
+ generated_ids_trimmed,
90
+ skip_special_tokens=True,
91
+ clean_up_tokenization_spaces=False,
92
+ )[0]
93
+
94
+ return output_text
95
+
96
+
97
+ if __name__ == "__main__":
98
+ model = ObsDrive()
99
+
100
+ question = "Please describe the driving scene."
101
+ image = "/path/to/image.png"
102
+
103
+ answer = model.chat(question, image=image)
104
+ print(answer)
105
+ ```
106
+ ## 🖼️ Multi-image Inference
107
 + ```python
108
+ question = "Analyze the scene using all modalities."
109
+
110
+ images = [
111
+ "/path/to/camera.png",
112
+ "/path/to/lidar_bev.png",
113
+ "/path/to/radar_bev.png",
114
+ ]
115
+
116
+ answer = model.chat(question, image=images)
117
+ print(answer)
118
+ ```
119
+
120
+