ayh015 committed on
Commit
e3044d8
·
1 Parent(s): ecc5e33

Update model (renamed files) and update the scripts/tools for COCO annotation

Browse files
scripts/annotate_coco.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
# Annotate COCO keypoint images with long descriptions via tools/annotate_coco.py,
# running a single-GPU torchrun job.

IDX=1                      # GPU index exported as CUDA_VISIBLE_DEVICES
export PYTHONPATH=$PYTHONPATH:./

data_path=../datasets/coco
model_path=./model_weights/qwen3_30b_vl_instruct/models
output_dir=outputs/coco

# FIX: plain `mkdir outputs/coco` fails when the parent `outputs/` does not
# exist yet; -p creates parents and is a no-op if the directory already exists.
mkdir -p "${output_dir}"

CUDA_VISIBLE_DEVICES=$IDX OMP_NUM_THREADS=1 torchrun --nnodes=1 --nproc_per_node=1 --master_port=25006 \
    tools/annotate_coco.py \
    --model-path "${model_path}" \
    --data-path "${data_path}" \
    --output-dir "${output_dir}"
scripts/{annotate.sh → annotate_hico.sh} RENAMED
@@ -1,4 +1,4 @@
1
- IDX=0,4,
2
  export PYTHONPATH=$PYTHONPATH:./
3
 
4
  data_path=../datasets/HICO-Det
@@ -11,8 +11,8 @@ else
11
  mkdir ${output_dir}
12
  fi
13
 
14
- CUDA_VISIBLE_DEVICES=$IDX OMP_NUM_THREADS=1 torchrun --nnodes=1 --nproc_per_node=2 --master_port=25005 \
15
- tools/annotate.py \
16
  --model-path ${model_path} \
17
  --data-path ${data_path} \
18
  --output-dir ${output_dir} \
 
1
+ IDX=2,
2
  export PYTHONPATH=$PYTHONPATH:./
3
 
4
  data_path=../datasets/HICO-Det
 
11
  mkdir ${output_dir}
12
  fi
13
 
14
+ CUDA_VISIBLE_DEVICES=$IDX OMP_NUM_THREADS=1 torchrun --nnodes=1 --nproc_per_node=1 --master_port=25006 \
15
+ tools/annotate_hico.py \
16
  --model-path ${model_path} \
17
  --data-path ${data_path} \
18
  --output-dir ${output_dir} \
scripts/clean_initial_annotation.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
# Clean the initial HICO-Det annotations with tools/clean_initial_annotation.py,
# running a 2-GPU torchrun job.

IDX=1,2                    # GPU indices exported as CUDA_VISIBLE_DEVICES
export PYTHONPATH=$PYTHONPATH:./

data_path=../datasets/HICO-Det
model_path=./model_weights/qwen3_8b_vl_instruct
output_dir=outputs

# -p: no-op when the directory already exists, creates parents otherwise;
# replaces the if/else existence check with one robust call.
mkdir -p "${output_dir}"

CUDA_VISIBLE_DEVICES=$IDX OMP_NUM_THREADS=1 torchrun --nnodes=1 --nproc_per_node=2 --master_port=25006 \
    tools/clean_initial_annotation.py \
    --model-path "${model_path}" \
    --data-path "${data_path}" \
    --output-dir "${output_dir}"
tools/annotate_coco.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+
7
+ import torch
8
+ import os, json
9
+ import torch.distributed as dist
10
+ from torch.utils.data import DataLoader
11
+ from torchvision import transforms as T
12
+
13
+ from data.pose_coco import PoseCOCODataset
14
+ from data.convsersation import Conversation_For_COCO_Long_Description
15
+
16
+ import re
17
+ from dataclasses import dataclass
18
+
19
+ from transformers import Qwen3VLForConditionalGeneration, Qwen3VLMoeForConditionalGeneration, AutoModelForCausalLM
20
+ from transformers import AutoTokenizer, AutoConfig, AutoProcessor
21
+
def disable_torch_init():
    """Stub out torch's default layer initialization to speed up model creation.

    Pretrained weights are loaded right after construction, so the default
    (random) init of Linear/LayerNorm is wasted work; replace it with a no-op.
    """
    for module_cls in (torch.nn.Linear, torch.nn.LayerNorm):
        module_cls.reset_parameters = lambda self: None
28
+
def gather_labels_and_save(labels, output_path):
    """Gather each rank's label list onto rank 0 and write one merged JSON file.

    Assumes torch.distributed is already initialized (torchrun / deepspeed /
    accelerate usually does this before the worker runs).
    """
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # gathered[i] receives the `labels` list contributed by rank i.
    gathered = [None] * world_size
    dist.all_gather_object(gathered, labels)

    if rank == 0:
        merged = [item for part in gathered for item in part]
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(merged, f, ensure_ascii=False, indent=2)

    # Hold every rank here until rank 0 has finished writing the file.
    dist.barrier()
46
+
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate COCO pose samples into processor tensors plus per-sample metadata."""

    def __init__(self, processor, data_path):
        self.processor = processor
        self.conv = Conversation_For_COCO_Long_Description(
            system='',
            data_path=data_path
        )

    def __call__(self, data_dicts):
        """Collate examples for supervised fine-tuning.

        Returns:
            (batch_tensors, result_meta): processor output (input_ids, pixel
            values, ...) and the original sample dicts, in matching order.
        """
        batch_prompts = []
        batch_images = []
        result_meta = []

        for data_dict in data_dicts:
            batch_images.append(data_dict['image'])
            batch_prompts.append(self.conv.get_prompt(data_dict))
            result_meta.append(data_dict)

        messages = []
        # FIX: original iterated `zip(batch_prompts)`, which yields 1-tuples,
        # so the "text" field received a tuple instead of the prompt string.
        for prompt in batch_prompts:
            messages.append([
                {"role": "system",
                 "content": [
                     {"type": "text",
                      "text": self.conv.system}, ]},
                {"role": "user",
                 "content": [
                     {"type": "image"},
                     {"type": "text",
                      "text": prompt}, ]},
            ])

        prompts = [self.processor.apply_chat_template(m,
                                                      tokenize=False,
                                                      add_generation_prompt=True)
                   for m in messages]
        batch_tensors = self.processor(
            text=prompts,
            images=batch_images,
            return_tensors="pt",
            padding=True
        )
        return batch_tensors, result_meta
92
+
@torch.no_grad()
def worker(model, processor, dataset, args, output_dir):
    """Annotate this rank's shard of the COCO dataset and write labels_<rank>.json.

    Each rank takes a strided split of the dataset, generates one description
    per image with `model`, and dumps its own JSON shard into args.output_dir.
    NOTE(review): the `output_dir` parameter is unused — the function writes to
    args.output_dir instead; confirm which is intended.
    """
    rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    # Strided sharding: rank r handles samples r, r+world_size, r+2*world_size, ...
    indices = list(range(rank, len(dataset), world_size))
    print("==>" + " Worker {} Started, responsible for {} images".format(rank, len(indices)))

    sub_dataset = torch.utils.data.Subset(dataset, indices)
    batch_size = 1
    data_loader = DataLoader(sub_dataset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=DataCollatorForSupervisedDataset(processor, args.data_path))
    labels = []

    for batch_tensors, result_meta in tqdm(data_loader):

        input_ids = batch_tensors['input_ids'].cuda()
        # Move every tensor in the batch to the GPU; non-tensor entries are dropped.
        batch_tensors = {k: v.cuda() for k, v in batch_tensors.items() if isinstance(v, torch.Tensor)}
        with torch.inference_mode():
            # Greedy decoding; no_repeat_ngram_size=4 curbs repetitive generations.
            output_dict = model.generate(do_sample=False,
                                         output_scores=True,
                                         return_dict_in_generate=True,
                                         max_new_tokens=1600,
                                         output_logits=True,
                                         #repetition_penalty=1.0,
                                         no_repeat_ngram_size=4,
                                         **batch_tensors,)
        output_ids = output_dict['sequences']

        for input_id, output_id, meta in zip(input_ids, output_ids, result_meta):
            # Generated sequences echo the prompt tokens; strip them before decoding.
            input_token_len = input_id.shape[0]
            n_diff_input_output = (input_id != output_id[:input_token_len]).sum().item()
            if n_diff_input_output > 0:
                print(f'[Warning] Sample: {n_diff_input_output} output_ids are not the same as the input_ids')
            output = processor.tokenizer.batch_decode(output_id[input_token_len:].unsqueeze(0), skip_special_tokens=True)[0]
            labels.append({
                'file_name': meta['file_name'],
                'image_id': meta['image_id'],
                'keypoints': meta['joints'].reshape(-1).tolist(),
                'vis': meta['joints_vis'].reshape(-1).tolist(),
                'im_height': meta['image_size'][0],
                'im_width': meta['image_size'][1],
                'human_bbox': meta['human_bbox'],
                'description': output,
            })

    # Each rank writes its own shard; gather_labels_and_save can merge them.
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    output_path = os.path.join(args.output_dir, f'labels_{local_rank}.json')
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(labels, f, ensure_ascii=False, indent=2)
142
+
def eval_model(args):
    """Set up NCCL process group, load Qwen3-VL-MoE + processor, run the worker."""
    torch.distributed.init_process_group(backend='nccl')
    rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    print('Init process group: world_size: {}, rank: {}'.format(world_size, rank))
    torch.cuda.set_device(rank)

    # Skip default weight init; real weights come from the checkpoint below.
    disable_torch_init()
    model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
        args.model_path,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    )
    model = model.cuda()
    model.eval()

    processor = AutoProcessor.from_pretrained(
        args.model_path,
        trust_remote_code=True)
    # Left padding + eos-as-pad: needed for batched generation with decoder-only models.
    processor.tokenizer.padding_side = "left"
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

    # COCO train2017 keypoint annotations; augmentation off for annotation runs.
    dataset = PoseCOCODataset(
        data_path=os.path.join(args.data_path, 'annotations', 'person_keypoints_train2017.json'),
        multimodal_cfg=dict(image_folder=os.path.join(args.data_path, 'train2017'),
                            data_augmentation=False,
                            image_size=336,),)
    worker(model, processor, dataset, args, args.output_dir)
172
+
if __name__ == "__main__":
    # CLI entry point: parse the three path options and start annotation.
    cli = argparse.ArgumentParser()
    cli.add_argument("--model-path", type=str, default="facebook/opt-350m")
    cli.add_argument("--data-path", type=str, default="")
    cli.add_argument("--output-dir", type=str, default="")
    eval_model(cli.parse_args())
181
+
tools/{annotate.py → annotate_hico.py} RENAMED
@@ -126,7 +126,6 @@ def worker(model, processor, dataset, args, output_dir):
126
  if n_diff_input_output > 0:
127
  print(f'[Warning] Sample: {n_diff_input_output} output_ids are not the same as the input_ids')
128
  output = processor.tokenizer.batch_decode(output_id[input_token_len:].unsqueeze(0), skip_special_tokens=True)[0]
129
-
130
  labels.append({
131
  'file_name': meta['file_name'],
132
  'image_id': meta['image_id'],
@@ -135,6 +134,7 @@ def worker(model, processor, dataset, args, output_dir):
135
  'vis': meta['joints_3d_vis'].reshape(-1).tolist(),
136
  'im_height': meta['hoi_obj']['height'],
137
  'im_width': meta['hoi_obj']['width'],
 
138
  'human_bbox': meta['hoi_obj']['human_bbox'],
139
  'object_bbox': meta['hoi_obj']['object_bbox'],
140
  'action_labels': meta['hoi_obj']['action_labels'],
 
126
  if n_diff_input_output > 0:
127
  print(f'[Warning] Sample: {n_diff_input_output} output_ids are not the same as the input_ids')
128
  output = processor.tokenizer.batch_decode(output_id[input_token_len:].unsqueeze(0), skip_special_tokens=True)[0]
 
129
  labels.append({
130
  'file_name': meta['file_name'],
131
  'image_id': meta['image_id'],
 
134
  'vis': meta['joints_3d_vis'].reshape(-1).tolist(),
135
  'im_height': meta['hoi_obj']['height'],
136
  'im_width': meta['hoi_obj']['width'],
137
+ 'hoi_id': meta['hoi_obj']['hoi_id'],
138
  'human_bbox': meta['hoi_obj']['human_bbox'],
139
  'object_bbox': meta['hoi_obj']['object_bbox'],
140
  'action_labels': meta['hoi_obj']['action_labels'],
tools/clean_initial_annotation.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ from torch.utils.data import DataLoader
10
+ from torchvision import transforms as T
11
+
12
+ from data.dataset_for_clean_descrip import PoseHICODetDataset
13
+ from data.convsersation import Conversation_For_Action_Pharse as Conversation
14
+
15
+ import re
16
+ from dataclasses import dataclass
17
+
18
+ from transformers import Qwen3VLForConditionalGeneration
19
+ from transformers import AutoTokenizer, AutoConfig, AutoProcessor
20
+
def disable_torch_init():
    """Stub out torch's default layer initialization to speed up model creation.

    The checkpoint loaded afterwards overwrites all parameters anyway, so the
    default (random) init of Linear/LayerNorm is pure overhead.
    """
    for module_cls in (torch.nn.Linear, torch.nn.LayerNorm):
        module_cls.reset_parameters = lambda self: None
27
+
28
+ import os, json
29
+ import torch
30
+
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate HICO-Det samples into processor tensors plus per-sample metadata."""

    def __init__(self, processor, data_path):
        self.processor = processor
        self.conv = Conversation(
            system='',
            data_path=data_path
        )

    def __call__(self, data_dicts):
        """Collate examples for supervised fine-tuning.

        Returns:
            (batch_tensors, result_meta): processor output (input_ids, pixel
            values, ...) and each sample's `meta` dict, in matching order.
        """
        batch_prompts = []
        batch_images = []
        result_meta = []

        for data_dict in data_dicts:
            batch_images.append(data_dict['image'])
            batch_prompts.append(self.conv.get_prompt(data_dict['meta']))
            result_meta.append(data_dict['meta'])

        messages = []
        # FIX: original iterated `zip(batch_prompts)`, which yields 1-tuples,
        # so the "text" field received a tuple instead of the prompt string.
        for prompt in batch_prompts:
            messages.append([
                {"role": "system",
                 "content": [
                     {"type": "text",
                      "text": self.conv.system}, ]},
                {"role": "user",
                 "content": [
                     {"type": "image"},
                     {"type": "text",
                      "text": prompt}, ]},
            ])

        prompts = [self.processor.apply_chat_template(m,
                                                      tokenize=False,
                                                      add_generation_prompt=True)
                   for m in messages]
        batch_tensors = self.processor(
            text=prompts,
            images=batch_images,
            return_tensors="pt",
            padding=True
        )
        return batch_tensors, result_meta
76
+
@torch.no_grad()
def worker(model, processor, dataset, args, output_dir):
    """Re-describe this rank's shard of HICO-Det actions and write labels_<rank>.json.

    Each rank takes a strided split, generates an `action_description` per
    sample with `model`, attaches it to the sample's meta dict, and dumps its
    own JSON shard into args.output_dir.  NOTE(review): the `output_dir`
    parameter is unused — the function writes to args.output_dir instead.
    """
    rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    # Strided sharding: rank r handles samples r, r+world_size, r+2*world_size, ...
    indices = list(range(rank, len(dataset), world_size))
    print("==>" + " Worker {} Started, responsible for {} images".format(rank, len(indices)))

    sub_dataset = torch.utils.data.Subset(dataset, indices)
    batch_size = 16
    data_loader = DataLoader(sub_dataset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=DataCollatorForSupervisedDataset(processor, args.data_path))
    labels = []

    for batch_tensors, result_meta in tqdm(data_loader):

        input_ids = batch_tensors['input_ids'].cuda()
        # Move every tensor in the batch to the GPU; non-tensor entries are dropped.
        batch_tensors = {k: v.cuda() for k, v in batch_tensors.items() if isinstance(v, torch.Tensor)}
        with torch.inference_mode():
            # Greedy decoding, up to 1600 new tokens per sample.
            output_dict = model.generate(do_sample=False,
                                         output_scores=True,
                                         return_dict_in_generate=True,
                                         max_new_tokens=1600,
                                         output_logits=True,
                                         **batch_tensors,)

        output_ids = output_dict['sequences']

        for input_id, output_id, meta in zip(input_ids, output_ids, result_meta):
            # Generated sequences echo the prompt tokens; strip them before decoding.
            input_token_len = input_id.shape[0]
            n_diff_input_output = (input_id != output_id[:input_token_len]).sum().item()
            if n_diff_input_output > 0:
                print(f'[Warning] Sample: {n_diff_input_output} output_ids are not the same as the input_ids')
            output = processor.tokenizer.batch_decode(output_id[input_token_len:].unsqueeze(0), skip_special_tokens=True)[0]
            # Attach the cleaned description to the sample's metadata in place.
            meta['action_description'] = output
            labels.append(meta)

    # Each rank writes its own shard; merge the labels_*.json files downstream.
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    output_path = os.path.join(args.output_dir, f'labels_{local_rank}.json')
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(labels, f, ensure_ascii=False, indent=2)
121
+
def eval_model(args):
    """Set up NCCL process group, load Qwen3-VL + processor, run the worker."""
    torch.distributed.init_process_group(backend='nccl')
    rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    print('Init process group: world_size: {}, rank: {}'.format(world_size, rank))
    torch.cuda.set_device(rank)

    # Skip default weight init; real weights come from the checkpoint below.
    disable_torch_init()
    model = Qwen3VLForConditionalGeneration.from_pretrained(
        args.model_path,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    )
    model = model.cuda()
    model.eval()

    processor = AutoProcessor.from_pretrained(
        args.model_path,
        trust_remote_code=True)
    # Left padding + eos-as-pad: needed for batched generation with decoder-only models.
    processor.tokenizer.padding_side = "left"
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

    # HICO-Det train2015 images; augmentation off for annotation-cleaning runs.
    dataset = PoseHICODetDataset(
        data_path=args.data_path,
        multimodal_cfg=dict(image_folder=os.path.join(args.data_path, 'Images/images/train2015'),
                            data_augmentation=False,
                            image_size=336,),)
    worker(model, processor, dataset, args, args.output_dir)
151
+
if __name__ == "__main__":
    # CLI entry point: parse the three path options and start the cleanup run.
    cli = argparse.ArgumentParser()
    cli.add_argument("--model-path", type=str, default="facebook/opt-350m")
    cli.add_argument("--data-path", type=str, default="")
    cli.add_argument("--output-dir", type=str, default="")
    eval_model(cli.parse_args())
160
+