| import torch |
| import os |
| from enum import Enum |
| from tqdm import tqdm |
| import numpy as np |
| from detectron2.structures import BitMasks |
| from objectrelator.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, \ |
| DEFAULT_IM_END_TOKEN, DEFAULT_SEG_TOKEN, SEG_TOKEN_INDEX |
| from objectrelator.model.builder import load_pretrained_model |
| from objectrelator.utils import disable_torch_init |
| from objectrelator.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria |
| import cv2 |
| from torch.utils.data import Dataset, DataLoader |
| from objectrelator import conversation as conversation_lib |
| from datasets.egoexo_dataset import EgoExo_Dataset_eval |
from pycocotools import mask
| from detectron2.structures import BoxMode |
| from detectron2.data import MetadataCatalog, DatasetCatalog |
| from typing import Dict, Optional, Sequence, List |
| from dataclasses import dataclass, field |
| import torch.distributed as dist |
| import transformers |
| from pathlib import Path |
| from segmentation_evaluation import openseg_classes |
| COLOR_MAP = openseg_classes.ADE20K_150_CATEGORIES |
| from detectron2.data import detection_utils as utils |
| import pickle |
| import math |
| import json |
| import utils_metric |
| import re |
| from natsort import natsorted |
| from transformers import TextStreamer |
|
|
| |
| @dataclass |
| class DataCollatorForCOCODatasetV2(object): |
| """Collate examples for supervised fine-tuning.""" |
|
|
| tokenizer: transformers.PreTrainedTokenizer |
|
|
| def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: |
| if len(instances[0]) == 0: |
| return {} |
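        # Right-pad token ids to the longest sequence in the batch; labels are
        # padded with IGNORE_INDEX so padded positions are masked out of the loss.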
| input_ids, labels = tuple([instance[key] for instance in instances] |
| for key in ("input_ids", "labels")) |
| input_ids = torch.nn.utils.rnn.pad_sequence( |
| input_ids, |
| batch_first=True, |
| padding_value=self.tokenizer.pad_token_id) |
| labels = torch.nn.utils.rnn.pad_sequence(labels, |
| batch_first=True, |
| padding_value=IGNORE_INDEX) |
| input_ids = input_ids[:, :self.tokenizer.model_max_length] |
| labels = labels[:, :self.tokenizer.model_max_length] |
| batch = dict( |
| input_ids=input_ids, |
| labels=labels, |
| attention_mask=input_ids.ne(self.tokenizer.pad_token_id), |
| ) |
| if 'image' in instances[0]: |
| images = [instance['image'] for instance in instances] |
| if all(x is not None and x.shape == images[0].shape for x in images): |
| batch['images'] = torch.stack(images) |
| else: |
| batch['images'] = images |
| if 'vp_image' in instances[0]: |
| vp_images = [instance['vp_image'] for instance in instances] |
| if all(x is not None and x.shape == vp_images[0].shape for x in vp_images): |
| batch['vp_images'] = torch.stack(vp_images) |
| else: |
| batch['vp_images'] = vp_images |
        for instance in instances:
            for key in ['input_ids', 'labels', 'image']:
                instance.pop(key, None)
        # Remaining per-instance fields (masks, metadata) travel as seg_info.
        batch['seg_info'] = [instance for instance in instances]
|
|
| if 'dataset_type' in instances[0]: |
| batch['dataset_type'] = [instance['dataset_type'] for instance in instances] |
|
|
| if 'class_name_ids' in instances[0]: |
| class_name_ids = [instance['class_name_ids'] for instance in instances] |
| if any(x.shape != class_name_ids[0].shape for x in class_name_ids): |
| batch['class_name_ids'] = torch.nn.utils.rnn.pad_sequence( |
| class_name_ids, |
| batch_first=True, |
| padding_value=-1, |
| ) |
| else: |
| batch['class_name_ids'] = torch.stack(class_name_ids, dim=0) |
| if 'token_refer_id' in instances[0]: |
| token_refer_id = [instance['token_refer_id'] for instance in instances] |
| batch['token_refer_id'] = token_refer_id |
| if 'cls_indices' in instances[0]: |
| cls_indices = [instance['cls_indices'] for instance in instances] |
| if any(x.shape != cls_indices[0].shape for x in cls_indices): |
| batch['cls_indices'] = torch.nn.utils.rnn.pad_sequence( |
| cls_indices, |
| batch_first=True, |
| padding_value=-1, |
| ) |
| else: |
| batch['cls_indices'] = torch.stack(cls_indices, dim=0) |
| if 'random_idx' in instances[0]: |
| random_idxs = [instance['random_idx'] for instance in instances] |
| batch['random_idx'] = torch.stack(random_idxs, dim=0) |
| if 'class_name_embedding_indices' in instances[0]: |
| class_name_embedding_indices = [instance['class_name_embedding_indices'] for instance in instances] |
| class_name_embedding_indices = torch.nn.utils.rnn.pad_sequence( |
| class_name_embedding_indices, |
| batch_first=True, |
| padding_value=0) |
| batch['class_name_embedding_indices'] = class_name_embedding_indices |
| if 'refer_embedding_indices' in instances[0]: |
| refer_embedding_indices = [instance['refer_embedding_indices'] for instance in instances] |
| refer_embedding_indices = torch.nn.utils.rnn.pad_sequence( |
| refer_embedding_indices, |
| batch_first=True, |
| padding_value=0) |
| batch['refer_embedding_indices'] = refer_embedding_indices |
|
|
| return batch |
|
|
| class Summary(Enum): |
| NONE = 0 |
| AVERAGE = 1 |
| SUM = 2 |
| COUNT = 3 |
|
|
|
|
| class AverageMeter(object): |
| """Computes and stores the average and current value""" |
|
|
| def __init__(self, name, fmt=":f", summary_type=Summary.AVERAGE): |
| self.name = name |
| self.fmt = fmt |
| self.summary_type = summary_type |
| self.reset() |
|
|
| def reset(self): |
| self.val = 0 |
| self.avg = 0 |
| self.sum = 0 |
| self.count = 0 |
|
|
| def update(self, val, n=1): |
| self.val = val |
| self.sum += val * n |
| self.count += n |
| self.avg = self.sum / self.count |
|
|
| def all_reduce(self): |
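        # Aggregate sum/count across ranks; assumes torch.distributed has been
        # initialized (only meaningful in distributed evaluation).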
| device = "cuda" if torch.cuda.is_available() else "cpu" |
| if isinstance(self.sum, np.ndarray): |
| total = torch.tensor( |
| self.sum.tolist() |
| + [ |
| self.count, |
| ], |
| dtype=torch.float32, |
| device=device, |
| ) |
| else: |
| total = torch.tensor( |
| [self.sum, self.count], dtype=torch.float32, device=device |
| ) |
|
|
| dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) |
| if total.shape[0] > 2: |
| self.sum, self.count = total[:-1].cpu().numpy(), total[-1].cpu().item() |
| else: |
| self.sum, self.count = total.tolist() |
| self.avg = self.sum / (self.count + 1e-5) |
|
|
| def __str__(self): |
| fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" |
| return fmtstr.format(**self.__dict__) |
|
|
| def summary(self): |
| fmtstr = "" |
| if self.summary_type is Summary.NONE: |
| fmtstr = "" |
| elif self.summary_type is Summary.AVERAGE: |
| fmtstr = "{name} {avg:.3f}" |
| elif self.summary_type is Summary.SUM: |
| fmtstr = "{name} {sum:.3f}" |
| elif self.summary_type is Summary.COUNT: |
| fmtstr = "{name} {count:.3f}" |
| else: |
| raise ValueError("invalid summary type %r" % self.summary_type) |
|
|
| return fmtstr.format(**self.__dict__) |
|
|
|
|
| def intersectionAndUnionGPU(output, target, K, ignore_index=255): |
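    """Per-class intersection/union/target pixel counts via torch.histc.

    Pixels where target == ignore_index are excluded. Note that `output` is
    modified in place, so callers should pass a clone if they need it later.
    """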
| |
| assert output.dim() in [1, 2, 3] |
| assert output.shape == target.shape |
| output = output.view(-1) |
| target = target.view(-1) |
| output[target == ignore_index] = ignore_index |
| intersection = output[output == target] |
| area_intersection = torch.histc(intersection, bins=K, min=0, max=K - 1) |
| area_output = torch.histc(output, bins=K, min=0, max=K - 1) |
| area_target = torch.histc(target, bins=K, min=0, max=K - 1) |
| area_union = area_output + area_target - area_intersection |
| return area_intersection, area_union, area_target |
|
|
def parse_outputs(outputs, gt_mask):
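    """Unpack detectron2-style outputs into dicts of predicted masks, scores,
    optional classes, and the ground-truth mask."""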
| res_list = [] |
| for output in outputs: |
| |
|
|
        pred_mask = output['instances'].pred_masks.cpu().numpy()
        scores = output['instances'].scores.cpu().numpy()
        try:
            pred_cls = output['instances'].pred_classes.cpu().numpy()
        except AttributeError:
            # Class predictions are absent for class-agnostic (e.g., referring) heads.
            pred_cls = None
        res = {
            'pred': pred_mask,
            'gt': gt_mask,
            'scores': scores,
            'pred_cls': pred_cls,
        }
        res_list.append(res)
    return res_list
|
|
def compute_metric(intersection_meter, union_meter, acc_iou_meter, gt_cls, results_list):
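    """Select the top-scoring mask per sample and update the running meters.

    gIoU is later read from acc_iou_meter.avg and cIoU from the summed
    intersection/union areas; gt_cls is currently unused. Returns the chosen
    predictions and their ground-truth masks.
    """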
| pred_list = [] |
| gt_list = [] |
| results_list = list(results_list) |
| for results in results_list: |
| gt = results['gt'] |
| preds = results['pred'] |
| scores = results['scores'] |
| preds = preds.astype(np.uint8) |
| |
        # Keep only the top-scoring prediction (k=1).
        _, idx = torch.topk(torch.tensor(scores), 1)
        idx = idx.cpu().numpy()
        topk_preds = preds[idx, :]
| max_acc_iou = -1 |
| max_iou = 0 |
| max_intersection = 0 |
| max_union = 0 |
| max_i = 0 |
| |
        for i, pred_ in enumerate(topk_preds):
            intersection, union, _ = intersectionAndUnionGPU(
                torch.tensor(pred_).int().cuda().contiguous().clone(),
                torch.tensor(gt).int().cuda().contiguous(),
                2, ignore_index=255
            )
| intersection, union = intersection.cpu().numpy(), union.cpu().numpy() |
| acc_iou = intersection / (union + 1e-5) |
| acc_iou[union == 0] = 1.0 |
| fore_acc_iou = acc_iou[1] |
| if fore_acc_iou > max_acc_iou: |
| max_acc_iou = fore_acc_iou |
| max_iou = acc_iou |
| max_intersection = intersection |
| max_union = union |
| max_i = i |
| intersection_meter.update(max_intersection) |
| union_meter.update(max_union) |
| acc_iou_meter.update(max_iou, n=1) |
| pred_list.append(topk_preds[max_i]) |
| gt_list.append(gt) |
|
|
| return pred_list,gt_list |
|
|
| @dataclass |
| class DataArguments: |
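    """CLI arguments for evaluation, parsed via transformers.HfArgumentParser."""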
    data_path: Optional[str] = field(default=None,
                                     metadata={"help": "Path to the evaluation data."})
| lazy_preprocess: bool = False |
| is_multimodal: bool = False |
| image_folder: Optional[str] = field(default='/path/to/val2017') |
| model_path: Optional[str] = field(default="/path/to/model") |
| mask_config: Optional[str] = field(default="./objectrelator/mask_config/maskformer2_swin_base_384_bs16_50ep.yaml") |
| image_aspect_ratio: str = 'square' |
| image_grid_pinpoints: Optional[str] = field(default=None) |
| json_path: str = '/path/to/coco' |
| model_map_name: str = 'psalm' |
| version: str = 'llava_phi' |
| output_dir: str = './output/panoptic_segmentation' |
| segmentation: bool = True |
| eval_batch_size: int = 1 |
| dataloader_num_workers: int = 4 |
| seg_task: Optional[str] = field(default="referring") |
|
|
| def evaluation(): |
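    """Load the model, run referring-segmentation evaluation on the EgoExo
    dataset, and report running gIoU/cIoU."""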
| parser = transformers.HfArgumentParser(DataArguments) |
| data_args = parser.parse_args_into_dataclasses()[0] |
| disable_torch_init() |
| model_path = os.path.expanduser(data_args.model_path) |
| |
| print(f'current model is {model_path}') |
| model_name = 'psalm' |
| print('Loading model:', model_name) |
| tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name, model_args=data_args, mask_config=data_args.mask_config, device='cuda') |
| print('Model loaded successfully!') |
| data_args.image_processor = image_processor |
| data_args.is_multimodal = True |
    conversation_lib.default_conversation = conversation_lib.conv_templates[data_args.version]
| |
|
|
| data_args.refcoco_image_folder = data_args.image_folder |
| eval_dataset = EgoExo_Dataset_eval(json_path=data_args.json_path, tokenizer=tokenizer, data_args=data_args) |
| data_collator = DataCollatorForCOCODatasetV2(tokenizer=tokenizer) |
| dataloader_params = { |
| "batch_size": data_args.eval_batch_size, |
| "num_workers": data_args.dataloader_num_workers, |
| } |
| eval_dataloader = DataLoader(eval_dataset, batch_size=dataloader_params['batch_size'], collate_fn=data_collator, |
| num_workers=dataloader_params['num_workers']) |
|
|
    def load_ref_dataset():
        # Registered with detectron2 so MetadataCatalog can attach class metadata.
        return EgoExo_Dataset_eval(json_path=data_args.json_path, tokenizer=tokenizer, data_args=data_args)

    DatasetCatalog.register('refcoco_dataset', load_ref_dataset)
    MetadataCatalog.get('refcoco_dataset').set(stuff_classes=['object'])
| gt_json_path = data_args.json_path |
| with open(gt_json_path) as f: |
| gt_data = json.load(f) |
|
|
|
|
| device = 'cuda' if torch.cuda.is_available() else 'cpu' |
|
|
    model.to(device=device, dtype=torch.float).eval()
| save_list = [] |
| intersection_meter = AverageMeter("Intersec", ":6.3f", Summary.SUM) |
| union_meter = AverageMeter("Union", ":6.3f", Summary.SUM) |
| acc_iou_meter = AverageMeter("gIoU", ":6.3f", Summary.SUM) |
|
|
| streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) |
|
|
| with torch.no_grad(): |
| for idx, inputs in tqdm(enumerate(eval_dataloader), total=len(eval_dataloader)): |
| gt = gt_data[idx]['anns'] |
| h, w = gt_data[idx]['image_info']['height'], gt_data[idx]['image_info']['width'] |
| |
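            # Rasterize ground-truth annotations: polygon lists via cv2.fillPoly,
            # RLE (compressed or uncompressed) via pycocotools. Only the first
            # annotation is used as the referred-object mask.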
| masks = [] |
| for annotation in gt: |
| if isinstance(annotation['segmentation'], list): |
| segm = np.zeros((h, w), dtype=np.uint8) |
| for poly in annotation['segmentation']: |
| poly = np.array(poly, dtype=np.int32).reshape(-1, 2) |
| cv2.fillPoly(segm, [poly], 1) |
| masks.append(segm.astype(np.bool_)) |
| else: |
| if isinstance(annotation['segmentation']['counts'], list): |
| rle = mask.frPyObjects(annotation['segmentation'], *annotation['segmentation']['size']) |
| segm = mask.decode(rle) |
| else: |
| segm = mask.decode(annotation['segmentation']) |
| masks.append(segm.astype(np.bool_)) |
| |
| gt_mask = masks[0].astype(np.uint8) |
|
|
| inputs = {k: v.to(device) if torch.is_tensor(v) else v for k, v in inputs.items()} |
| |
| inputs['token_refer_id'] = [ids.to(device) for ids in inputs['token_refer_id']] |
| |
| |
| |
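            # eval_seg returns the mask predictions used for the metrics;
            # generate below additionally decodes a text response (streamed).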
| outputs = model.eval_seg( |
| input_ids=inputs['input_ids'], |
| attention_mask=inputs['attention_mask'], |
| images=inputs['images'].float(), |
| seg_info=inputs['seg_info'], |
| token_refer_id = inputs['token_refer_id'], |
| refer_embedding_indices=inputs['refer_embedding_indices'], |
| labels=inputs['labels'] |
| ) |
| output_ids = model.generate( |
| input_ids=inputs['input_ids'], |
| attention_mask=inputs['attention_mask'], |
| images=inputs['images'].float(), |
| seg_info=inputs['seg_info'], |
| token_refer_id = inputs['token_refer_id'], |
| refer_embedding_indices=inputs['refer_embedding_indices'], |
| labels=inputs['labels'], |
| do_sample=True, |
| temperature=0.2, |
| max_new_tokens=1024, |
| streamer=streamer, |
| use_cache=True, |
| ) |
|
|
| |
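            # Decode only the newly generated tokens (the prompt is sliced off).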
| input_token_len = inputs['input_ids'].shape[1] |
| generated_tokens = output_ids[:, input_token_len:] |
| generated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0] |
| print("output_text:", generated_text) |
|
|
|
|
| gt_cls = inputs['seg_info'][0]['instances'].gt_classes |
| if torch.cuda.is_available(): |
| torch.cuda.synchronize() |
            cur_res = parse_outputs(outputs, gt_mask)
            pred, gt_mask = compute_metric(intersection_meter, union_meter, acc_iou_meter, gt_cls, cur_res)
            save_list.append({'pred': pred[0], 'gt': gt_mask[0], 'name': inputs['seg_info'][0]['file_name']})
            # Running metrics: cIoU from summed areas, gIoU from per-sample means.
            iou_class = intersection_meter.sum / (union_meter.sum + 1e-10)
            ciou = iou_class[1]
            giou = acc_iou_meter.avg[1]
            print("giou: {:.4f}, ciou: {:.4f}".format(giou, ciou))
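    # Persist per-sample predictions for offline inspection. Writing a pickle
    # to output_dir is an assumption: the original script filled `save_list`
    # but never wrote it anywhere.
    os.makedirs(data_args.output_dir, exist_ok=True)
    with open(os.path.join(data_args.output_dir, 'eval_results.pkl'), 'wb') as f:
        pickle.dump(save_list, f)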
|
|
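# Example invocation (script name and paths are placeholders; the flags are
# generated by HfArgumentParser from the DataArguments fields above):
#   python eval_egoexo.py --model_path /path/to/model \
#       --json_path /path/to/annotations.json --image_folder /path/to/images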
| if __name__ == "__main__": |
| evaluation() |