| |
| |
|
|
| |
| |
|
|
| """ |
| Misc functions, including distributed helpers. |
| |
| Mostly copy-paste from torchvision references. |
| """ |
|
|
| from dataclasses import dataclass |
| from typing import List, Optional, Tuple, Union |
|
|
| import torch |
|
|
| from PIL import Image as PILImage |
| from tensordict import tensorclass |
|
|
|
|
@tensorclass
class BatchedVideoMetaData:
    """
    Metadata describing the objects of a batch of videos.

    The shapes below are the ones produced by ``collate_fn`` in this file, where
    T is the number of frames per video and O the number of objects gathered at
    each time step.

    Attributes:
        unique_objects_identifier: A [T x O x 3] integer tensor. Each entry holds
            the (video_id, obj_id, frame_id) triple identifying one object.
        frame_orig_size: A [T x O x 2] integer tensor holding, for each object,
            the ``size`` pair of the video it came from.
    """

    unique_objects_identifier: torch.LongTensor
    frame_orig_size: torch.LongTensor
|
|
|
|
@tensorclass
class BatchedVideoDatapoint:
    """
    A time-major batch of videos with per-object annotations and metadata.

    T is the number of frames per video, B the number of videos in the batch and
    O the number of objects per time step.

    Attributes:
        img_batch: A [T x B x C x H x W] tensor with the image data of every frame.
        obj_to_frame_idx: A [T x O x 2] tensor. The last dimension is a
            (frame_idx, video_idx) pair locating the frame of ``img_batch`` that
            the object belongs to.
        masks: A [T x O x H x W] tensor with one binary mask per object.
        metadata: A BatchedVideoMetaData instance with metadata about the batch.
        dict_key: A string key used to identify the batch.
    """

    img_batch: torch.FloatTensor
    obj_to_frame_idx: torch.IntTensor
    masks: torch.BoolTensor
    metadata: BatchedVideoMetaData

    dict_key: str

    def pin_memory(self, device=None):
        """Run ``torch.Tensor.pin_memory`` over the batch via ``self.apply``."""
        pin = torch.Tensor.pin_memory
        return self.apply(pin, device=device)

    @property
    def num_frames(self) -> int:
        """Number of frames per video, i.e. the leading entry of ``batch_size``."""
        leading_dim = self.batch_size[0]
        return leading_dim

    @property
    def num_videos(self) -> int:
        """Number of videos in the batch, i.e. dimension 1 of ``img_batch``."""
        return self.img_batch.size(1)

    @property
    def flat_obj_to_img_idx(self) -> torch.IntTensor:
        """
        Flat frame index of every object.

        Each (frame_idx, video_idx) pair is mapped to
        ``video_idx * num_frames + frame_idx``, which is the position of that
        frame in ``flat_img_batch`` (videos first, then frames within a video).
        """
        frame_pos, video_pos = torch.unbind(self.obj_to_frame_idx, dim=-1)
        return video_pos * self.num_frames + frame_pos

    @property
    def flat_img_batch(self) -> torch.FloatTensor:
        """
        ``img_batch`` flattened to [(B*T) x C x H x W].

        The first two axes are swapped before flattening, so all frames of one
        video are contiguous in the result.
        """
        video_major = self.img_batch.transpose(0, 1)
        return video_major.flatten(0, 1)
|
|
|
|
@dataclass
class Object:
    """One annotated object on a single frame."""

    # Identifier of the object; collate_fn stores it in the object's
    # (video_id, obj_id, frame_id) triple.
    object_id: int
    # Frame index recorded for the object; collate_fn stores it as the
    # frame_id of that same triple.
    frame_index: int
    # Segmentation of the object. collate_fn calls `.to(torch.bool)` on it, so
    # it must be a tensor there; the dict form is presumably an encoded mask
    # that is decoded earlier - TODO confirm.
    segment: Union[torch.Tensor, dict]
|
|
|
|
@dataclass
class Frame:
    """A single frame together with the objects annotated on it."""

    # Image content of the frame. collate_fn stacks these with torch.stack, so
    # they must already be tensors by the time a batch is collated.
    data: Union[torch.Tensor, PILImage.Image]
    # Objects annotated on this frame.
    objects: List[Object]
|
|
|
|
@dataclass
class VideoDatapoint:
    """
    An image or video together with all of its annotations.

    Attributes:
        frames: The frames of the video, in order; each carries its own objects.
        video_id: Identifier of the video; collate_fn stores it in each object's
            (video_id, obj_id, frame_id) triple.
        size: A pair of ints; collate_fn copies it into ``frame_orig_size`` for
            every object of the video.
    """

    frames: List[Frame]
    video_id: int
    size: Tuple[int, int]
|
|
|
|
def _stack_time_steps(per_step_items):
    """Stack each time step's list of tensors, then stack the steps along dim 0."""
    return torch.stack(
        [torch.stack(items, dim=0) for items in per_step_items], dim=0
    )


def collate_fn(
    batch: List[VideoDatapoint],
    dict_key: str,
) -> BatchedVideoDatapoint:
    """
    Collate a list of videos into one time-major BatchedVideoDatapoint.

    Everything is combined with ``torch.stack``, which imposes these
    preconditions (violations surface as the error raised by ``torch.stack``):
      * ``batch`` is non-empty and every video has the same, non-zero number of
        frames;
      * every ``frame.data`` is a tensor of the same [C x H x W] shape;
      * every time step holds the same, non-zero total number of objects
        (summed over the videos), and every ``obj.segment`` is a tensor of the
        same shape.

    Args:
        batch: A list of VideoDatapoint instances.
        dict_key (str): A string key used to identify the batch.

    Returns:
        A BatchedVideoDatapoint with ``batch_size=[T]`` holding
        ``img_batch`` [T x B x C x H x W], ``obj_to_frame_idx`` [T x O x 2] with
        (frame_idx, video_idx) pairs, boolean ``masks`` [T x O x ...], and a
        BatchedVideoMetaData with ``unique_objects_identifier`` [T x O x 3]
        (video_id, obj_id, frame_id) and ``frame_orig_size`` [T x O x 2].
    """
    # Per-video frame stacks give [B x T x C x H x W]; swapping the first two
    # axes yields the time-major [T x B x C x H x W] layout.
    img_batch = torch.stack(
        [
            torch.stack([frame.data for frame in video.frames], dim=0)
            for video in batch
        ],
        dim=0,
    ).permute((1, 0, 2, 3, 4))
    T = img_batch.shape[0]

    # One bucket per time step; objects are appended in batch order, so index i
    # refers to the same object in all four per-step lists.
    step_t_objects_identifier = [[] for _ in range(T)]
    step_t_frame_orig_size = [[] for _ in range(T)]
    step_t_masks = [[] for _ in range(T)]
    step_t_obj_to_frame_idx = [[] for _ in range(T)]

    for video_idx, video in enumerate(batch):
        orig_video_id = video.video_id
        orig_frame_size = video.size
        for t, frame in enumerate(video.frames):
            for obj in frame.objects:
                # Position of the object's frame inside img_batch.
                step_t_obj_to_frame_idx[t].append(
                    torch.tensor([t, video_idx], dtype=torch.int)
                )
                step_t_masks[t].append(obj.segment.to(torch.bool))
                # Identifiers as given by the dataset, not batch positions.
                step_t_objects_identifier[t].append(
                    torch.tensor([orig_video_id, obj.object_id, obj.frame_index])
                )
                step_t_frame_orig_size[t].append(torch.tensor(orig_frame_size))

    obj_to_frame_idx = _stack_time_steps(step_t_obj_to_frame_idx)
    masks = _stack_time_steps(step_t_masks)
    objects_identifier = _stack_time_steps(step_t_objects_identifier)
    frame_orig_size = _stack_time_steps(step_t_frame_orig_size)

    return BatchedVideoDatapoint(
        img_batch=img_batch,
        obj_to_frame_idx=obj_to_frame_idx,
        masks=masks,
        metadata=BatchedVideoMetaData(
            unique_objects_identifier=objects_identifier,
            frame_orig_size=frame_orig_size,
        ),
        dict_key=dict_key,
        batch_size=[T],
    )
|
|