Sequence training: pairs→K-frame clips, mLSTM memory carries across frames
vil_tracker/data/dataset.py (+198 −132)
@@ -245,6 +245,10 @@ def compute_crop_params(bbox: np.ndarray, context_factor: float = 2.0) -> tuple:
 class SequenceDataset(Dataset):
     """Base class for tracking sequence datasets.
 
+    Returns K-frame clips: template + K consecutive search frames.
+    The mLSTM processes these as one long sequence where memory carries
+    information across frames — this is the core training paradigm.
+
     Subclasses must populate self.sequences with list of:
         {'frames': [path1, path2, ...], 'gt': [[x,y,w,h], ...]}
     """
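
For context on the docstring above: only the (template + K search frames) sample layout comes from this diff. The sketch below is a minimal illustration of what "memory carries across frames" means for the consumer of a clip; the toy cell is a hypothetical stand-in, not this repo's mLSTM block or model API.

    import torch
    import torch.nn as nn

    class ToyMemoryCell(nn.Module):
        """Stand-in for the mLSTM block: hidden state carries across frames."""
        def __init__(self, dim: int = 64):
            super().__init__()
            self.proj = nn.Linear(dim, dim)

        def forward(self, x, state):
            state = torch.tanh(self.proj(x) + state)  # toy recurrence, not mLSTM math
            return state, state

    cell = ToyMemoryCell()
    feats = torch.randn(8, 3, 64)      # (B, K, D): one feature vector per search frame
    state = torch.zeros(8, 64)         # memory initialized once per clip
    outs = []
    for k in range(feats.shape[1]):    # walk the K frames of the clip in order
        out, state = cell(feats[:, k], state)  # state is NOT reset between frames
        outs.append(out)
    outs = torch.stack(outs, dim=1)    # (B, K, D): per-frame outputs for per-frame losses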

@@ -256,6 +260,7 @@ class SequenceDataset(Dataset):
         feat_size: int = 16,
         acl_difficulty: float = 1.0,
         max_gap: int = 100,
+        clip_length: int = 3,
         augmentation: bool = True,
     ):
         super().__init__()

@@ -264,6 +269,7 @@ class SequenceDataset(Dataset):
         self.feat_size = feat_size
         self.acl_difficulty = acl_difficulty
         self.max_gap = max_gap
+        self.clip_length = clip_length  # K search frames per sample
         self.sequences = []
 
         self.augmentation = TrackingAugmentation() if augmentation else None
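
Because clip_length is fixed per dataset, PyTorch's default collate can stack clip samples directly into (B, K, ...) batches. A sketch using the synthetic dataset defined later in this diff (its template_size default of 128 is assumed from the shape comments):

    from torch.utils.data import DataLoader
    from vil_tracker.data.dataset import SyntheticTrackingDataset

    loader = DataLoader(SyntheticTrackingDataset(length=32, clip_length=3), batch_size=8)
    batch = next(iter(loader))
    print(batch['template'].shape)   # torch.Size([8, 3, 128, 128])  - one template per clip
    print(batch['searches'].shape)   # torch.Size([8, 3, 3, 256, 256]), i.e. (B, K, C, H, W)
    print(batch['heatmaps'].shape)   # torch.Size([8, 3, 1, 16, 16])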

@@ -278,123 +284,168 @@ class SequenceDataset(Dataset):
             img = Image.open(path).convert('RGB')
             return np.array(img, dtype=np.float32)
         except ImportError:
-            # Fallback with OpenCV
             import cv2
             img = cv2.imread(path)
             if img is None:
                 return np.zeros((480, 640, 3), dtype=np.float32)
             return cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
 
-    def _sample_pair(self, idx):
-        """Sample a (template, search) frame pair.
-
-        Temporal distance controlled by ACL difficulty:
-        - difficulty=0: template and search are very close
-        - difficulty=1: template and search can be up to max_gap apart
+    def _sample_clip(self, idx: int) -> list:
+        """Sample a clip: template frame + K consecutive search frames.
 
         Returns:
-            ...
+            list of frame indices: [template_idx, search_1_idx, ..., search_K_idx]
         """
         seq = self.sequences[idx]
         n_frames = len(seq['frames'])
+        K = self.clip_length
 
-        ...
+        valid = [i for i in range(n_frames)
+                 if seq['gt'][i] is not None and seq['gt'][i][2] > 0 and seq['gt'][i][3] > 0]
+        valid_set = set(valid)
 
-        if len(...):
-            ...
+        if len(valid) < K + 1:
+            # Not enough frames — repeat what we have
+            if len(valid) == 0:
+                return [0] * (K + 1)
+            return [valid[0]] + [valid[min(i, len(valid)-1)] for i in range(K)]
 
+        # Template: pick a random valid frame
+        t_idx = random.choice(valid)
 
-        # Search: ...
+        # Search frames: K consecutive valid frames AFTER template
+        # Temporal gap between template and first search controlled by ACL
         effective_gap = max(1, int(self.max_gap * self.acl_difficulty))
-        min_idx = max(0, t_idx - effective_gap)
-        max_idx = min(n_frames - 1, t_idx + effective_gap)
-
-        # Only pick valid indices
-        search_candidates = [i for i in range(min_idx, max_idx + 1)
-                             if i != t_idx and i in valid_indices]
-
-        if not search_candidates:
-            return t_idx, t_idx
-
-        s_idx = random.choice(search_candidates)
-        return t_idx, s_idx
-
-    def __getitem__(self, idx):
-        seq = self.sequences[idx % len(self.sequences)]
-        t_idx, s_idx = self._sample_pair(idx % len(self.sequences))
-
-        # Load images
-        t_img = self._load_image(seq['frames'][t_idx])
-        s_img = self._load_image(seq['frames'][s_idx])
-
-        t_bbox = np.array(seq['gt'][t_idx], dtype=np.float32)  # [x, y, w, h]
-        s_bbox = np.array(seq['gt'][s_idx], dtype=np.float32)
-
-        # Crop template (centered on target, 2x context)
-        t_center, t_crop_size = compute_crop_params(t_bbox, context_factor=2.0)
-        template = crop_and_resize(t_img, t_center, t_crop_size, self.template_size)
-
-        # Crop search region (centered on target with jitter, 4x context)
-        s_center, s_crop_size = compute_crop_params(s_bbox, context_factor=4.0)
-
-        # Add spatial jitter (controlled by ACL difficulty)
-        jitter = self.acl_difficulty * s_bbox[2:4].mean() * 0.3
-        s_center[0] += random.gauss(0, jitter) if jitter > 0 else 0
-        s_center[1] += random.gauss(0, jitter) if jitter > 0 else 0
-
-        ...
-
-        # ...
-        bbox_tensor = torch.tensor([cx_in_search, cy_in_search, w_in_search, h_in_search])
+        # Find the start of the search clip: somewhere after template
+        min_start = t_idx + 1
+        max_start = min(t_idx + effective_gap, n_frames - K)
+
+        if max_start < min_start:
+            # Try before template
+            max_start_before = t_idx - K
+            min_start_before = max(0, t_idx - effective_gap - K)
+            if max_start_before >= min_start_before and max_start_before >= 0:
+                clip_start = random.randint(min_start_before, max_start_before)
+            else:
+                # Fallback: just use whatever consecutive frames we can find
+                clip_start = max(0, min(n_frames - K, t_idx + 1))
+                # But ensure template is different from search frames
+        else:
+            clip_start = random.randint(min_start, max(min_start, max_start))
+
+        # Collect K consecutive frames, preferring valid ones
+        search_indices = []
+        for i in range(clip_start, min(clip_start + K * 3, n_frames)):
+            if i in valid_set and i != t_idx:
+                search_indices.append(i)
+            if len(search_indices) == K:
+                break
+
+        # Pad if we didn't find enough
+        while len(search_indices) < K:
+            search_indices.append(search_indices[-1] if search_indices else t_idx)
+
+        return [t_idx] + search_indices[:K]
+
+    def _process_frame(self, img: np.ndarray, bbox: np.ndarray, is_template: bool):
+        """Crop and preprocess a single frame.
+
+        Returns:
+            image_tensor: (3, H, W) float [0, 1]
+            bbox_in_crop: (4,) [cx, cy, w, h] in crop coordinates
+        """
+        if is_template:
+            center, crop_size = compute_crop_params(bbox, context_factor=2.0)
+            output_size = self.template_size
+        else:
+            center, crop_size = compute_crop_params(bbox, context_factor=4.0)
+            output_size = self.search_size
+            # Spatial jitter for search (controlled by ACL)
+            jitter = self.acl_difficulty * bbox[2:4].mean() * 0.3
+            if jitter > 0:
+                center[0] += random.gauss(0, jitter)
+                center[1] += random.gauss(0, jitter)
+
+        crop = crop_and_resize(img, center, crop_size, output_size)
+
+        # Compute GT in crop coordinates
+        scale = output_size / crop_size
+        cx = (bbox[0] + bbox[2] / 2 - center[0] + crop_size / 2) * scale
+        cy = (bbox[1] + bbox[3] / 2 - center[1] + crop_size / 2) * scale
+        w = bbox[2] * scale
+        h = bbox[3] * scale
+
+        cx = max(0, min(output_size, cx))
+        cy = max(0, min(output_size, cy))
+        w = max(1, min(output_size, w))
+        h = max(1, min(output_size, h))
+
+        tensor = torch.from_numpy(crop).float().permute(2, 0, 1) / 255.0
+        bbox_crop = torch.tensor([cx, cy, w, h])
+
+        return tensor, bbox_crop
+
+    def _make_heatmap(self, bbox: torch.Tensor):
+        """Generate GT heatmap from bbox in search crop coordinates."""
         stride = self.search_size / self.feat_size
-        cx_feat = ...
-        cy_feat = ...
+        cx_feat = bbox[0].item() / stride
+        cy_feat = bbox[1].item() / stride
+        w_search = bbox[2].item()
+        h_search = bbox[3].item()
 
         y = torch.arange(self.feat_size, dtype=torch.float32)
         x = torch.arange(self.feat_size, dtype=torch.float32)
         yy, xx = torch.meshgrid(y, x, indexing='ij')
 
-        sigma = max(1.0, min(3.0, (w_in_search + h_in_search) / (2 * stride * 4)))
+        sigma = max(1.0, min(3.0, (w_search + h_search) / (2 * stride * 4)))
         dist_sq = (xx - cx_feat) ** 2 + (yy - cy_feat) ** 2
         heatmap = torch.exp(-dist_sq / (2 * sigma ** 2)).unsqueeze(0)
+        return heatmap
+
+    def __getitem__(self, idx):
+        seq = self.sequences[idx % len(self.sequences)]
+        clip_indices = self._sample_clip(idx % len(self.sequences))
+
+        t_idx = clip_indices[0]
+        s_indices = clip_indices[1:]
+        K = len(s_indices)
+
+        # Load and process template
+        t_img = self._load_image(seq['frames'][t_idx])
+        t_bbox = np.array(seq['gt'][t_idx], dtype=np.float32)
+        template, _ = self._process_frame(t_img, t_bbox, is_template=True)
+
+        # Load and process K search frames
+        searches = []
+        heatmaps = []
+        sizes = []
+        boxes = []
+
+        for s_idx in s_indices:
+            s_img = self._load_image(seq['frames'][s_idx])
+            s_bbox = np.array(seq['gt'][s_idx], dtype=np.float32)
+            search, bbox_crop = self._process_frame(s_img, s_bbox, is_template=False)
+
+            # Apply augmentation (same color transform for template+search consistency)
+            if self.augmentation is not None:
+                template_aug, search, bbox_crop = self.augmentation(template, search, bbox_crop)
+                # Only use augmented template from first search frame to keep consistency
+                if len(searches) == 0:
+                    template = template_aug
+
+            searches.append(search)
+            heatmaps.append(self._make_heatmap(bbox_crop))
+            sizes.append(torch.tensor([bbox_crop[2].item() / self.search_size,
+                                       bbox_crop[3].item() / self.search_size]))
+            boxes.append(bbox_crop)
 
         return {
-            'template': template,
-            ...
-            'boxes': bbox_tensor,
+            'template': template,                       # (3, 128, 128)
+            'searches': torch.stack(searches, dim=0),   # (K, 3, 256, 256)
+            'heatmaps': torch.stack(heatmaps, dim=0),   # (K, 1, 16, 16)
+            'sizes': torch.stack(sizes, dim=0),         # (K, 2)
+            'boxes': torch.stack(boxes, dim=0),         # (K, 4)
         }
 
     def set_acl_difficulty(self, difficulty: float):
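
A quick worked check of the label geometry above, using the defaults search_size=256 and feat_size=16 (so stride = 16): a box centered at (128, 128) with w = h = 64 lands at feature cell (8, 8), and sigma = clamp(128 / 128, 1, 3) = 1.0. Standalone:

    import torch

    search_size, feat_size = 256, 16
    stride = search_size / feat_size                 # 16.0 px per feature cell
    bbox = torch.tensor([128.0, 128.0, 64.0, 64.0])  # [cx, cy, w, h] in the search crop
    cx_feat, cy_feat = bbox[0] / stride, bbox[1] / stride  # (8.0, 8.0): grid center
    sigma = max(1.0, min(3.0, (bbox[2] + bbox[3]).item() / (2 * stride * 4)))  # 128/128 = 1.0
    y = torch.arange(feat_size, dtype=torch.float32)
    x = torch.arange(feat_size, dtype=torch.float32)
    yy, xx = torch.meshgrid(y, x, indexing='ij')
    heatmap = torch.exp(-((xx - cx_feat) ** 2 + (yy - cy_feat) ** 2) / (2 * sigma ** 2))
    assert heatmap.argmax().item() == 8 * feat_size + 8  # peak at cell (8, 8)
    assert abs(heatmap.max().item() - 1.0) < 1e-6        # Gaussian peak is 1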

@@ -686,8 +737,8 @@ class COCODetDataset(SequenceDataset):
 class SyntheticTrackingDataset(Dataset):
     """Synthetic tracking dataset for testing without real data.
 
-    Generates ...
-    ...
+    Generates K-frame clips: template + K search frames with a moving
+    colored rectangle target. Motion is linear with noise.
     """
 
     def __init__(

@@ -697,6 +748,7 @@ class SyntheticTrackingDataset(Dataset):
         search_size: int = 256,
         feat_size: int = 16,
         acl_difficulty: float = 1.0,
+        clip_length: int = 3,
     ):
         super().__init__()
         self.length = length

@@ -704,64 +756,78 @@ class SyntheticTrackingDataset(Dataset):
         self.search_size = search_size
         self.feat_size = feat_size
         self.acl_difficulty = acl_difficulty
+        self.clip_length = clip_length
 
     def __len__(self):
         return self.length
 
+    def _make_heatmap(self, cx, cy, w_search, h_search):
+        stride = self.search_size / self.feat_size
+        cx_feat = cx / stride
+        cy_feat = cy / stride
+        y = torch.arange(self.feat_size, dtype=torch.float32)
+        x = torch.arange(self.feat_size, dtype=torch.float32)
+        yy, xx = torch.meshgrid(y, x, indexing='ij')
+        sigma = max(1.0, min(3.0, (w_search + h_search) / (2 * stride * 4)))
+        dist_sq = (xx - cx_feat) ** 2 + (yy - cy_feat) ** 2
+        return torch.exp(-dist_sq / (2 * sigma ** 2)).unsqueeze(0)
+
     def __getitem__(self, idx):
         rng = random.Random(idx)
+        K = self.clip_length
 
-        # ...
+        # Target appearance
+        color = torch.tensor([rng.random(), rng.random(), rng.random()]).view(3, 1, 1)
         target_w = rng.uniform(0.1, 0.5) * self.search_size
         target_h = rng.uniform(0.1, 0.5) * self.search_size
 
-        # ...
-        ...
-        cy = self.search_size / 2 + rng.gauss(0, jitter * self.search_size)
-        cx = max(target_w / 2, min(self.search_size - target_w / 2, cx))
-        cy = max(target_h / 2, min(self.search_size - target_h / 2, cy))
+        # Initial position (center of search)
+        cx0 = self.search_size / 2
+        cy0 = self.search_size / 2
 
-        # ...
-        ...
+        # Velocity (pixels per frame, scaled by difficulty)
+        vx = rng.gauss(0, self.acl_difficulty * 15)
+        vy = rng.gauss(0, self.acl_difficulty * 15)
 
-        # ...
-        ...
+        # Template: target at center
+        template = torch.randn(3, self.template_size, self.template_size) * 0.1
+        t_hw = int(min(target_w / 2, self.template_size / 2 - 1))
+        t_hh = int(min(target_h / 2, self.template_size / 2 - 1))
         tc = self.template_size // 2
-        ...
+        template[:, tc - t_hh:tc + t_hh, tc - t_hw:tc + t_hw] = color
+
+        # K search frames with moving target
+        searches = []
+        heatmaps = []
+        sizes = []
+        boxes = []
+
+        for k in range(K):
+            # Position at frame k
+            cx = cx0 + vx * (k + 1) + rng.gauss(0, self.acl_difficulty * 5)
+            cy = cy0 + vy * (k + 1) + rng.gauss(0, self.acl_difficulty * 5)
+            cx = max(target_w / 2, min(self.search_size - target_w / 2, cx))
+            cy = max(target_h / 2, min(self.search_size - target_h / 2, cy))
+
+            search = torch.randn(3, self.search_size, self.search_size) * 0.1
+            sx1 = max(0, int(cx - target_w / 2))
+            sy1 = max(0, int(cy - target_h / 2))
+            sx2 = min(self.search_size, int(cx + target_w / 2))
+            sy2 = min(self.search_size, int(cy + target_h / 2))
+            search[:, sy1:sy2, sx1:sx2] = color
+
+            searches.append(search)
+            heatmaps.append(self._make_heatmap(cx, cy, target_w, target_h))
+            sizes.append(torch.tensor([target_w / self.search_size,
+                                       target_h / self.search_size]))
+            boxes.append(torch.tensor([cx, cy, target_w, target_h]))
 
         return {
-            'template': template,
-            ...
-            'boxes': boxes,
+            'template': template,                       # (3, 128, 128)
+            'searches': torch.stack(searches, dim=0),   # (K, 3, 256, 256)
+            'heatmaps': torch.stack(heatmaps, dim=0),   # (K, 1, 16, 16)
+            'sizes': torch.stack(sizes, dim=0),         # (K, 2)
+            'boxes': torch.stack(boxes, dim=0),         # (K, 4)
         }
 
     def set_acl_difficulty(self, difficulty: float):
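
A hedged smoke test of the synthetic clips (assumes template_size defaults to 128, matching the shape comments above). Only the geometry is seeded by random.Random(idx); the torch.randn background noise is not, so boxes and heatmaps are reproducible per index while pixels are not:

    import torch
    from vil_tracker.data.dataset import SyntheticTrackingDataset

    ds = SyntheticTrackingDataset(length=16, clip_length=3)
    a, b = ds[5], ds[5]
    assert torch.equal(a['boxes'], b['boxes'])             # rng seeded by idx -> same trajectory
    assert not torch.equal(a['searches'], b['searches'])   # unseeded torch.randn noise differs
    print(a['searches'].shape)  # torch.Size([3, 3, 256, 256])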

@@ -1202,5 +1268,5 @@ def build_tracking_dataset(
 class TrackingDataset(SyntheticTrackingDataset):
     """Backward-compatible alias for SyntheticTrackingDataset."""
     def __init__(self, data_dir=None, split='train', synthetic=False,
-                 synthetic_length=10000, **kwargs):
-        super().__init__(length=synthetic_length, **kwargs)
+                 synthetic_length=10000, clip_length=3, **kwargs):
+        super().__init__(length=synthetic_length, clip_length=clip_length, **kwargs)
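
Call sites written before this commit keep working: per the alias above, data_dir, split, and synthetic are still accepted but not forwarded, and the new clip_length simply threads through to the synthetic dataset. For example:

    from vil_tracker.data.dataset import TrackingDataset

    ds_old = TrackingDataset(synthetic=True, synthetic_length=1000)  # pre-commit style, K defaults to 3
    ds_new = TrackingDataset(synthetic_length=1000, clip_length=5)   # opt in to longer clips
    assert ds_new[0]['searches'].shape[0] == 5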