Fix vil_tracker/data/dataset.py: audit corrections
vil_tracker/data/dataset.py  +769 -60
CHANGED
@@ -1,80 +1,714 @@
"""
-Tracking dataset with

Supports:
-- GOT-10k
- Synthetic data generation for testing (no external data needed)
- ACL (Adaptive Curriculum Learning) difficulty scaling
-- Standard tracking augmentations: jitter, flip, color
"""

import os
import math
import random
import torch
import numpy as np
-from


-class
-    """

-
-
-
-
-
-
    """

    def __init__(
        self,
-        data_dir: str = None,
-        split: str = 'train',
        template_size: int = 128,
        search_size: int = 256,
        feat_size: int = 16,
        acl_difficulty: float = 1.0,
-
-
    ):
        super().__init__()
        self.template_size = template_size
        self.search_size = search_size
        self.feat_size = feat_size
        self.acl_difficulty = acl_difficulty
-        self.
-        self.
-
-        if
-            self.samples = list(range(synthetic_length))
-        else:
-            self.samples = self._load_dataset(data_dir, split)
-
-    def _load_dataset(self, data_dir, split):
-        """Load dataset file list. Returns list of sample dicts."""
-        samples = []
-        if data_dir and os.path.exists(data_dir):
-            # Load real dataset
-            ann_file = os.path.join(data_dir, f'{split}.json')
-            if os.path.exists(ann_file):
-                import json
-                with open(ann_file, 'r') as f:
-                    samples = json.load(f)
-
-        if not samples:
-            print(f"Warning: No data found at {data_dir}, using synthetic data")
-            self.synthetic = True
-            self.synthetic_length = 10000
-            return list(range(self.synthetic_length))
-
-        return samples

    def __len__(self):
-        return len(self.

-    def
-        """
        rng = random.Random(idx)

        # Random target size (relative to search region)
@@ -88,7 +722,7 @@ class TrackingDataset(Dataset):
        cx = max(target_w / 2, min(self.search_size - target_w / 2, cx))
        cy = max(target_h / 2, min(self.search_size - target_h / 2, cy))

-        # Create synthetic images
        template = torch.randn(3, self.template_size, self.template_size) * 0.1
        search = torch.randn(3, self.search_size, self.search_size) * 0.1

@@ -97,7 +731,7 @@ class TrackingDataset(Dataset):
        t_half_h = int(min(target_h / 2, self.template_size / 2 - 1))
        tc = self.template_size // 2
        color = torch.tensor([rng.random(), rng.random(), rng.random()]).view(3, 1, 1)
-        template[:, tc-t_half_h:tc+t_half_h, tc-t_half_w:tc+t_half_w] = color

        # Draw target in search region
        sx1 = max(0, int(cx - target_w / 2))
@@ -119,10 +753,7 @@ class TrackingDataset(Dataset):
        dist_sq = (xx - cx_feat) ** 2 + (yy - cy_feat) ** 2
        heatmap = torch.exp(-dist_sq / (2 * sigma ** 2)).unsqueeze(0)

-        # Normalized size
        size = torch.tensor([target_w / self.search_size, target_h / self.search_size])
-
-        # Box in pixels
        boxes = torch.tensor([cx, cy, target_w, target_h])

        return {
@@ -133,15 +764,93 @@ class TrackingDataset(Dataset):
            'boxes': boxes,
        }

-    def __getitem__(self, idx):
-        if self.synthetic:
-            return self._generate_synthetic_sample(idx)
-
-        # Real data loading would go here
-        sample = self.samples[idx]
-        # ... load images, compute crops, generate targets
-        return self._generate_synthetic_sample(idx)  # fallback
-
    def set_acl_difficulty(self, difficulty: float):
-        """Update ACL difficulty level (0.0 = easy, 1.0 = hard)."""
        self.acl_difficulty = min(1.0, max(0.0, difficulty))
Updated vil_tracker/data/dataset.py:

"""
Tracking dataset with real dataset loaders and synthetic fallback.

Supports:
- GOT-10k: train split (~10k sequences, annotations in groundtruth.txt)
- LaSOT: training split (1120 sequences, 14 categories)
- TrackingNet: training split (30k+ sequences, annotations in anno/)
- COCO detection: for static pair pretraining (bbox crops as pseudo-sequences)
- Synthetic data generation for testing (no external data needed)
- ACL (Adaptive Curriculum Learning) difficulty scaling
- Standard tracking augmentations: spatial jitter, horizontal flip, color jitter,
  grayscale, Gaussian blur, brightness/contrast

Each sample produces a (template, search) pair from the same video sequence
with controlled temporal distance, plus GT annotations.

Dataset directory structure expected:
    GOT-10k/
        train/
            GOT-10k_Train_000001/
                00000001.jpg, 00000002.jpg, ...
                groundtruth.txt   # x,y,w,h per line
            ...
    LaSOT/
        airplane/
            airplane-1/
                img/
                    00000001.jpg, ...
                groundtruth.txt   # x,y,w,h per line
        ...
    TrackingNet/
        TRAIN_0/
            frames/
                video_name/
                    0.jpg, 1.jpg, ...
            anno/
                video_name.txt    # x,y,w,h per line
        ...
    COCO/
        train2017/
            *.jpg
        annotations/
            instances_train2017.json
"""

import os
import math
import glob
import random
import torch
import numpy as np
from pathlib import Path
from torch.utils.data import Dataset, ConcatDataset


# ============================================================
# Augmentations (no torchvision dependency, works with tensors)
# ============================================================

class TrackingAugmentation:
    """Standard tracking augmentations applied to (template, search) pairs.

    Augmentations preserve the spatial relationship between search region
    and GT bounding box by applying augmentations consistently.
    """

    def __init__(
        self,
        brightness: float = 0.2,
        contrast: float = 0.2,
        saturation: float = 0.2,
        grayscale_prob: float = 0.05,
        horizontal_flip_prob: float = 0.5,
        blur_prob: float = 0.1,
        blur_sigma: tuple = (0.1, 2.0),
    ):
        self.brightness = brightness
        self.contrast = contrast
        self.saturation = saturation
        self.grayscale_prob = grayscale_prob
        self.horizontal_flip_prob = horizontal_flip_prob
        self.blur_prob = blur_prob
        self.blur_sigma = blur_sigma

    def __call__(self, template: torch.Tensor, search: torch.Tensor,
                 bbox: torch.Tensor) -> tuple:
        """
        Args:
            template: (3, H_t, W_t) tensor in [0, 1]
            search: (3, H_s, W_s) tensor in [0, 1]
            bbox: (4,) tensor [cx, cy, w, h] in search region pixels
        Returns:
            template, search, bbox (augmented)
        """
        # Color jitter (same for template and search to maintain appearance consistency)
        if random.random() < 0.8:
            # Brightness
            factor = 1.0 + random.uniform(-self.brightness, self.brightness)
            template = (template * factor).clamp(0, 1)
            search = (search * factor).clamp(0, 1)

            # Contrast
            factor = 1.0 + random.uniform(-self.contrast, self.contrast)
            t_mean = template.mean()
            s_mean = search.mean()
            template = ((template - t_mean) * factor + t_mean).clamp(0, 1)
            search = ((search - s_mean) * factor + s_mean).clamp(0, 1)

        # Grayscale
        if random.random() < self.grayscale_prob:
            t_gray = template.mean(dim=0, keepdim=True).expand_as(template)
            s_gray = search.mean(dim=0, keepdim=True).expand_as(search)
            template = t_gray
            search = s_gray

        # Horizontal flip (must also flip bbox cx)
        if random.random() < self.horizontal_flip_prob:
            template = template.flip(-1)
            search = search.flip(-1)
            W_s = search.shape[-1]
            bbox = bbox.clone()
            bbox[0] = W_s - bbox[0]  # flip cx

        # Gaussian blur (search only — simulates motion blur)
        if random.random() < self.blur_prob:
            sigma = random.uniform(*self.blur_sigma)
            kernel_size = int(2 * round(3 * sigma) + 1)
            if kernel_size >= 3:
                search = self._gaussian_blur(search, kernel_size, sigma)

        return template, search, bbox

    @staticmethod
    def _gaussian_blur(img: torch.Tensor, kernel_size: int, sigma: float) -> torch.Tensor:
        """Apply Gaussian blur to a (C, H, W) tensor."""
        import torch.nn.functional as F

        # Create 1D Gaussian kernel
        x = torch.arange(kernel_size, dtype=img.dtype, device=img.device) - kernel_size // 2
        kernel_1d = torch.exp(-0.5 * (x / sigma) ** 2)
        kernel_1d = kernel_1d / kernel_1d.sum()

        # Apply separable 2D blur
        pad = kernel_size // 2
        img = img.unsqueeze(0)  # (1, C, H, W)

        # Horizontal
        k_h = kernel_1d.view(1, 1, 1, -1).expand(img.shape[1], -1, -1, -1)
        img = F.conv2d(F.pad(img, (pad, pad, 0, 0), mode='reflect'),
                       k_h, groups=img.shape[1])

        # Vertical
        k_v = kernel_1d.view(1, 1, -1, 1).expand(img.shape[1], -1, -1, -1)
        img = F.conv2d(F.pad(img, (0, 0, pad, pad), mode='reflect'),
                       k_v, groups=img.shape[1])

        return img.squeeze(0)

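The augmentation class above is self-contained, so it can be exercised on dummy tensors without any dataset on disk. A minimal sketch (the import path and the 128/256 crop sizes are assumptions for illustration; forcing the flip makes the bbox handling checkable):

    import torch
    from vil_tracker.data.dataset import TrackingAugmentation  # assumed import path

    aug = TrackingAugmentation(horizontal_flip_prob=1.0)  # force the flip branch
    template = torch.rand(3, 128, 128)
    search = torch.rand(3, 256, 256)
    bbox = torch.tensor([100.0, 120.0, 40.0, 30.0])  # [cx, cy, w, h] in search pixels

    t_aug, s_aug, b_aug = aug(template, search, bbox)
    assert b_aug[0].item() == 256 - 100.0  # cx is mirrored; cy, w, h are untouched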
# ============================================================
# Crop utilities
# ============================================================

def crop_and_resize(image: np.ndarray, center: np.ndarray, size: float,
                    output_size: int) -> np.ndarray:
    """Crop a square region from image, centered at center, with given size.

    Args:
        image: (H, W, 3) numpy array, uint8 or float
        center: (2,) [cx, cy] in image coordinates
        size: side length of the square crop
        output_size: resize crop to (output_size, output_size)
    Returns:
        (output_size, output_size, 3) numpy array
    """
    H, W = image.shape[:2]
    half = size / 2

    x1 = int(round(center[0] - half))
    y1 = int(round(center[1] - half))
    x2 = int(round(center[0] + half))
    y2 = int(round(center[1] + half))

    # Boundary padding
    pad_left = max(0, -x1)
    pad_top = max(0, -y1)
    pad_right = max(0, x2 - W)
    pad_bottom = max(0, y2 - H)

    x1c = max(0, x1)
    y1c = max(0, y1)
    x2c = min(W, x2)
    y2c = min(H, y2)

    crop = image[y1c:y2c, x1c:x2c]

    if pad_left > 0 or pad_top > 0 or pad_right > 0 or pad_bottom > 0:
        mean_color = image.mean(axis=(0, 1))
        padded = np.full((crop.shape[0] + pad_top + pad_bottom,
                          crop.shape[1] + pad_left + pad_right, 3),
                         mean_color, dtype=crop.dtype)
        padded[pad_top:pad_top + crop.shape[0], pad_left:pad_left + crop.shape[1]] = crop
        crop = padded

    # Resize
    if crop.shape[0] > 0 and crop.shape[1] > 0:
        import torch.nn.functional as F
        crop_t = torch.from_numpy(crop.copy()).float().permute(2, 0, 1).unsqueeze(0)
        crop_t = F.interpolate(crop_t, size=(output_size, output_size),
                               mode='bilinear', align_corners=False)
        crop = crop_t.squeeze(0).permute(1, 2, 0).numpy()
    else:
        crop = np.zeros((output_size, output_size, 3), dtype=np.float32)

    return crop


def compute_crop_params(bbox: np.ndarray, context_factor: float = 2.0) -> tuple:
    """Compute crop center and size from bbox with context.

    Args:
        bbox: [x, y, w, h] bounding box
        context_factor: how much context around bbox (2.0 = 2x target size)
    Returns:
        center: (2,) [cx, cy]
        crop_size: scalar side length
    """
    x, y, w, h = bbox
    cx = x + w / 2
    cy = y + h / 2

    # Context amount following the STARK/OSTrack convention:
    # s = sqrt((w + p) * (h + p)), where p = (w + h) / 2
    p = (w + h) / 2
    crop_size = math.sqrt((w + p) * (h + p)) * context_factor
    crop_size = max(crop_size, 10)

    return np.array([cx, cy]), crop_size

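To make the context rule above concrete: for an illustrative 40x30 target, p = (40 + 30) / 2 = 35, so the base size is sqrt((40 + 35) * (30 + 35)) = sqrt(4875) ≈ 69.8 px; with context_factor=2.0 the template crop is ≈ 139.6 px per side, and with 4.0 the search crop is ≈ 279.3 px. A two-line check (import path assumed):

    import numpy as np
    from vil_tracker.data.dataset import compute_crop_params  # assumed import path

    center, crop_size = compute_crop_params(np.array([10.0, 20.0, 40.0, 30.0]), context_factor=2.0)
    # center -> [30.0, 35.0] (bbox centre), crop_size -> ~139.6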
# ============================================================
# Base sequence dataset
# ============================================================

class SequenceDataset(Dataset):
    """Base class for tracking sequence datasets.

    Subclasses must populate self.sequences with list of:
        {'frames': [path1, path2, ...], 'gt': [[x,y,w,h], ...]}
    """

    def __init__(
        self,
        template_size: int = 128,
        search_size: int = 256,
        feat_size: int = 16,
        acl_difficulty: float = 1.0,
        max_gap: int = 100,
        augmentation: bool = True,
    ):
        super().__init__()
        self.template_size = template_size
        self.search_size = search_size
        self.feat_size = feat_size
        self.acl_difficulty = acl_difficulty
        self.max_gap = max_gap
        self.sequences = []

        self.augmentation = TrackingAugmentation() if augmentation else None

    def __len__(self):
        return len(self.sequences)

    def _load_image(self, path: str) -> np.ndarray:
        """Load image from path. Returns (H, W, 3) float32 in [0, 255]."""
        try:
            from PIL import Image
            img = Image.open(path).convert('RGB')
            return np.array(img, dtype=np.float32)
        except ImportError:
            # Fallback with OpenCV
            import cv2
            img = cv2.imread(path)
            if img is None:
                return np.zeros((480, 640, 3), dtype=np.float32)
            return cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)

    def _sample_pair(self, idx: int) -> tuple:
        """Sample a (template_frame_idx, search_frame_idx) pair.

        Temporal distance controlled by ACL difficulty:
        - difficulty=0: template and search are very close
        - difficulty=1: template and search can be up to max_gap apart

        Returns:
            (template_idx, search_idx) frame indices
        """
        seq = self.sequences[idx]
        n_frames = len(seq['frames'])

        # Template: sample random frame with valid annotation
        valid_indices = [i for i in range(n_frames) if seq['gt'][i] is not None and
                         seq['gt'][i][2] > 0 and seq['gt'][i][3] > 0]

        if len(valid_indices) < 2:
            t_idx = valid_indices[0] if valid_indices else 0
            return t_idx, t_idx

        t_idx = random.choice(valid_indices)

        # Search: within difficulty-scaled temporal gap
        effective_gap = max(1, int(self.max_gap * self.acl_difficulty))
        min_idx = max(0, t_idx - effective_gap)
        max_idx = min(n_frames - 1, t_idx + effective_gap)

        # Only pick valid indices
        search_candidates = [i for i in range(min_idx, max_idx + 1)
                             if i != t_idx and i in valid_indices]

        if not search_candidates:
            return t_idx, t_idx

        s_idx = random.choice(search_candidates)
        return t_idx, s_idx

    def __getitem__(self, idx):
        seq = self.sequences[idx % len(self.sequences)]
        t_idx, s_idx = self._sample_pair(idx % len(self.sequences))

        # Load images
        t_img = self._load_image(seq['frames'][t_idx])
        s_img = self._load_image(seq['frames'][s_idx])

        t_bbox = np.array(seq['gt'][t_idx], dtype=np.float32)  # [x, y, w, h]
        s_bbox = np.array(seq['gt'][s_idx], dtype=np.float32)

        # Crop template (centered on target, 2x context)
        t_center, t_crop_size = compute_crop_params(t_bbox, context_factor=2.0)
        template = crop_and_resize(t_img, t_center, t_crop_size, self.template_size)

        # Crop search region (centered on target with jitter, 4x context)
        s_center, s_crop_size = compute_crop_params(s_bbox, context_factor=4.0)

        # Add spatial jitter (controlled by ACL difficulty)
        jitter = self.acl_difficulty * s_bbox[2:4].mean() * 0.3
        s_center[0] += random.gauss(0, jitter) if jitter > 0 else 0
        s_center[1] += random.gauss(0, jitter) if jitter > 0 else 0

        search = crop_and_resize(s_img, s_center, s_crop_size, self.search_size)

        # Compute GT in search crop coordinates
        # Target center relative to crop center, then scaled to search_size
        scale = self.search_size / s_crop_size
        cx_in_search = (s_bbox[0] + s_bbox[2] / 2 - s_center[0] + s_crop_size / 2) * scale
        cy_in_search = (s_bbox[1] + s_bbox[3] / 2 - s_center[1] + s_crop_size / 2) * scale
        w_in_search = s_bbox[2] * scale
        h_in_search = s_bbox[3] * scale

        # Clamp to search region
        cx_in_search = max(0, min(self.search_size, cx_in_search))
        cy_in_search = max(0, min(self.search_size, cy_in_search))
        w_in_search = max(1, min(self.search_size, w_in_search))
        h_in_search = max(1, min(self.search_size, h_in_search))

        # Convert to tensors [0, 1]
        template = torch.from_numpy(template).float().permute(2, 0, 1) / 255.0
        search = torch.from_numpy(search).float().permute(2, 0, 1) / 255.0
        bbox_tensor = torch.tensor([cx_in_search, cy_in_search, w_in_search, h_in_search])

        # Apply augmentations
        if self.augmentation is not None:
            template, search, bbox_tensor = self.augmentation(template, search, bbox_tensor)

        # Generate GT heatmap
        stride = self.search_size / self.feat_size
        cx_feat = bbox_tensor[0].item() / stride
        cy_feat = bbox_tensor[1].item() / stride

        y = torch.arange(self.feat_size, dtype=torch.float32)
        x = torch.arange(self.feat_size, dtype=torch.float32)
        yy, xx = torch.meshgrid(y, x, indexing='ij')

        # Adaptive sigma based on target size (smaller targets = sharper heatmap)
        sigma = max(1.0, min(3.0, (w_in_search + h_in_search) / (2 * stride * 4)))
        dist_sq = (xx - cx_feat) ** 2 + (yy - cy_feat) ** 2
        heatmap = torch.exp(-dist_sq / (2 * sigma ** 2)).unsqueeze(0)

        # Normalized size
        size = torch.tensor([bbox_tensor[2].item() / self.search_size,
                             bbox_tensor[3].item() / self.search_size])

        return {
            'template': template,
            'search': search,
            'heatmap': heatmap,
            'size': size,
            'boxes': bbox_tensor,
        }

    def set_acl_difficulty(self, difficulty: float):
        """Update ACL difficulty level (0.0 = easy, 1.0 = hard)."""
        self.acl_difficulty = min(1.0, max(0.0, difficulty))

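Because acl_difficulty gates both the temporal gap in _sample_pair and the spatial jitter in __getitem__, a training script would normally ramp it between epochs through set_acl_difficulty. A hedged sketch; the linear schedule, the epoch count, and the train_dataset name are assumptions, not something this commit defines:

    num_epochs = 60  # assumed schedule length
    for epoch in range(num_epochs):
        difficulty = epoch / (num_epochs - 1)  # 0.0 -> 1.0 linearly
        # works for a ConcatDataset (update every member) or a single dataset
        for member in getattr(train_dataset, 'datasets', [train_dataset]):
            member.set_acl_difficulty(difficulty)
        # ... run one training epoch ...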
# ============================================================
# GOT-10k dataset loader
# ============================================================

class GOT10kDataset(SequenceDataset):
    """GOT-10k tracking dataset.

    Structure:
        root/train/GOT-10k_Train_NNNNNN/
            00000001.jpg, 00000002.jpg, ...
            groundtruth.txt  # x,y,w,h per line
    """

    def __init__(self, root: str, split: str = 'train', **kwargs):
        super().__init__(**kwargs)
        self.root = Path(root)
        self._load_sequences(split)

    def _load_sequences(self, split):
        split_dir = self.root / split
        if not split_dir.exists():
            print(f"Warning: GOT-10k {split} not found at {split_dir}")
            return

        seq_dirs = sorted([d for d in split_dir.iterdir() if d.is_dir() and 'Train' in d.name])
        print(f"Loading GOT-10k {split}: found {len(seq_dirs)} sequences")

        for seq_dir in seq_dirs:
            gt_file = seq_dir / 'groundtruth.txt'
            if not gt_file.exists():
                continue

            # Load annotations
            gt_boxes = []
            with open(gt_file, 'r') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        gt_boxes.append(None)
                        continue
                    parts = line.replace(',', ' ').split()
                    try:
                        gt_boxes.append([float(x) for x in parts[:4]])
                    except ValueError:
                        gt_boxes.append(None)

            # Get frame paths
            frames = sorted(glob.glob(str(seq_dir / '*.jpg')))
            if not frames:
                frames = sorted(glob.glob(str(seq_dir / '*.png')))

            if len(frames) != len(gt_boxes):
                # Trim to shorter
                min_len = min(len(frames), len(gt_boxes))
                frames = frames[:min_len]
                gt_boxes = gt_boxes[:min_len]

            if len(frames) >= 2:
                self.sequences.append({'frames': frames, 'gt': gt_boxes})

        print(f"  Loaded {len(self.sequences)} GOT-10k sequences")


# ============================================================
# LaSOT dataset loader
# ============================================================

class LaSOTDataset(SequenceDataset):
    """LaSOT tracking dataset.

    Structure:
        root/
            airplane/
                airplane-1/
                    img/
                        00000001.jpg, ...
                    groundtruth.txt  # x,y,w,h per line
            ...
    """

    def __init__(self, root: str, split: str = 'train', **kwargs):
        super().__init__(**kwargs)
        self.root = Path(root)
        self._load_sequences(split)

    def _load_sequences(self, split):
        if not self.root.exists():
            print(f"Warning: LaSOT not found at {self.root}")
            return

        # LaSOT train/test split defined by sequence names
        # Training: first 80% of sequences per category
        categories = sorted([d for d in self.root.iterdir() if d.is_dir()])
        total_seqs = 0

        for cat_dir in categories:
            seq_dirs = sorted([d for d in cat_dir.iterdir() if d.is_dir()])

            # Train/test split
            if split == 'train':
                seq_dirs = seq_dirs[:int(len(seq_dirs) * 0.8)]
            else:
                seq_dirs = seq_dirs[int(len(seq_dirs) * 0.8):]

            for seq_dir in seq_dirs:
                gt_file = seq_dir / 'groundtruth.txt'
                img_dir = seq_dir / 'img'

                if not gt_file.exists() or not img_dir.exists():
                    continue

                # Load annotations
                gt_boxes = []
                with open(gt_file, 'r') as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            gt_boxes.append(None)
                            continue
                        parts = line.replace(',', ' ').split()
                        try:
                            gt_boxes.append([float(x) for x in parts[:4]])
                        except ValueError:
                            gt_boxes.append(None)

                frames = sorted(glob.glob(str(img_dir / '*.jpg')))

                if len(frames) != len(gt_boxes):
                    min_len = min(len(frames), len(gt_boxes))
                    frames = frames[:min_len]
                    gt_boxes = gt_boxes[:min_len]

                if len(frames) >= 2:
                    self.sequences.append({'frames': frames, 'gt': gt_boxes})
                    total_seqs += 1

        print(f"  Loaded {total_seqs} LaSOT {split} sequences across {len(categories)} categories")

# ============================================================
# TrackingNet dataset loader
# ============================================================

class TrackingNetDataset(SequenceDataset):
    """TrackingNet tracking dataset.

    Structure:
        root/
            TRAIN_0/
                frames/
                    video_name/
                        0.jpg, 1.jpg, ...
                anno/
                    video_name.txt  # x,y,w,h per line
            TRAIN_1/
            ...
    """

    def __init__(self, root: str, chunks: list = None, **kwargs):
        super().__init__(**kwargs)
        self.root = Path(root)
        if chunks is None:
            chunks = list(range(12))  # TRAIN_0 through TRAIN_11
        self._load_sequences(chunks)

    def _load_sequences(self, chunks):
        if not self.root.exists():
            print(f"Warning: TrackingNet not found at {self.root}")
            return

        total_seqs = 0
        for chunk_idx in chunks:
            chunk_dir = self.root / f'TRAIN_{chunk_idx}'
            if not chunk_dir.exists():
                continue

            anno_dir = chunk_dir / 'anno'
            frames_dir = chunk_dir / 'frames'

            if not anno_dir.exists() or not frames_dir.exists():
                continue

            for anno_file in sorted(anno_dir.glob('*.txt')):
                seq_name = anno_file.stem
                seq_frames_dir = frames_dir / seq_name

                if not seq_frames_dir.exists():
                    continue

                # Load annotations
                gt_boxes = []
                with open(anno_file, 'r') as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            gt_boxes.append(None)
                            continue
                        parts = line.replace(',', ' ').split()
                        try:
                            gt_boxes.append([float(x) for x in parts[:4]])
                        except ValueError:
                            gt_boxes.append(None)

                # Frames are named 0.jpg, 1.jpg, ... so sort numerically rather than
                # lexicographically (which would order them 0, 1, 10, 100, 11, ...).
                def _frame_key(p):
                    stem = Path(p).stem
                    return int(stem) if stem.isdigit() else -1

                frames = sorted(glob.glob(str(seq_frames_dir / '*.jpg')), key=_frame_key)
                if not frames:
                    frames = sorted(glob.glob(str(seq_frames_dir / '*.png')), key=_frame_key)

                if len(frames) != len(gt_boxes):
                    min_len = min(len(frames), len(gt_boxes))
                    frames = frames[:min_len]
                    gt_boxes = gt_boxes[:min_len]

                if len(frames) >= 2:
                    self.sequences.append({'frames': frames, 'gt': gt_boxes})
                    total_seqs += 1

        print(f"  Loaded {total_seqs} TrackingNet sequences from {len(chunks)} chunks")

# ============================================================
# COCO detection as pseudo-sequences
# ============================================================

class COCODetDataset(SequenceDataset):
    """COCO detection images as pseudo-sequences for pretraining.

    Each image with a valid bounding box becomes a length-1 "sequence"
    where template and search are crops from the same image.
    """

    def __init__(self, root: str, ann_file: str = None, **kwargs):
        super().__init__(**kwargs)
        self.root = Path(root)
        self._load_annotations(ann_file)

    def _load_annotations(self, ann_file):
        if ann_file is None:
            ann_file = str(self.root.parent / 'annotations' / 'instances_train2017.json')

        if not os.path.exists(ann_file):
            print(f"Warning: COCO annotations not found at {ann_file}")
            return

        try:
            import json
            with open(ann_file, 'r') as f:
                coco = json.load(f)

            # Build image lookup
            images = {img['id']: img for img in coco['images']}

            # Create pseudo-sequences from annotations
            for ann in coco['annotations']:
                if ann.get('iscrowd', 0):
                    continue
                bbox = ann['bbox']  # [x, y, w, h]
                if bbox[2] < 10 or bbox[3] < 10:
                    continue

                img_info = images.get(ann['image_id'])
                if img_info is None:
                    continue

                img_path = str(self.root / img_info['file_name'])
                if os.path.exists(img_path):
                    # Pseudo-sequence: same frame for template and search
                    self.sequences.append({
                        'frames': [img_path, img_path],
                        'gt': [bbox, bbox],
                    })

            print(f"  Loaded {len(self.sequences)} COCO pseudo-sequences")

        except Exception as e:
            print(f"Warning: Failed to load COCO annotations: {e}")

# ============================================================
# Synthetic dataset (for testing / no-data development)
# ============================================================

class SyntheticTrackingDataset(Dataset):
    """Synthetic tracking dataset for testing without real data.

    Generates colored rectangles on noise backgrounds with controlled
    position jitter based on ACL difficulty.
    """

    def __init__(
        self,
        length: int = 10000,
        template_size: int = 128,
        search_size: int = 256,
        feat_size: int = 16,
        acl_difficulty: float = 1.0,
    ):
        super().__init__()
        self.length = length
        self.template_size = template_size
        self.search_size = search_size
        self.feat_size = feat_size
        self.acl_difficulty = acl_difficulty

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        rng = random.Random(idx)

        # Random target size (relative to search region)
@@ -88,7 +722,7 @@ class TrackingDataset(Dataset):
        cx = max(target_w / 2, min(self.search_size - target_w / 2, cx))
        cy = max(target_h / 2, min(self.search_size - target_h / 2, cy))

        # Create synthetic images
        template = torch.randn(3, self.template_size, self.template_size) * 0.1
        search = torch.randn(3, self.search_size, self.search_size) * 0.1

@@ -97,7 +731,7 @@ class TrackingDataset(Dataset):
        t_half_h = int(min(target_h / 2, self.template_size / 2 - 1))
        tc = self.template_size // 2
        color = torch.tensor([rng.random(), rng.random(), rng.random()]).view(3, 1, 1)
        template[:, tc - t_half_h:tc + t_half_h, tc - t_half_w:tc + t_half_w] = color

        # Draw target in search region
        sx1 = max(0, int(cx - target_w / 2))
@@ -119,10 +753,7 @@ class TrackingDataset(Dataset):
        dist_sq = (xx - cx_feat) ** 2 + (yy - cy_feat) ** 2
        heatmap = torch.exp(-dist_sq / (2 * sigma ** 2)).unsqueeze(0)

        size = torch.tensor([target_w / self.search_size, target_h / self.search_size])
        boxes = torch.tensor([cx, cy, target_w, target_h])

        return {
@@ -133,15 +764,93 @@ class TrackingDataset(Dataset):
            'boxes': boxes,
        }

    def set_acl_difficulty(self, difficulty: float):
        self.acl_difficulty = min(1.0, max(0.0, difficulty))

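A quick smoke test of the synthetic fallback; the batch size and loader settings are illustrative, and the commented shapes assume the sample dict uses the same keys as SequenceDataset above:

    from torch.utils.data import DataLoader

    ds = SyntheticTrackingDataset(length=64, template_size=128, search_size=256, feat_size=16)
    loader = DataLoader(ds, batch_size=8, shuffle=True, num_workers=0)
    batch = next(iter(loader))
    # expected: batch['template'] (8, 3, 128, 128), batch['search'] (8, 3, 256, 256),
    #           batch['heatmap'] (8, 1, 16, 16), batch['size'] (8, 2), batch['boxes'] (8, 4)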
# ============================================================
# Convenience: build combined dataset
# ============================================================

def build_tracking_dataset(
    data_config: dict,
    template_size: int = 128,
    search_size: int = 256,
    feat_size: int = 16,
    acl_difficulty: float = 0.0,
) -> Dataset:
    """Build a combined tracking dataset from multiple sources.

    Args:
        data_config: dict with optional keys:
            - 'got10k_root': path to GOT-10k dataset
            - 'lasot_root': path to LaSOT dataset
            - 'trackingnet_root': path to TrackingNet dataset
            - 'coco_root': path to COCO train2017 images
            - 'synthetic_length': number of synthetic samples (fallback)
        template_size: template crop size
        search_size: search region crop size
        feat_size: feature map spatial size
        acl_difficulty: initial ACL difficulty
    Returns:
        ConcatDataset or SyntheticTrackingDataset
    """
    common_kwargs = dict(
        template_size=template_size,
        search_size=search_size,
        feat_size=feat_size,
        acl_difficulty=acl_difficulty,
    )

    datasets = []

    if 'got10k_root' in data_config and os.path.exists(data_config['got10k_root']):
        ds = GOT10kDataset(data_config['got10k_root'], split='train', **common_kwargs)
        if len(ds) > 0:
            datasets.append(ds)
            print(f"GOT-10k: {len(ds)} sequences")

    if 'lasot_root' in data_config and os.path.exists(data_config['lasot_root']):
        ds = LaSOTDataset(data_config['lasot_root'], split='train', **common_kwargs)
        if len(ds) > 0:
            datasets.append(ds)
            print(f"LaSOT: {len(ds)} sequences")

    if 'trackingnet_root' in data_config and os.path.exists(data_config['trackingnet_root']):
        ds = TrackingNetDataset(data_config['trackingnet_root'], **common_kwargs)
        if len(ds) > 0:
            datasets.append(ds)
            print(f"TrackingNet: {len(ds)} sequences")

    if 'coco_root' in data_config and os.path.exists(data_config['coco_root']):
        ds = COCODetDataset(data_config['coco_root'], **common_kwargs)
        if len(ds) > 0:
            datasets.append(ds)
            print(f"COCO: {len(ds)} pseudo-sequences")

    if datasets:
        combined = ConcatDataset(datasets)
        print(f"\nTotal training samples: {len(combined)}")
        return combined

    # Fallback to synthetic
    syn_len = data_config.get('synthetic_length', 10000)
    print(f"No real data found, using {syn_len} synthetic samples")
    return SyntheticTrackingDataset(
        length=syn_len,
        template_size=template_size,
        search_size=search_size,
        feat_size=feat_size,
        acl_difficulty=acl_difficulty,
    )


# ============================================================
# Legacy alias for backward compatibility
# ============================================================

class TrackingDataset(SyntheticTrackingDataset):
    """Backward-compatible alias for SyntheticTrackingDataset."""
    def __init__(self, data_dir=None, split='train', synthetic=False,
                 synthetic_length=10000, **kwargs):
        super().__init__(length=synthetic_length, **kwargs)
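Finally, a sketch of how the new entry point might be wired into a training script; the dataset paths and DataLoader settings below are placeholders, not values taken from this repository:

    from torch.utils.data import DataLoader
    from vil_tracker.data.dataset import build_tracking_dataset  # assumed import path

    data_config = {
        'got10k_root': '/data/GOT-10k',      # any subset of these keys may be present
        'lasot_root': '/data/LaSOT',
        'coco_root': '/data/COCO/train2017',
        'synthetic_length': 10000,           # used only if none of the roots exist
    }
    train_dataset = build_tracking_dataset(data_config, template_size=128,
                                           search_size=256, feat_size=16,
                                           acl_difficulty=0.0)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                              num_workers=8, pin_memory=True, drop_last=True)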
|