File size: 48,454 Bytes

"""
Tracking dataset with real dataset loaders and synthetic fallback.

Supports:
- GOT-10k: train split (~10k sequences, annotations in groundtruth.txt)
- LaSOT: training split (1120 sequences, 14 categories)
- TrackingNet: training split (30k+ sequences, annotations in anno/)
- COCO detection: for static pair pretraining (bbox crops as pseudo-sequences)
- Synthetic data generation for testing (no external data needed)
- ACL (Adaptive Curriculum Learning) difficulty scaling
- Standard tracking augmentations: spatial jitter, horizontal flip, color jitter,
  grayscale, Gaussian blur, brightness/contrast

Each sample produces a (template, search) pair from the same video sequence
with controlled temporal distance, plus GT annotations.

Dataset directory structure expected:
    GOT-10k/
        train/
            GOT-10k_Train_000001/
                00000001.jpg, 00000002.jpg, ...
                groundtruth.txt          # x,y,w,h per line
            ...
    LaSOT/
        airplane/
            airplane-1/
                img/
                    00000001.jpg, ...
                groundtruth.txt          # x,y,w,h per line
            ...
    TrackingNet/
        TRAIN_0/
            frames/
                video_name/
                    0.jpg, 1.jpg, ...
            anno/
                video_name.txt           # x,y,w,h per line
        ...
    COCO/
        train2017/
            *.jpg
        annotations/
            instances_train2017.json
"""

import os
import math
import glob
import random
import torch
import numpy as np
from pathlib import Path
from torch.utils.data import Dataset, ConcatDataset


# ============================================================
# Augmentations (no torchvision dependency, works with tensors)
# ============================================================

class TrackingAugmentation:
    """Standard tracking augmentations applied to (template, search) pairs.
    
    Augmentations preserve the spatial relationship between search region
    and GT bounding box by applying augmentations consistently.
    """
    
    def __init__(
        self,
        brightness: float = 0.2,
        contrast: float = 0.2,
        saturation: float = 0.2,
        grayscale_prob: float = 0.05,
        horizontal_flip_prob: float = 0.5,
        blur_prob: float = 0.1,
        blur_sigma: tuple = (0.1, 2.0),
    ):
        self.brightness = brightness
        self.contrast = contrast
        self.saturation = saturation
        self.grayscale_prob = grayscale_prob
        self.horizontal_flip_prob = horizontal_flip_prob
        self.blur_prob = blur_prob
        self.blur_sigma = blur_sigma
    
    def __call__(self, template: torch.Tensor, search: torch.Tensor,
                 bbox: torch.Tensor) -> tuple:
        """
        Args:
            template: (3, H_t, W_t) tensor in [0, 1]
            search: (3, H_s, W_s) tensor in [0, 1]
            bbox: (4,) tensor [cx, cy, w, h] in search region pixels
        Returns:
            template, search, bbox (augmented)
        """
        # Color jitter (same for template and search to maintain appearance consistency)
        if random.random() < 0.8:
            # Brightness
            factor = 1.0 + random.uniform(-self.brightness, self.brightness)
            template = (template * factor).clamp(0, 1)
            search = (search * factor).clamp(0, 1)
            
            # Contrast
            factor = 1.0 + random.uniform(-self.contrast, self.contrast)
            t_mean = template.mean()
            s_mean = search.mean()
            template = ((template - t_mean) * factor + t_mean).clamp(0, 1)
            search = ((search - s_mean) * factor + s_mean).clamp(0, 1)
        
        # Grayscale
        if random.random() < self.grayscale_prob:
            t_gray = template.mean(dim=0, keepdim=True).expand_as(template)
            s_gray = search.mean(dim=0, keepdim=True).expand_as(search)
            template = t_gray
            search = s_gray
        
        # Horizontal flip (must also flip bbox cx)
        if random.random() < self.horizontal_flip_prob:
            template = template.flip(-1)
            search = search.flip(-1)
            W_s = search.shape[-1]
            bbox = bbox.clone()
            bbox[0] = W_s - bbox[0]  # flip cx
        
        # Gaussian blur (search only — simulates motion blur)
        if random.random() < self.blur_prob:
            sigma = random.uniform(*self.blur_sigma)
            kernel_size = int(2 * round(3 * sigma) + 1)
            if kernel_size >= 3:
                search = self._gaussian_blur(search, kernel_size, sigma)
        
        return template, search, bbox
    
    @staticmethod
    def _gaussian_blur(img: torch.Tensor, kernel_size: int, sigma: float) -> torch.Tensor:
        """Apply Gaussian blur to a (C, H, W) tensor."""
        import torch.nn.functional as F
        
        # Create 1D Gaussian kernel
        x = torch.arange(kernel_size, dtype=img.dtype, device=img.device) - kernel_size // 2
        kernel_1d = torch.exp(-0.5 * (x / sigma) ** 2)
        kernel_1d = kernel_1d / kernel_1d.sum()
        
        # Apply separable 2D blur
        pad = kernel_size // 2
        img = img.unsqueeze(0)  # (1, C, H, W)
        
        # Horizontal
        k_h = kernel_1d.view(1, 1, 1, -1).expand(img.shape[1], -1, -1, -1)
        img = F.conv2d(F.pad(img, (pad, pad, 0, 0), mode='reflect'),
                       k_h, groups=img.shape[1])
        
        # Vertical
        k_v = kernel_1d.view(1, 1, -1, 1).expand(img.shape[1], -1, -1, -1)
        img = F.conv2d(F.pad(img, (0, 0, pad, pad), mode='reflect'),
                       k_v, groups=img.shape[1])
        
        return img.squeeze(0)


# ============================================================
# Crop utilities
# ============================================================

def crop_and_resize(image: np.ndarray, center: np.ndarray, size: float,
                    output_size: int) -> np.ndarray:
    """Crop a square region from image, centered at center, with given size.
    
    Args:
        image: (H, W, 3) numpy array, uint8 or float
        center: (2,) [cx, cy] in image coordinates
        size: side length of the square crop
        output_size: resize crop to (output_size, output_size)
    Returns:
        (output_size, output_size, 3) numpy array
    """
    H, W = image.shape[:2]
    half = size / 2
    
    x1 = int(round(center[0] - half))
    y1 = int(round(center[1] - half))
    x2 = int(round(center[0] + half))
    y2 = int(round(center[1] + half))
    
    # Boundary padding
    pad_left = max(0, -x1)
    pad_top = max(0, -y1)
    pad_right = max(0, x2 - W)
    pad_bottom = max(0, y2 - H)
    
    x1c = max(0, x1)
    y1c = max(0, y1)
    x2c = min(W, x2)
    y2c = min(H, y2)
    
    crop = image[y1c:y2c, x1c:x2c]
    
    if pad_left > 0 or pad_top > 0 or pad_right > 0 or pad_bottom > 0:
        mean_color = image.mean(axis=(0, 1))
        padded = np.full((crop.shape[0] + pad_top + pad_bottom,
                          crop.shape[1] + pad_left + pad_right, 3),
                         mean_color, dtype=crop.dtype)
        padded[pad_top:pad_top + crop.shape[0], pad_left:pad_left + crop.shape[1]] = crop
        crop = padded
    
    # Resize
    if crop.shape[0] > 0 and crop.shape[1] > 0:
        import torch.nn.functional as F
        crop_t = torch.from_numpy(crop.copy()).float().permute(2, 0, 1).unsqueeze(0)
        crop_t = F.interpolate(crop_t, size=(output_size, output_size),
                               mode='bilinear', align_corners=False)
        crop = crop_t.squeeze(0).permute(1, 2, 0).numpy()
    else:
        crop = np.zeros((output_size, output_size, 3), dtype=np.float32)
    
    return crop


def compute_crop_params(bbox: np.ndarray, context_factor: float = 2.0) -> tuple:
    """Compute crop center and size from bbox with context.
    
    Args:
        bbox: [x, y, w, h] bounding box
        context_factor: how much context around bbox (2.0 = 2x target size)
    Returns:
        center: (2,) [cx, cy]
        crop_size: scalar side length
    """
    x, y, w, h = bbox
    cx = x + w / 2
    cy = y + h / 2
    
    # Context amount following STARK/OSTrack convention:
    # s = sqrt((w + 2p) * (h + 2p)), where p = (w + h) / 2
    p = (w + h) / 2
    crop_size = math.sqrt((w + p) * (h + p)) * context_factor
    crop_size = max(crop_size, 10)
    
    return np.array([cx, cy]), crop_size


# ============================================================
# Base sequence dataset
# ============================================================

class SequenceDataset(Dataset):
    """Base class for tracking sequence datasets.
    
    Returns K-frame clips: template + K consecutive search frames.
    The mLSTM processes these as one long sequence where memory carries
    information across frames — this is the core training paradigm.
    
    Subclasses must populate self.sequences with list of:
        {'frames': [path1, path2, ...], 'gt': [[x,y,w,h], ...]}
    """
    
    def __init__(
        self,
        template_size: int = 128,
        search_size: int = 256,
        feat_size: int = 16,
        acl_difficulty: float = 1.0,
        max_gap: int = 100,
        clip_length: int = 3,
        augmentation: bool = True,
    ):
        super().__init__()
        self.template_size = template_size
        self.search_size = search_size
        self.feat_size = feat_size
        self.acl_difficulty = acl_difficulty
        self.max_gap = max_gap
        self.clip_length = clip_length  # K search frames per sample
        self.sequences = []
        
        self.augmentation = TrackingAugmentation() if augmentation else None
    
    def __len__(self):
        return len(self.sequences)
    
    def _load_image(self, path: str) -> np.ndarray:
        """Load image from path. Returns (H, W, 3) float32 in [0, 255]."""
        try:
            from PIL import Image
            img = Image.open(path).convert('RGB')
            return np.array(img, dtype=np.float32)
        except ImportError:
            import cv2
            img = cv2.imread(path)
            if img is None:
                return np.zeros((480, 640, 3), dtype=np.float32)
            return cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
    
    def _sample_clip(self, idx: int) -> list:
        """Sample a clip: template frame + K consecutive search frames.
        
        Returns:
            list of frame indices: [template_idx, search_1_idx, ..., search_K_idx]
        """
        seq = self.sequences[idx]
        n_frames = len(seq['frames'])
        K = self.clip_length
        
        valid = [i for i in range(n_frames) 
                 if seq['gt'][i] is not None and seq['gt'][i][2] > 0 and seq['gt'][i][3] > 0]
        valid_set = set(valid)
        
        if len(valid) < K + 1:
            # Not enough frames — repeat what we have
            if len(valid) == 0:
                return [0] * (K + 1)
            return [valid[0]] + [valid[min(i, len(valid)-1)] for i in range(K)]
        
        # Template: pick a random valid frame
        t_idx = random.choice(valid)
        
        # Search frames: K consecutive valid frames AFTER template
        # Temporal gap between template and first search controlled by ACL
        effective_gap = max(1, int(self.max_gap * self.acl_difficulty))
        
        # Find the start of the search clip: somewhere after template
        min_start = t_idx + 1
        max_start = min(t_idx + effective_gap, n_frames - K)
        
        if max_start < min_start:
            # Try before template
            max_start_before = t_idx - K
            min_start_before = max(0, t_idx - effective_gap - K)
            if max_start_before >= min_start_before and max_start_before >= 0:
                clip_start = random.randint(min_start_before, max_start_before)
            else:
                # Fallback: just use whatever consecutive frames we can find
                clip_start = max(0, min(n_frames - K, t_idx + 1))
            # But ensure template is different from search frames
        else:
            clip_start = random.randint(min_start, max(min_start, max_start))
        
        # Collect K consecutive frames, preferring valid ones
        search_indices = []
        for i in range(clip_start, min(clip_start + K * 3, n_frames)):
            if i in valid_set and i != t_idx:
                search_indices.append(i)
            if len(search_indices) == K:
                break
        
        # Pad if we didn't find enough
        while len(search_indices) < K:
            search_indices.append(search_indices[-1] if search_indices else t_idx)
        
        return [t_idx] + search_indices[:K]
    
    def _process_frame(self, img: np.ndarray, bbox: np.ndarray, is_template: bool):
        """Crop and preprocess a single frame.
        
        Returns:
            image_tensor: (3, H, W) float [0, 1]
            bbox_in_crop: (4,) [cx, cy, w, h] in crop coordinates
        """
        if is_template:
            center, crop_size = compute_crop_params(bbox, context_factor=2.0)
            output_size = self.template_size
        else:
            center, crop_size = compute_crop_params(bbox, context_factor=4.0)
            output_size = self.search_size
            # Spatial jitter for search (controlled by ACL)
            jitter = self.acl_difficulty * bbox[2:4].mean() * 0.3
            if jitter > 0:
                center[0] += random.gauss(0, jitter)
                center[1] += random.gauss(0, jitter)
        
        crop = crop_and_resize(img, center, crop_size, output_size)
        
        # Compute GT in crop coordinates
        scale = output_size / crop_size
        cx = (bbox[0] + bbox[2] / 2 - center[0] + crop_size / 2) * scale
        cy = (bbox[1] + bbox[3] / 2 - center[1] + crop_size / 2) * scale
        w = bbox[2] * scale
        h = bbox[3] * scale
        
        cx = max(0, min(output_size, cx))
        cy = max(0, min(output_size, cy))
        w = max(1, min(output_size, w))
        h = max(1, min(output_size, h))
        
        tensor = torch.from_numpy(crop).float().permute(2, 0, 1) / 255.0
        bbox_crop = torch.tensor([cx, cy, w, h])
        
        return tensor, bbox_crop
    
    def _make_heatmap(self, bbox: torch.Tensor):
        """Generate GT heatmap from bbox in search crop coordinates."""
        stride = self.search_size / self.feat_size
        cx_feat = bbox[0].item() / stride
        cy_feat = bbox[1].item() / stride
        w_search = bbox[2].item()
        h_search = bbox[3].item()
        
        y = torch.arange(self.feat_size, dtype=torch.float32)
        x = torch.arange(self.feat_size, dtype=torch.float32)
        yy, xx = torch.meshgrid(y, x, indexing='ij')
        
        sigma = max(1.0, min(3.0, (w_search + h_search) / (2 * stride * 4)))
        dist_sq = (xx - cx_feat) ** 2 + (yy - cy_feat) ** 2
        heatmap = torch.exp(-dist_sq / (2 * sigma ** 2)).unsqueeze(0)
        return heatmap
    
    def __getitem__(self, idx):
        seq = self.sequences[idx % len(self.sequences)]
        clip_indices = self._sample_clip(idx % len(self.sequences))
        
        t_idx = clip_indices[0]
        s_indices = clip_indices[1:]
        K = len(s_indices)
        
        # Load and process template
        t_img = self._load_image(seq['frames'][t_idx])
        t_bbox = np.array(seq['gt'][t_idx], dtype=np.float32)
        template, _ = self._process_frame(t_img, t_bbox, is_template=True)
        
        # Load and process K search frames
        searches = []
        heatmaps = []
        sizes = []
        boxes = []
        
        for s_idx in s_indices:
            s_img = self._load_image(seq['frames'][s_idx])
            s_bbox = np.array(seq['gt'][s_idx], dtype=np.float32)
            search, bbox_crop = self._process_frame(s_img, s_bbox, is_template=False)
            
            # Apply augmentation (same color transform for template+search consistency)
            if self.augmentation is not None:
                template_aug, search, bbox_crop = self.augmentation(template, search, bbox_crop)
                # Only use augmented template from first search frame to keep consistency
                if len(searches) == 0:
                    template = template_aug
            
            searches.append(search)
            heatmaps.append(self._make_heatmap(bbox_crop))
            sizes.append(torch.tensor([bbox_crop[2].item() / self.search_size,
                                       bbox_crop[3].item() / self.search_size]))
            boxes.append(bbox_crop)
        
        return {
            'template': template,                          # (3, 128, 128)
            'searches': torch.stack(searches, dim=0),      # (K, 3, 256, 256)
            'heatmaps': torch.stack(heatmaps, dim=0),      # (K, 1, 16, 16)
            'sizes': torch.stack(sizes, dim=0),             # (K, 2)
            'boxes': torch.stack(boxes, dim=0),             # (K, 4)
        }
    
    def set_acl_difficulty(self, difficulty: float):
        """Update ACL difficulty level (0.0 = easy, 1.0 = hard)."""
        self.acl_difficulty = min(1.0, max(0.0, difficulty))


# ============================================================
# GOT-10k dataset loader
# ============================================================

class GOT10kDataset(SequenceDataset):
    """GOT-10k tracking dataset.
    
    Structure:
        root/train/GOT-10k_Train_NNNNNN/
            00000001.jpg, 00000002.jpg, ...
            groundtruth.txt     # x,y,w,h per line
    """
    
    def __init__(self, root: str, split: str = 'train', **kwargs):
        super().__init__(**kwargs)
        self.root = Path(root)
        self._load_sequences(split)
    
    def _load_sequences(self, split):
        split_dir = self.root / split
        if not split_dir.exists():
            print(f"Warning: GOT-10k {split} not found at {split_dir}")
            return
        
        seq_dirs = sorted([d for d in split_dir.iterdir() if d.is_dir() and 'Train' in d.name])
        print(f"Loading GOT-10k {split}: found {len(seq_dirs)} sequences")
        
        for seq_dir in seq_dirs:
            gt_file = seq_dir / 'groundtruth.txt'
            if not gt_file.exists():
                continue
            
            # Load annotations
            gt_boxes = []
            with open(gt_file, 'r') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        gt_boxes.append(None)
                        continue
                    parts = line.replace(',', ' ').split()
                    try:
                        gt_boxes.append([float(x) for x in parts[:4]])
                    except ValueError:
                        gt_boxes.append(None)
            
            # Get frame paths
            frames = sorted(glob.glob(str(seq_dir / '*.jpg')))
            if not frames:
                frames = sorted(glob.glob(str(seq_dir / '*.png')))
            
            if len(frames) != len(gt_boxes):
                # Trim to shorter
                min_len = min(len(frames), len(gt_boxes))
                frames = frames[:min_len]
                gt_boxes = gt_boxes[:min_len]
            
            if len(frames) >= 2:
                self.sequences.append({'frames': frames, 'gt': gt_boxes})
        
        print(f"  Loaded {len(self.sequences)} GOT-10k sequences")


# ============================================================
# LaSOT dataset loader
# ============================================================

class LaSOTDataset(SequenceDataset):
    """LaSOT tracking dataset.
    
    Structure:
        root/
            airplane/
                airplane-1/
                    img/
                        00000001.jpg, ...
                    groundtruth.txt     # x,y,w,h per line
                ...
    """
    
    def __init__(self, root: str, split: str = 'train', **kwargs):
        super().__init__(**kwargs)
        self.root = Path(root)
        self._load_sequences(split)
    
    def _load_sequences(self, split):
        if not self.root.exists():
            print(f"Warning: LaSOT not found at {self.root}")
            return
        
        # LaSOT train/test split defined by sequence names
        # Training: first 80% of sequences per category
        categories = sorted([d for d in self.root.iterdir() if d.is_dir()])
        total_seqs = 0
        
        for cat_dir in categories:
            seq_dirs = sorted([d for d in cat_dir.iterdir() if d.is_dir()])
            
            # Train/test split
            if split == 'train':
                seq_dirs = seq_dirs[:int(len(seq_dirs) * 0.8)]
            else:
                seq_dirs = seq_dirs[int(len(seq_dirs) * 0.8):]
            
            for seq_dir in seq_dirs:
                gt_file = seq_dir / 'groundtruth.txt'
                img_dir = seq_dir / 'img'
                
                if not gt_file.exists() or not img_dir.exists():
                    continue
                
                # Load annotations
                gt_boxes = []
                with open(gt_file, 'r') as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            gt_boxes.append(None)
                            continue
                        parts = line.replace(',', ' ').split()
                        try:
                            gt_boxes.append([float(x) for x in parts[:4]])
                        except ValueError:
                            gt_boxes.append(None)
                
                frames = sorted(glob.glob(str(img_dir / '*.jpg')))
                
                if len(frames) != len(gt_boxes):
                    min_len = min(len(frames), len(gt_boxes))
                    frames = frames[:min_len]
                    gt_boxes = gt_boxes[:min_len]
                
                if len(frames) >= 2:
                    self.sequences.append({'frames': frames, 'gt': gt_boxes})
                    total_seqs += 1
        
        print(f"  Loaded {total_seqs} LaSOT {split} sequences across {len(categories)} categories")


# ============================================================
# TrackingNet dataset loader
# ============================================================

class TrackingNetDataset(SequenceDataset):
    """TrackingNet tracking dataset.
    
    Structure:
        root/
            TRAIN_0/
                frames/
                    video_name/
                        0.jpg, 1.jpg, ...
                anno/
                    video_name.txt     # x,y,w,h per line
            TRAIN_1/
            ...
    """
    
    def __init__(self, root: str, chunks: list = None, **kwargs):
        super().__init__(**kwargs)
        self.root = Path(root)
        if chunks is None:
            chunks = list(range(12))  # TRAIN_0 through TRAIN_11
        self._load_sequences(chunks)
    
    def _load_sequences(self, chunks):
        if not self.root.exists():
            print(f"Warning: TrackingNet not found at {self.root}")
            return
        
        total_seqs = 0
        for chunk_idx in chunks:
            chunk_dir = self.root / f'TRAIN_{chunk_idx}'
            if not chunk_dir.exists():
                continue
            
            anno_dir = chunk_dir / 'anno'
            frames_dir = chunk_dir / 'frames'
            
            if not anno_dir.exists() or not frames_dir.exists():
                continue
            
            for anno_file in sorted(anno_dir.glob('*.txt')):
                seq_name = anno_file.stem
                seq_frames_dir = frames_dir / seq_name
                
                if not seq_frames_dir.exists():
                    continue
                
                # Load annotations
                gt_boxes = []
                with open(anno_file, 'r') as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            gt_boxes.append(None)
                            continue
                        parts = line.replace(',', ' ').split()
                        try:
                            gt_boxes.append([float(x) for x in parts[:4]])
                        except ValueError:
                            gt_boxes.append(None)
                
                frames = sorted(glob.glob(str(seq_frames_dir / '*.jpg')))
                if not frames:
                    frames = sorted(glob.glob(str(seq_frames_dir / '*.png')))
                
                if len(frames) != len(gt_boxes):
                    min_len = min(len(frames), len(gt_boxes))
                    frames = frames[:min_len]
                    gt_boxes = gt_boxes[:min_len]
                
                if len(frames) >= 2:
                    self.sequences.append({'frames': frames, 'gt': gt_boxes})
                    total_seqs += 1
        
        print(f"  Loaded {total_seqs} TrackingNet sequences from {len(chunks)} chunks")


# ============================================================
# COCO detection as pseudo-sequences
# ============================================================

class COCODetDataset(SequenceDataset):
    """COCO detection images as pseudo-sequences for pretraining.
    
    Each image with a valid bounding box becomes a length-1 "sequence"
    where template and search are crops from the same image.
    """
    
    def __init__(self, root: str, ann_file: str = None, **kwargs):
        super().__init__(**kwargs)
        self.root = Path(root)
        self._load_annotations(ann_file)
    
    def _load_annotations(self, ann_file):
        if ann_file is None:
            ann_file = str(self.root.parent / 'annotations' / 'instances_train2017.json')
        
        if not os.path.exists(ann_file):
            print(f"Warning: COCO annotations not found at {ann_file}")
            return
        
        try:
            import json
            with open(ann_file, 'r') as f:
                coco = json.load(f)
            
            # Build image lookup
            images = {img['id']: img for img in coco['images']}
            
            # Create pseudo-sequences from annotations
            for ann in coco['annotations']:
                if ann.get('iscrowd', 0):
                    continue
                bbox = ann['bbox']  # [x, y, w, h]
                if bbox[2] < 10 or bbox[3] < 10:
                    continue
                
                img_info = images.get(ann['image_id'])
                if img_info is None:
                    continue
                
                img_path = str(self.root / img_info['file_name'])
                if os.path.exists(img_path):
                    # Pseudo-sequence: same frame for template and search
                    self.sequences.append({
                        'frames': [img_path, img_path],
                        'gt': [bbox, bbox],
                    })
            
            print(f"  Loaded {len(self.sequences)} COCO pseudo-sequences")
        
        except Exception as e:
            print(f"Warning: Failed to load COCO annotations: {e}")


# ============================================================
# Synthetic dataset (for testing / no-data development)
# ============================================================

class SyntheticTrackingDataset(Dataset):
    """Synthetic tracking dataset for testing without real data.
    
    Generates K-frame clips: template + K search frames with a moving
    colored rectangle target. Motion is linear with noise.
    """
    
    def __init__(
        self,
        length: int = 10000,
        template_size: int = 128,
        search_size: int = 256,
        feat_size: int = 16,
        acl_difficulty: float = 1.0,
        clip_length: int = 3,
    ):
        super().__init__()
        self.length = length
        self.template_size = template_size
        self.search_size = search_size
        self.feat_size = feat_size
        self.acl_difficulty = acl_difficulty
        self.clip_length = clip_length
    
    def __len__(self):
        return self.length
    
    def _make_heatmap(self, cx, cy, w_search, h_search):
        stride = self.search_size / self.feat_size
        cx_feat = cx / stride
        cy_feat = cy / stride
        y = torch.arange(self.feat_size, dtype=torch.float32)
        x = torch.arange(self.feat_size, dtype=torch.float32)
        yy, xx = torch.meshgrid(y, x, indexing='ij')
        sigma = max(1.0, min(3.0, (w_search + h_search) / (2 * stride * 4)))
        dist_sq = (xx - cx_feat) ** 2 + (yy - cy_feat) ** 2
        return torch.exp(-dist_sq / (2 * sigma ** 2)).unsqueeze(0)
    
    def __getitem__(self, idx):
        rng = random.Random(idx)
        K = self.clip_length
        
        # Target appearance
        color = torch.tensor([rng.random(), rng.random(), rng.random()]).view(3, 1, 1)
        target_w = rng.uniform(0.1, 0.5) * self.search_size
        target_h = rng.uniform(0.1, 0.5) * self.search_size
        
        # Initial position (center of search)
        cx0 = self.search_size / 2
        cy0 = self.search_size / 2
        
        # Velocity (pixels per frame, scaled by difficulty)
        vx = rng.gauss(0, self.acl_difficulty * 15)
        vy = rng.gauss(0, self.acl_difficulty * 15)
        
        # Template: target at center
        template = torch.randn(3, self.template_size, self.template_size) * 0.1
        t_hw = int(min(target_w / 2, self.template_size / 2 - 1))
        t_hh = int(min(target_h / 2, self.template_size / 2 - 1))
        tc = self.template_size // 2
        template[:, tc - t_hh:tc + t_hh, tc - t_hw:tc + t_hw] = color
        
        # K search frames with moving target
        searches = []
        heatmaps = []
        sizes = []
        boxes = []
        
        for k in range(K):
            # Position at frame k
            cx = cx0 + vx * (k + 1) + rng.gauss(0, self.acl_difficulty * 5)
            cy = cy0 + vy * (k + 1) + rng.gauss(0, self.acl_difficulty * 5)
            cx = max(target_w / 2, min(self.search_size - target_w / 2, cx))
            cy = max(target_h / 2, min(self.search_size - target_h / 2, cy))
            
            search = torch.randn(3, self.search_size, self.search_size) * 0.1
            sx1 = max(0, int(cx - target_w / 2))
            sy1 = max(0, int(cy - target_h / 2))
            sx2 = min(self.search_size, int(cx + target_w / 2))
            sy2 = min(self.search_size, int(cy + target_h / 2))
            search[:, sy1:sy2, sx1:sx2] = color
            
            searches.append(search)
            heatmaps.append(self._make_heatmap(cx, cy, target_w, target_h))
            sizes.append(torch.tensor([target_w / self.search_size, 
                                       target_h / self.search_size]))
            boxes.append(torch.tensor([cx, cy, target_w, target_h]))
        
        return {
            'template': template,                          # (3, 128, 128)
            'searches': torch.stack(searches, dim=0),      # (K, 3, 256, 256)
            'heatmaps': torch.stack(heatmaps, dim=0),      # (K, 1, 16, 16)
            'sizes': torch.stack(sizes, dim=0),             # (K, 2)
            'boxes': torch.stack(boxes, dim=0),             # (K, 4)
        }
    
    def set_acl_difficulty(self, difficulty: float):
        self.acl_difficulty = min(1.0, max(0.0, difficulty))


# ============================================================
# VisDrone-SOT dataset loader (UAV)
# ============================================================

class VisDroneSOTDataset(SequenceDataset):
    """VisDrone-SOT single object tracking dataset (drone/UAV perspective).
    
    Structure:
        root/
            VisDrone2019-SOT-train/
                sequences/
                    uav0000001_00000_s/
                        0000001.jpg, 0000002.jpg, ...
                    ...
                annotations/
                    uav0000001_00000_s.txt    # x,y,w,h per line
                    ...
    
    Splits: train (86 sequences, ~70K frames), val (11 sequences), 
            test-dev (35 sequences), test-challenge (35 sequences)
    
    Key for our tracker: real drone footage with small targets, fast motion,
    viewpoint changes, and camera ego-motion — the exact conditions we deploy in.
    """
    
    def __init__(self, root: str, split: str = 'train', **kwargs):
        super().__init__(**kwargs)
        self.root = Path(root)
        self._load_sequences(split)
    
    def _load_sequences(self, split):
        # Try multiple directory naming conventions
        split_names = {
            'train': ['VisDrone2019-SOT-train', 'VisDrone2018-SOT-train', 'train'],
            'val': ['VisDrone2019-SOT-val', 'VisDrone2018-SOT-val', 'val'],
            'test': ['VisDrone2019-SOT-test-dev', 'VisDrone2018-SOT-test', 'test-dev', 'test'],
        }
        
        split_dir = None
        for name in split_names.get(split, [split]):
            candidate = self.root / name
            if candidate.exists():
                split_dir = candidate
                break
            # Also check if root itself is the split dir
            if (self.root / 'sequences').exists():
                split_dir = self.root
                break
        
        if split_dir is None:
            print(f"Warning: VisDrone-SOT {split} not found at {self.root}")
            return
        
        seq_dir = split_dir / 'sequences'
        anno_dir = split_dir / 'annotations'
        
        if not seq_dir.exists() or not anno_dir.exists():
            print(f"Warning: VisDrone-SOT missing sequences/ or annotations/ at {split_dir}")
            return
        
        total_seqs = 0
        for anno_file in sorted(anno_dir.glob('*.txt')):
            seq_name = anno_file.stem
            frames_dir = seq_dir / seq_name
            
            if not frames_dir.exists():
                continue
            
            gt_boxes = []
            with open(anno_file, 'r') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        gt_boxes.append(None)
                        continue
                    parts = line.replace(',', ' ').split()
                    try:
                        gt_boxes.append([float(x) for x in parts[:4]])
                    except ValueError:
                        gt_boxes.append(None)
            
            frames = sorted(glob.glob(str(frames_dir / '*.jpg')))
            if not frames:
                frames = sorted(glob.glob(str(frames_dir / '*.png')))
            
            if len(frames) != len(gt_boxes):
                min_len = min(len(frames), len(gt_boxes))
                frames = frames[:min_len]
                gt_boxes = gt_boxes[:min_len]
            
            if len(frames) >= 2:
                self.sequences.append({'frames': frames, 'gt': gt_boxes})
                total_seqs += 1
        
        print(f"  Loaded {total_seqs} VisDrone-SOT {split} sequences")


# ============================================================
# UAVDT dataset loader (UAV)
# ============================================================

class UAVDTDataset(SequenceDataset):
    """UAVDT (Unmanned Aerial Vehicle Detection and Tracking) dataset.
    
    Structure:
        root/
            UAV-benchmark-S/           # SOT annotations
                {seq_name}/
                    {seq_name}_gt.txt  # x,y,w,h per line (or comma-separated)
            UAV-benchmark-M/           # Frames
                {seq_name}/
                    img000001.jpg, img000002.jpg, ...
    
    Alternative structure (simpler):
        root/
            sequences/
                {seq_name}/
                    img000001.jpg, ...
            annotations/
                {seq_name}_gt.txt
    
    50 sequences total, typically 30 train / 20 test.
    Contains vehicle tracking from drone perspective — complementary to VisDrone.
    """
    
    def __init__(self, root: str, split: str = 'train', **kwargs):
        super().__init__(**kwargs)
        self.root = Path(root)
        self._load_sequences(split)
    
    def _load_sequences(self, split):
        # Try standard UAVDT structure
        anno_dir = self.root / 'UAV-benchmark-S'
        frame_dir = self.root / 'UAV-benchmark-M'
        
        if not anno_dir.exists():
            # Alternative structure
            anno_dir = self.root / 'annotations'
            frame_dir = self.root / 'sequences'
        
        if not anno_dir.exists():
            # Try root directly having sequence dirs
            anno_dir = self.root
            frame_dir = self.root
        
        if not anno_dir.exists():
            print(f"Warning: UAVDT not found at {self.root}")
            return
        
        # Collect all sequences
        all_seqs = []
        
        # Find annotation files
        gt_files = sorted(anno_dir.rglob('*_gt.txt'))
        if not gt_files:
            gt_files = sorted(anno_dir.rglob('*.txt'))
        
        for gt_file in gt_files:
            seq_name = gt_file.stem.replace('_gt', '')
            
            # Find frames directory
            frames_path = None
            for candidate in [
                frame_dir / seq_name,
                frame_dir / seq_name / 'img',
                self.root / seq_name,
            ]:
                if candidate.exists():
                    frames_path = candidate
                    break
            
            if frames_path is None:
                continue
            
            gt_boxes = []
            with open(gt_file, 'r') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        gt_boxes.append(None)
                        continue
                    parts = line.replace(',', ' ').replace('\t', ' ').split()
                    try:
                        gt_boxes.append([float(x) for x in parts[:4]])
                    except (ValueError, IndexError):
                        gt_boxes.append(None)
            
            frames = sorted(glob.glob(str(frames_path / '*.jpg')))
            if not frames:
                frames = sorted(glob.glob(str(frames_path / '*.png')))
            
            if len(frames) != len(gt_boxes):
                min_len = min(len(frames), len(gt_boxes))
                frames = frames[:min_len]
                gt_boxes = gt_boxes[:min_len]
            
            if len(frames) >= 2:
                all_seqs.append({'frames': frames, 'gt': gt_boxes, 'name': seq_name})
        
        # Split: first 60% train, last 40% test (standard UAVDT protocol)
        all_seqs.sort(key=lambda x: x['name'])
        split_idx = int(len(all_seqs) * 0.6)
        
        if split == 'train':
            selected = all_seqs[:split_idx]
        else:
            selected = all_seqs[split_idx:]
        
        for seq in selected:
            self.sequences.append({'frames': seq['frames'], 'gt': seq['gt']})
        
        print(f"  Loaded {len(self.sequences)} UAVDT {split} sequences "
              f"(from {len(all_seqs)} total)")


# ============================================================
# WebUAV-3M dataset loader (UAV, large-scale)
# ============================================================

class WebUAV3MDataset(SequenceDataset):
    """WebUAV-3M: million-scale multi-modal UAV tracking dataset.
    
    Structure:
        root/
            {superclass}/                     # e.g., person, vehicle, animal
                {seq_name}/
                    img/
                        000001.jpg, 000002.jpg, ...
                    groundtruth_rect.txt      # x,y,w,h per line
            OR:
            {seq_name}/
                *.jpg
                groundtruth_rect.txt
    
    4,500 sequences, 3.3M frames, 12 superclasses, 223 target classes.
    Average video length: 710 frames (23.7 seconds at 30 FPS).
    
    This is the largest UAV tracking dataset. All sequences are from real
    drone footage. Purpose-built for training deep UAV trackers.
    """
    
    def __init__(self, root: str, split: str = 'train', max_sequences: int = None, **kwargs):
        super().__init__(**kwargs)
        self.root = Path(root)
        self._load_sequences(split, max_sequences)
    
    def _load_sequences(self, split, max_sequences):
        if not self.root.exists():
            print(f"Warning: WebUAV-3M not found at {self.root}")
            return
        
        # Find all sequences recursively
        all_seq_dirs = []
        
        # Look for groundtruth files recursively
        gt_files = sorted(self.root.rglob('groundtruth_rect.txt'))
        if not gt_files:
            gt_files = sorted(self.root.rglob('groundtruth.txt'))
        
        for gt_file in gt_files:
            seq_dir = gt_file.parent
            # Check for img subdirectory or direct frames
            img_dir = seq_dir / 'img'
            if not img_dir.exists():
                img_dir = seq_dir  # frames directly in seq dir
            
            frames = sorted(glob.glob(str(img_dir / '*.jpg')))
            if not frames:
                frames = sorted(glob.glob(str(img_dir / '*.png')))
            
            if len(frames) >= 2:
                all_seq_dirs.append((gt_file, frames))
        
        print(f"WebUAV-3M: found {len(all_seq_dirs)} sequences total")
        
        # Train/test split (80/20)
        split_idx = int(len(all_seq_dirs) * 0.8)
        if split == 'train':
            selected = all_seq_dirs[:split_idx]
        else:
            selected = all_seq_dirs[split_idx:]
        
        # Optionally limit sequences (WebUAV-3M is huge)
        if max_sequences and len(selected) > max_sequences:
            # Sample uniformly to maintain diversity
            step = len(selected) // max_sequences
            selected = selected[::step][:max_sequences]
        
        for gt_file, frames in selected:
            gt_boxes = []
            with open(gt_file, 'r') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        gt_boxes.append(None)
                        continue
                    parts = line.replace(',', ' ').replace('\t', ' ').split()
                    try:
                        gt_boxes.append([float(x) for x in parts[:4]])
                    except (ValueError, IndexError):
                        gt_boxes.append(None)
            
            if len(frames) != len(gt_boxes):
                min_len = min(len(frames), len(gt_boxes))
                frames = frames[:min_len]
                gt_boxes = gt_boxes[:min_len]
            
            if len(frames) >= 2:
                self.sequences.append({'frames': frames, 'gt': gt_boxes})
        
        print(f"  Loaded {len(self.sequences)} WebUAV-3M {split} sequences")


# ============================================================
# Convenience: build combined dataset
# ============================================================

def build_tracking_dataset(
    data_config: dict,
    template_size: int = 128,
    search_size: int = 256,
    feat_size: int = 16,
    acl_difficulty: float = 0.0,
) -> Dataset:
    """Build a combined tracking dataset from multiple sources.
    
    Standard ground-level datasets provide general tracking capability.
    UAV-specific datasets provide drone-perspective specialization.
    The ACL curriculum bridges the gap: it starts training on easy pairs
    from ground-level data, then progressively incorporates harder pairs
    including UAV sequences with fast motion, small targets, and viewpoint changes.
    
    Args:
        data_config: dict with optional keys:
            Ground-level (standard tracking training data):
            - 'got10k_root': path to GOT-10k dataset
            - 'lasot_root': path to LaSOT dataset
            - 'trackingnet_root': path to TrackingNet dataset
            - 'coco_root': path to COCO train2017 images
            
            UAV-specific (drone perspective — the deployment domain):
            - 'visdrone_root': path to VisDrone-SOT dataset
            - 'uavdt_root': path to UAVDT dataset
            - 'webuav3m_root': path to WebUAV-3M dataset
            - 'webuav3m_max_sequences': limit WebUAV-3M sequences (default: None = all)
            
            Fallback:
            - 'synthetic_length': number of synthetic samples (fallback)
        template_size: template crop size
        search_size: search region crop size
        feat_size: feature map spatial size
        acl_difficulty: initial ACL difficulty
    Returns:
        ConcatDataset or SyntheticTrackingDataset
    """
    common_kwargs = dict(
        template_size=template_size,
        search_size=search_size,
        feat_size=feat_size,
        acl_difficulty=acl_difficulty,
    )
    
    datasets = []
    
    if 'got10k_root' in data_config and os.path.exists(data_config['got10k_root']):
        ds = GOT10kDataset(data_config['got10k_root'], split='train', **common_kwargs)
        if len(ds) > 0:
            datasets.append(ds)
            print(f"GOT-10k: {len(ds)} sequences")
    
    if 'lasot_root' in data_config and os.path.exists(data_config['lasot_root']):
        ds = LaSOTDataset(data_config['lasot_root'], split='train', **common_kwargs)
        if len(ds) > 0:
            datasets.append(ds)
            print(f"LaSOT: {len(ds)} sequences")
    
    if 'trackingnet_root' in data_config and os.path.exists(data_config['trackingnet_root']):
        ds = TrackingNetDataset(data_config['trackingnet_root'], **common_kwargs)
        if len(ds) > 0:
            datasets.append(ds)
            print(f"TrackingNet: {len(ds)} sequences")
    
    if 'coco_root' in data_config and os.path.exists(data_config['coco_root']):
        ds = COCODetDataset(data_config['coco_root'], **common_kwargs)
        if len(ds) > 0:
            datasets.append(ds)
            print(f"COCO: {len(ds)} pseudo-sequences")
    
    # --- UAV-specific datasets (drone perspective) ---
    
    if 'visdrone_root' in data_config and os.path.exists(data_config['visdrone_root']):
        ds = VisDroneSOTDataset(data_config['visdrone_root'], split='train', **common_kwargs)
        if len(ds) > 0:
            datasets.append(ds)
            print(f"VisDrone-SOT: {len(ds)} UAV sequences")
    
    if 'uavdt_root' in data_config and os.path.exists(data_config['uavdt_root']):
        ds = UAVDTDataset(data_config['uavdt_root'], split='train', **common_kwargs)
        if len(ds) > 0:
            datasets.append(ds)
            print(f"UAVDT: {len(ds)} UAV sequences")
    
    if 'webuav3m_root' in data_config and os.path.exists(data_config['webuav3m_root']):
        max_seq = data_config.get('webuav3m_max_sequences', None)
        ds = WebUAV3MDataset(data_config['webuav3m_root'], split='train',
                             max_sequences=max_seq, **common_kwargs)
        if len(ds) > 0:
            datasets.append(ds)
            print(f"WebUAV-3M: {len(ds)} UAV sequences")
    
    if datasets:
        combined = ConcatDataset(datasets)
        print(f"\nTotal training samples: {len(combined)}")
        return combined
    
    # Fallback to synthetic
    syn_len = data_config.get('synthetic_length', 10000)
    print(f"No real data found, using {syn_len} synthetic samples")
    return SyntheticTrackingDataset(
        length=syn_len,
        template_size=template_size,
        search_size=search_size,
        feat_size=feat_size,
        acl_difficulty=acl_difficulty,
    )


# ============================================================
# Legacy alias for backward compatibility
# ============================================================

class TrackingDataset(SyntheticTrackingDataset):
    """Backward-compatible alias for SyntheticTrackingDataset."""
    def __init__(self, data_dir=None, split='train', synthetic=False,
                 synthetic_length=10000, clip_length=3, **kwargs):
        super().__init__(length=synthetic_length, clip_length=clip_length, **kwargs)