omar-ah
/

vil-tracker

Model card Files Files and versions

xet

Community

omar-ah committed 9 days ago

Commit

1322185

verified ·

1 Parent(s): 51d2470

Add UAV dataset loaders: VisDrone-SOT, UAVDT, WebUAV-3M

Browse files

Files changed (1) hide show

vil_tracker/data/dataset.py +350 -0

vil_tracker/data/dataset.py CHANGED Viewed

@@ -768,6 +768,319 @@ class SyntheticTrackingDataset(Dataset):
         self.acl_difficulty = min(1.0, max(0.0, difficulty))
 # ============================================================
 # Convenience: build combined dataset
 # ============================================================
@@ -781,12 +1094,27 @@ def build_tracking_dataset(
 ) -> Dataset:
     """Build a combined tracking dataset from multiple sources.
     Args:
         data_config: dict with optional keys:
             - 'got10k_root': path to GOT-10k dataset
             - 'lasot_root': path to LaSOT dataset
             - 'trackingnet_root': path to TrackingNet dataset
             - 'coco_root': path to COCO train2017 images
             - 'synthetic_length': number of synthetic samples (fallback)
         template_size: template crop size
         search_size: search region crop size
@@ -828,6 +1156,28 @@ def build_tracking_dataset(
             datasets.append(ds)
             print(f"COCO: {len(ds)} pseudo-sequences")
     if datasets:
         combined = ConcatDataset(datasets)
         print(f"\nTotal training samples: {len(combined)}")

         self.acl_difficulty = min(1.0, max(0.0, difficulty))
+# ============================================================
+# VisDrone-SOT dataset loader (UAV)
+# ============================================================
+class VisDroneSOTDataset(SequenceDataset):
+    """VisDrone-SOT single object tracking dataset (drone/UAV perspective).
+    Structure:
+        root/
+            VisDrone2019-SOT-train/
+                sequences/
+                    uav0000001_00000_s/
+                        0000001.jpg, 0000002.jpg, ...
+                    ...
+                annotations/
+                    uav0000001_00000_s.txt    # x,y,w,h per line
+                    ...
+    Splits: train (86 sequences, ~70K frames), val (11 sequences),
+            test-dev (35 sequences), test-challenge (35 sequences)
+    Key for our tracker: real drone footage with small targets, fast motion,
+    viewpoint changes, and camera ego-motion — the exact conditions we deploy in.
+    Loading is best-effort: a missing split directory, or a split directory
+    without sequences/ and annotations/, prints a warning and leaves the
+    dataset empty instead of raising.
+    Args:
+        root: dataset root. Either the parent of the split directories or a
+            split directory itself (one that directly contains sequences/).
+        split: 'train', 'val' or 'test'. Any other value is tried verbatim as
+            a directory name under root.
+        **kwargs: forwarded unchanged to SequenceDataset.__init__.
+    """
+    # Directory names tried, in order, for each split.
+    _SPLIT_DIR_NAMES = {
+        'train': ['VisDrone2019-SOT-train', 'VisDrone2018-SOT-train', 'train'],
+        'val': ['VisDrone2019-SOT-val', 'VisDrone2018-SOT-val', 'val'],
+        'test': ['VisDrone2019-SOT-test-dev', 'VisDrone2018-SOT-test', 'test-dev', 'test'],
+    }
+    def __init__(self, root: str, split: str = 'train', **kwargs):
+        super().__init__(**kwargs)
+        self.root = Path(root)
+        self._load_sequences(split)
+    @staticmethod
+    def _parse_boxes(anno_file):
+        """Read one annotation file into a list with one entry per line.
+        Each entry is [x, y, w, h] as floats, or None when the line is blank,
+        has fewer than four fields, or contains a non-numeric field. Keeping a
+        placeholder for bad lines preserves the line-index == frame-index
+        alignment. Fields may be separated by commas and/or whitespace.
+        """
+        boxes = []
+        with open(anno_file, 'r') as f:
+            for line in f:
+                fields = line.replace(',', ' ').split()
+                if len(fields) < 4:
+                    # Blank or truncated line: never emit a short box.
+                    boxes.append(None)
+                    continue
+                try:
+                    boxes.append([float(v) for v in fields[:4]])
+                except ValueError:
+                    boxes.append(None)
+        return boxes
+    def _find_split_dir(self, split):
+        """Return the directory expected to hold sequences/ and annotations/, or None."""
+        for name in self._SPLIT_DIR_NAMES.get(split, [split]):
+            candidate = self.root / name
+            if candidate.exists():
+                return candidate
+        # Only once every named candidate is ruled out do we accept root itself
+        # as the split directory; checking this earlier would shadow the later
+        # candidate names.
+        if (self.root / 'sequences').exists():
+            return self.root
+        return None
+    def _load_sequences(self, split):
+        """Populate self.sequences with {'frames': [...], 'gt': [...]} dicts.
+        self.sequences is presumably created by SequenceDataset.__init__
+        (not visible from this class) — confirm against the base class.
+        """
+        split_dir = self._find_split_dir(split)
+        if split_dir is None:
+            print(f"Warning: VisDrone-SOT {split} not found at {self.root}")
+            return
+        seq_dir = split_dir / 'sequences'
+        anno_dir = split_dir / 'annotations'
+        if not seq_dir.exists() or not anno_dir.exists():
+            print(f"Warning: VisDrone-SOT missing sequences/ or annotations/ at {split_dir}")
+            return
+        total_seqs = 0
+        for anno_file in sorted(anno_dir.glob('*.txt')):
+            # The annotation file stem doubles as the sequence directory name.
+            frames_dir = seq_dir / anno_file.stem
+            if not frames_dir.is_dir():
+                continue
+            gt_boxes = self._parse_boxes(anno_file)
+            # Prefer JPEG frames; fall back to PNG only when there are none.
+            frames = (sorted(glob.glob(str(frames_dir / '*.jpg')))
+                      or sorted(glob.glob(str(frames_dir / '*.png'))))
+            # Frame and annotation counts can disagree; keep the common prefix.
+            usable = min(len(frames), len(gt_boxes))
+            frames = frames[:usable]
+            gt_boxes = gt_boxes[:usable]
+            # A template/search pair needs at least two frames.
+            if usable >= 2:
+                self.sequences.append({'frames': frames, 'gt': gt_boxes})
+                total_seqs += 1
+        print(f"  Loaded {total_seqs} VisDrone-SOT {split} sequences")
+# ============================================================
+# UAVDT dataset loader (UAV)
+# ============================================================
+class UAVDTDataset(SequenceDataset):
+    """UAVDT (Unmanned Aerial Vehicle Detection and Tracking) dataset.
+    Structure:
+        root/
+            UAV-benchmark-S/           # SOT annotations
+                {seq_name}/
+                    {seq_name}_gt.txt  # x,y,w,h per line (or comma-separated)
+            UAV-benchmark-M/           # Frames
+                {seq_name}/
+                    img000001.jpg, img000002.jpg, ...
+    Alternative structure (simpler):
+        root/
+            sequences/
+                {seq_name}/
+                    img000001.jpg, ...
+            annotations/
+                {seq_name}_gt.txt
+    50 sequences total, typically 30 train / 20 test.
+    Contains vehicle tracking from drone perspective — complementary to VisDrone.
+    The train/test partition is derived here, not read from disk: sequences
+    are sorted by name and the first 60% become 'train', the remainder is
+    returned for every other split value.
+    Args:
+        root: dataset root in one of the layouts above (or a directory that
+            directly contains both the annotation files and sequence folders).
+        split: 'train' selects the first 60% of sequences; anything else
+            selects the remaining 40%.
+        **kwargs: forwarded unchanged to SequenceDataset.__init__.
+    """
+    def __init__(self, root: str, split: str = 'train', **kwargs):
+        super().__init__(**kwargs)
+        self.root = Path(root)
+        self._load_sequences(split)
+    @staticmethod
+    def _parse_boxes(gt_file):
+        """Read one ground-truth file into a list with one entry per line.
+        Each entry is [x, y, w, h] as floats, or None when the line is blank,
+        has fewer than four fields, or contains a non-numeric field. Fields
+        may be separated by commas, tabs and/or spaces.
+        """
+        boxes = []
+        with open(gt_file, 'r') as f:
+            for line in f:
+                # str.split() already treats tabs as separators.
+                fields = line.replace(',', ' ').split()
+                if len(fields) < 4:
+                    # Blank or truncated line: never emit a short box.
+                    boxes.append(None)
+                    continue
+                try:
+                    boxes.append([float(v) for v in fields[:4]])
+                except ValueError:
+                    boxes.append(None)
+        return boxes
+    @staticmethod
+    def _list_frames(directory):
+        """Return sorted *.jpg paths in directory, or sorted *.png if there are none."""
+        frames = sorted(glob.glob(str(directory / '*.jpg')))
+        return frames or sorted(glob.glob(str(directory / '*.png')))
+    def _load_sequences(self, split):
+        """Populate self.sequences with {'frames': [...], 'gt': [...]} dicts.
+        self.sequences is presumably created by SequenceDataset.__init__
+        (not visible from this class) — confirm against the base class.
+        """
+        # Try standard UAVDT structure
+        anno_dir = self.root / 'UAV-benchmark-S'
+        frame_dir = self.root / 'UAV-benchmark-M'
+        if not anno_dir.exists():
+            # Alternative structure
+            anno_dir = self.root / 'annotations'
+            frame_dir = self.root / 'sequences'
+        if not anno_dir.exists():
+            # Try root directly having sequence dirs
+            anno_dir = self.root
+            frame_dir = self.root
+        if not anno_dir.exists():
+            print(f"Warning: UAVDT not found at {self.root}")
+            return
+        # Prefer files named *_gt.txt; only fall back to any *.txt when none exist.
+        gt_files = sorted(anno_dir.rglob('*_gt.txt')) or sorted(anno_dir.rglob('*.txt'))
+        all_seqs = []
+        for gt_file in gt_files:
+            seq_name = gt_file.stem
+            if seq_name.endswith('_gt'):
+                # Strip only the trailing marker; '_gt' elsewhere in the name
+                # is part of the sequence name.
+                seq_name = seq_name[:-len('_gt')]
+            # Pick the first candidate directory that actually contains frames.
+            # Merely existing is not enough: {seq_name}/ exists even when the
+            # images live in {seq_name}/img/.
+            frames = []
+            for candidate in (
+                frame_dir / seq_name,
+                frame_dir / seq_name / 'img',
+                self.root / seq_name,
+            ):
+                if candidate.is_dir():
+                    frames = self._list_frames(candidate)
+                    if frames:
+                        break
+            if not frames:
+                continue
+            gt_boxes = self._parse_boxes(gt_file)
+            # Frame and annotation counts can disagree; keep the common prefix.
+            usable = min(len(frames), len(gt_boxes))
+            frames = frames[:usable]
+            gt_boxes = gt_boxes[:usable]
+            # A template/search pair needs at least two frames.
+            if usable >= 2:
+                all_seqs.append({'frames': frames, 'gt': gt_boxes, 'name': seq_name})
+        # Deterministic split by sequence name: first 60% train, last 40% test.
+        # NOTE(review): this matches the 30/20 proportion quoted in the class
+        # docstring, but is not read from an official split list — confirm it
+        # agrees with the benchmark protocol before reporting test numbers.
+        all_seqs.sort(key=lambda x: x['name'])
+        split_idx = int(len(all_seqs) * 0.6)
+        if split == 'train':
+            selected = all_seqs[:split_idx]
+        else:
+            selected = all_seqs[split_idx:]
+        # 'name' was only needed for ordering; stored entries match the other loaders.
+        self.sequences.extend(
+            {'frames': seq['frames'], 'gt': seq['gt']} for seq in selected
+        )
+        print(f"  Loaded {len(self.sequences)} UAVDT {split} sequences "
+              f"(from {len(all_seqs)} total)")
+# ============================================================
+# WebUAV-3M dataset loader (UAV, large-scale)
+# ============================================================
+class WebUAV3MDataset(SequenceDataset):
+    """WebUAV-3M: million-scale multi-modal UAV tracking dataset.
+    Structure:
+        root/
+            {superclass}/                     # e.g., person, vehicle, animal
+                {seq_name}/
+                    img/
+                        000001.jpg, 000002.jpg, ...
+                    groundtruth_rect.txt      # x,y,w,h per line
+            OR:
+            {seq_name}/
+                *.jpg
+                groundtruth_rect.txt
+    4,500 sequences, 3.3M frames, 12 superclasses, 223 target classes.
+    Average video length: 710 frames (23.7 seconds at 30 FPS).
+    This is the largest UAV tracking dataset. All sequences are from real
+    drone footage. Purpose-built for training deep UAV trackers.
+    The train/test partition is derived here, not read from disk: sequences
+    are ordered by ground-truth file path and the first 80% become 'train',
+    the remainder is returned for every other split value.
+    Args:
+        root: dataset root; searched recursively for groundtruth_rect.txt
+            (or groundtruth.txt when no groundtruth_rect.txt exists anywhere).
+        split: 'train' selects the first 80% of sequences; anything else
+            selects the remaining 20%.
+        max_sequences: optional cap on the number of sequences kept from the
+            chosen split. None, 0 or a negative value means no cap.
+        **kwargs: forwarded unchanged to SequenceDataset.__init__.
+    """
+    def __init__(self, root: str, split: str = 'train', max_sequences: int = None, **kwargs):
+        super().__init__(**kwargs)
+        self.root = Path(root)
+        self._load_sequences(split, max_sequences)
+    @staticmethod
+    def _parse_boxes(gt_file):
+        """Read one ground-truth file into a list with one entry per line.
+        Each entry is [x, y, w, h] as floats, or None when the line is blank,
+        has fewer than four fields, or contains a non-numeric field. Fields
+        may be separated by commas, tabs and/or spaces.
+        """
+        boxes = []
+        with open(gt_file, 'r') as f:
+            for line in f:
+                # str.split() already treats tabs as separators.
+                fields = line.replace(',', ' ').split()
+                if len(fields) < 4:
+                    # Blank or truncated line: never emit a short box.
+                    boxes.append(None)
+                    continue
+                try:
+                    boxes.append([float(v) for v in fields[:4]])
+                except ValueError:
+                    boxes.append(None)
+        return boxes
+    def _load_sequences(self, split, max_sequences):
+        """Populate self.sequences with {'frames': [...], 'gt': [...]} dicts.
+        self.sequences is presumably created by SequenceDataset.__init__
+        (not visible from this class) — confirm against the base class.
+        """
+        if not self.root.exists():
+            print(f"Warning: WebUAV-3M not found at {self.root}")
+            return
+        # Locate sequences through their ground-truth file, at any depth.
+        gt_files = (sorted(self.root.rglob('groundtruth_rect.txt'))
+                    or sorted(self.root.rglob('groundtruth.txt')))
+        all_seq_dirs = []
+        for gt_file in gt_files:
+            seq_dir = gt_file.parent
+            # Frames live in img/ when present, otherwise next to the GT file.
+            img_dir = seq_dir / 'img'
+            if not img_dir.exists():
+                img_dir = seq_dir
+            frames = (sorted(glob.glob(str(img_dir / '*.jpg')))
+                      or sorted(glob.glob(str(img_dir / '*.png'))))
+            if len(frames) >= 2:
+                all_seq_dirs.append((gt_file, frames))
+        print(f"WebUAV-3M: found {len(all_seq_dirs)} sequences total")
+        # Train/test split (80/20) over the path-sorted list.
+        # NOTE(review): with the {superclass}/{seq_name} layout the held-out
+        # tail consists of the alphabetically last directories, so it may not
+        # cover every superclass — confirm this is acceptable for evaluation.
+        split_idx = int(len(all_seq_dirs) * 0.8)
+        if split == 'train':
+            selected = all_seq_dirs[:split_idx]
+        else:
+            selected = all_seq_dirs[split_idx:]
+        # Optionally limit sequences (WebUAV-3M is huge).
+        if max_sequences is not None and 0 < max_sequences < len(selected):
+            # Take evenly spaced indices across the whole list. A floor-divided
+            # stride would collapse to "the first N" whenever
+            # len(selected) < 2 * max_sequences and lose the intended diversity.
+            total = len(selected)
+            selected = [selected[i * total // max_sequences]
+                        for i in range(max_sequences)]
+        # Annotations are parsed only for the sequences that survived selection.
+        for gt_file, frames in selected:
+            gt_boxes = self._parse_boxes(gt_file)
+            # Frame and annotation counts can disagree; keep the common prefix.
+            usable = min(len(frames), len(gt_boxes))
+            frames = frames[:usable]
+            gt_boxes = gt_boxes[:usable]
+            # A template/search pair needs at least two frames.
+            if usable >= 2:
+                self.sequences.append({'frames': frames, 'gt': gt_boxes})
+        print(f"  Loaded {len(self.sequences)} WebUAV-3M {split} sequences")
 # ============================================================
 # Convenience: build combined dataset
 # ============================================================
 ) -> Dataset:
     """Build a combined tracking dataset from multiple sources.
+    Standard ground-level datasets provide general tracking capability.
+    UAV-specific datasets provide drone-perspective specialization.
+    The ACL curriculum bridges the gap: it starts training on easy pairs
+    from ground-level data, then progressively incorporates harder pairs
+    including UAV sequences with fast motion, small targets, and viewpoint changes.
     Args:
         data_config: dict with optional keys:
+            Ground-level (standard tracking training data):
             - 'got10k_root': path to GOT-10k dataset
             - 'lasot_root': path to LaSOT dataset
             - 'trackingnet_root': path to TrackingNet dataset
             - 'coco_root': path to COCO train2017 images
+            UAV-specific (drone perspective — the deployment domain):
+            - 'visdrone_root': path to VisDrone-SOT dataset
+            - 'uavdt_root': path to UAVDT dataset
+            - 'webuav3m_root': path to WebUAV-3M dataset
+            - 'webuav3m_max_sequences': limit WebUAV-3M sequences (default: None = all)
+            Fallback:
             - 'synthetic_length': number of synthetic samples (fallback)
         template_size: template crop size
         search_size: search region crop size
             datasets.append(ds)
             print(f"COCO: {len(ds)} pseudo-sequences")
+    # --- UAV-specific datasets (drone perspective) ---
+    if 'visdrone_root' in data_config and os.path.exists(data_config['visdrone_root']):
+        ds = VisDroneSOTDataset(data_config['visdrone_root'], split='train', **common_kwargs)
+        if len(ds) > 0:
+            datasets.append(ds)
+            print(f"VisDrone-SOT: {len(ds)} UAV sequences")
+    if 'uavdt_root' in data_config and os.path.exists(data_config['uavdt_root']):
+        ds = UAVDTDataset(data_config['uavdt_root'], split='train', **common_kwargs)
+        if len(ds) > 0:
+            datasets.append(ds)
+            print(f"UAVDT: {len(ds)} UAV sequences")
+    if 'webuav3m_root' in data_config and os.path.exists(data_config['webuav3m_root']):
+        max_seq = data_config.get('webuav3m_max_sequences', None)
+        ds = WebUAV3MDataset(data_config['webuav3m_root'], split='train',
+                             max_sequences=max_seq, **common_kwargs)
+        if len(ds) > 0:
+            datasets.append(ds)
+            print(f"WebUAV-3M: {len(ds)} UAV sequences")
     if datasets:
         combined = ConcatDataset(datasets)
         print(f"\nTotal training samples: {len(combined)}")