"""
WiderFace Dataset Loader.

WIDER FACE (Yang et al., 2016):
- 32,203 images, 393,703 annotated face bounding boxes
- Split: 40% train (12,880), 10% val (3,226), 50% test (labels not public)
- 3 difficulty levels: Easy, Medium, Hard
- Annotations include: bbox, blur, expression, illumination, occlusion, pose, invalid

Directory structure expected:
    wider_face/
    β”œβ”€β”€ WIDER_train/
    β”‚   └── images/
    β”‚       β”œβ”€β”€ 0--Parade/
    β”‚       β”œβ”€β”€ 1--Handshaking/
    β”‚       └── ...
    β”œβ”€β”€ WIDER_val/
    β”‚   └── images/
    β”‚       └── ...
    β”œβ”€β”€ wider_face_split/
    β”‚   β”œβ”€β”€ wider_face_train_bbx_gt.txt
    β”‚   β”œβ”€β”€ wider_face_val_bbx_gt.txt
    β”‚   └── ...
    └── retinaface_gt/  (optional, for landmarks)
        β”œβ”€β”€ train/
        β”‚   └── label.txt
        └── val/
            └── label.txt
"""

import os
import numpy as np
import cv2
from typing import List, Dict, Optional, Tuple, Callable
import torch
from torch.utils.data import Dataset


class WiderFaceDataset(Dataset):
    """
    WIDER FACE dataset with support for:
    - Standard WiderFace bbox annotations
    - RetinaFace-format 5-point landmark annotations
    - Filtering invalid/tiny faces
    - On-the-fly augmentation
    """

    def __init__(self,
                 root_dir: str,
                 split: str = 'train',
                 transform: Optional[Callable] = None,
                 min_face_size: int = 2,
                 use_landmarks: bool = False,
                 annotation_format: str = 'widerface'):
        """
        Args:
            root_dir: Path to wider_face/ directory
            split: 'train' or 'val'
            transform: Augmentation callable
            min_face_size: Minimum face size to keep (pixels)
            use_landmarks: Load 5-point landmarks (requires retinaface_gt/)
            annotation_format: 'widerface' (standard) or 'retinaface' (with landmarks)
        """
        self.root_dir = root_dir
        self.split = split
        self.transform = transform
        self.min_face_size = min_face_size
        self.use_landmarks = use_landmarks

        if annotation_format == 'retinaface' and use_landmarks:
            self.samples = self._load_retinaface_annotations()
        else:
            self.samples = self._load_widerface_annotations()

        print(f"[WiderFace {split}] Loaded {len(self.samples)} images")

    def _load_widerface_annotations(self) -> List[Dict]:
        """Load standard WiderFace bbox annotations."""
        ann_file = os.path.join(
            self.root_dir, 'wider_face_split',
            f'wider_face_{self.split}_bbx_gt.txt'
        )
        img_dir = os.path.join(self.root_dir, f'WIDER_{self.split}', 'images')

        samples = []
        with open(ann_file, 'r') as f:
            while True:
                filename = f.readline().strip()
                if not filename:
                    break

                num_faces = int(f.readline().strip())
                boxes = []
                for _ in range(max(num_faces, 1)):
                    line = f.readline().strip()
                    if num_faces == 0:
                        continue  # Skip placeholder line for 0-face images
                    parts = list(map(float, line.split()))
                    x, y, w, h = parts[0], parts[1], parts[2], parts[3]
                    # Filter tiny faces and faces whose 'invalid' flag
                    # (8th annotation column) is set
                    if w < self.min_face_size or h < self.min_face_size:
                        continue
                    if len(parts) >= 8 and parts[7] == 1:
                        continue
                    # Convert to x1, y1, x2, y2
                    boxes.append([x, y, x + w, y + h])

                if boxes:
                    samples.append({
                        'image_path': os.path.join(img_dir, filename),
                        'boxes': np.array(boxes, dtype=np.float32),
                        'filename': filename,
                    })

        return samples

    def _load_retinaface_annotations(self) -> List[Dict]:
        """Load RetinaFace-format annotations with 5-point landmarks."""
        ann_file = os.path.join(
            self.root_dir, 'retinaface_gt', self.split, 'label.txt'
        )
        img_dir = os.path.join(self.root_dir, f'WIDER_{self.split}', 'images')

        samples = []
        current_file = None
        current_boxes = []
        current_lmks = []

        with open(ann_file, 'r') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#'):
                    # Save previous image
                    if current_file and current_boxes:
                        samples.append({
                            'image_path': os.path.join(img_dir, current_file),
                            'boxes': np.array(current_boxes, dtype=np.float32),
                            'landmarks': np.array(current_lmks, dtype=np.float32),
                            'filename': current_file,
                        })
                    current_file = line[1:].strip()  # Drop leading '#'
                    current_boxes = []
                    current_lmks = []
                else:
                    parts = list(map(float, line.split()))
                    if len(parts) >= 4:
                        x, y, w, h = parts[0], parts[1], parts[2], parts[3]
                        if w < self.min_face_size or h < self.min_face_size:
                            continue
                        current_boxes.append([x, y, x + w, y + h])
                        if len(parts) >= 19:
                            # label.txt stores 5 landmarks as (x, y, flag)
                            # triplets: keep each x, y and drop the flag
                            lmk = [parts[4 + 3 * i + j]
                                   for i in range(5) for j in range(2)]
                            current_lmks.append(lmk)
                        elif len(parts) >= 14:
                            # Plain layout: (x1,y1, x2,y2, ..., x5,y5)
                            current_lmks.append(parts[4:14])
                        else:
                            current_lmks.append([-1.0] * 10)  # No landmarks

        # Save last image
        if current_file and current_boxes:
            samples.append({
                'image_path': os.path.join(img_dir, current_file),
                'boxes': np.array(current_boxes, dtype=np.float32),
                'landmarks': np.array(current_lmks, dtype=np.float32),
                'filename': current_file,
            })

        return samples

    def __len__(self) -> int:
        return len(self.samples)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, Dict]:
        sample = self.samples[idx]

        # Load image
        img = cv2.imread(sample['image_path'])
        if img is None:
            raise IOError(f"Failed to load image: {sample['image_path']}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        boxes = sample['boxes'].copy()
        landmarks = sample.get('landmarks', np.zeros((boxes.shape[0], 10), dtype=np.float32)).copy()

        # Apply augmentation; the transform is expected to return a dict
        # with 'image', 'boxes', and 'landmarks' keys
        if self.transform:
            result = self.transform(img, boxes, landmarks)
            img, boxes, landmarks = result['image'], result['boxes'], result['landmarks']

        # Convert to tensors
        img_tensor = torch.from_numpy(img.transpose(2, 0, 1)).float()
        boxes_tensor = torch.from_numpy(boxes).float()

        target = {
            'boxes': boxes_tensor,
            # Single foreground class: every box is a face (label 1)
            'labels': torch.ones(boxes_tensor.shape[0], dtype=torch.long),
        }
        if self.use_landmarks:
            target['landmarks'] = torch.from_numpy(landmarks).float()

        return img_tensor, target

    @staticmethod
    def collate_fn(batch):
        """Custom collate for variable-length targets."""
        images = torch.stack([item[0] for item in batch])
        targets = [item[1] for item in batch]
        return images, targets
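

if __name__ == '__main__':
    # Minimal smoke test: a sketch assuming the dataset is unpacked at
    # ./wider_face/ as in the module docstring. collate_fn stacks images,
    # so a bare-bones 640x640 resize transform is defined for illustration.
    from torch.utils.data import DataLoader

    def resize_640(img, boxes, landmarks):
        h, w = img.shape[:2]
        img = cv2.resize(img, (640, 640))
        scale = np.array([640.0 / w, 640.0 / h], dtype=np.float32)
        boxes = boxes * np.tile(scale, 2)          # scale (x1, y1, x2, y2)
        landmarks = landmarks * np.tile(scale, 5)  # scale five (x, y) pairs
        return {'image': img, 'boxes': boxes, 'landmarks': landmarks}

    dataset = WiderFaceDataset('wider_face', split='val', transform=resize_640)
    loader = DataLoader(dataset, batch_size=4,
                        collate_fn=WiderFaceDataset.collate_fn)
    images, targets = next(iter(loader))
    print(images.shape, [t['boxes'].shape[0] for t in targets])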