File size: 11,313 Bytes
4931970
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
"""
Augmentation Pipeline for Face Detection.

Implements SCRFD's "Sample Redistribution" strategy plus production-grade
robustness augmentations for:
- Tiny faces (large-scale crops generate small face positives)
- Blur (Gaussian, motion blur)
- Compression artifacts (JPEG quality degradation)
- Low-light / poor illumination (brightness/gamma jitter)
- Occlusion (random erasing simulating partial occlusion)

Training augmentation pipeline (from SCRFD + TinaFace papers):
1. Random crop with scale [0.3, 2.0] (Sample Redistribution)
2. Resize to target size (640×640)
3. Photometric distortion (brightness, contrast, hue, saturation)
4. Horizontal flip (p=0.5)
5. Random blur / compression / lighting degradation
6. Normalize (ImageNet stats)
"""

import numpy as np
import cv2
from typing import Dict, Tuple, Optional


class TrainAugmentation:
    """
    Full training augmentation with SCRFD Sample Redistribution.

    The key insight: using crop scales up to 2.0× generates more
    small-face positive anchors at stride 8 (72K → 118K per paper).

    Expected inputs per sample:
        image:     H×W×3 array, pixel range [0, 255].
        boxes:     (N, 4) float array of [x1, y1, x2, y2] in pixels.
        landmarks: (N, 10) float array — 5 (x, y) points per face.
    """

    def __init__(self,
                 target_size: int = 640,
                 crop_scales: list = None,
                 mean: tuple = (104.0, 117.0, 123.0),
                 flip_prob: float = 0.5,
                 enable_robustness: bool = True):
        """
        Args:
            target_size: Output square resolution in pixels.
            crop_scales: Candidate crop scales relative to min(h, w);
                defaults to the SCRFD redistribution set [0.3 .. 2.0].
            mean: Per-channel mean subtracted at the end.
                NOTE(review): (104, 117, 123) is the BGR ImageNet mean,
                but _photometric_distort converts with COLOR_RGB2HSV —
                confirm the data loader's channel order matches both.
            flip_prob: Probability of a horizontal flip.
            enable_robustness: Apply RobustnessAugmentation (blur, JPEG,
                low light, occlusion, noise) after the geometric steps.
        """
        self.target_size = target_size
        self.crop_scales = crop_scales or [0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]
        self.mean = np.array(mean, dtype=np.float32)
        self.flip_prob = flip_prob
        self.enable_robustness = enable_robustness
        self.robustness_aug = RobustnessAugmentation() if enable_robustness else None

    def __call__(self, image: np.ndarray, boxes: np.ndarray,
                 landmarks: np.ndarray) -> Dict:
        """Run the full pipeline; returns {'image', 'boxes', 'landmarks'}."""
        # 1. Random crop with Sample Redistribution
        image, boxes, landmarks = self._random_crop(image, boxes, landmarks)

        # 2. Resize to target
        image, boxes, landmarks = self._resize(image, boxes, landmarks)

        # 3. Photometric distortion
        image = self._photometric_distort(image)

        # 4. Horizontal flip
        if np.random.random() < self.flip_prob:
            image, boxes, landmarks = self._hflip(image, boxes, landmarks)

        # 5. Robustness augmentations (blur, compression, lighting)
        if self.enable_robustness and self.robustness_aug:
            image = self.robustness_aug(image)

        # 6. Mean subtraction (SCRFD-style normalization)
        image = image.astype(np.float32) - self.mean

        return {'image': image, 'boxes': boxes, 'landmarks': landmarks}

    def _random_crop(self, image: np.ndarray, boxes: np.ndarray,
                     landmarks: np.ndarray) -> Tuple:
        """Random square crop with sample-redistribution scales.

        Returns (image, boxes, landmarks) in the crop's coordinate frame.
        If no box survives the crop, falls back to the (possibly padded)
        full image with the original annotations, keeping image and
        targets consistent.
        """
        h, w = image.shape[:2]
        scale = np.random.choice(self.crop_scales)
        crop_size = max(int(min(h, w) * scale), 32)

        # Pad bottom/right so a crop_size×crop_size window fits in BOTH
        # dimensions. Checking each axis (not just max(h, w)) matters:
        # with scale > 1.0 the crop can exceed the short side only.
        if crop_size > h or crop_size > w:
            pad_h = max(crop_size - h, 0)
            pad_w = max(crop_size - w, 0)
            image = cv2.copyMakeBorder(image, 0, pad_h, 0, pad_w,
                                        cv2.BORDER_CONSTANT, value=(0, 0, 0))
            h, w = image.shape[:2]

        # Random crop location; +1 makes the high bound inclusive so the
        # rightmost/bottommost placement is actually reachable.
        x1 = np.random.randint(0, w - crop_size + 1)
        y1 = np.random.randint(0, h - crop_size + 1)

        # Crop image
        cropped = image[y1:y1 + crop_size, x1:x1 + crop_size]

        # Shift boxes into the crop frame and clip to its borders.
        new_boxes = boxes.copy()
        new_boxes[:, [0, 2]] -= x1
        new_boxes[:, [1, 3]] -= y1
        new_boxes[:, :4] = np.clip(new_boxes[:, :4], 0, crop_size)

        # Keep boxes with >2 px sides and at least 20% of the original
        # area still visible inside the crop.
        orig_areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
        new_widths = new_boxes[:, 2] - new_boxes[:, 0]
        new_heights = new_boxes[:, 3] - new_boxes[:, 1]
        new_areas = new_widths * new_heights
        valid = (new_widths > 2) & (new_heights > 2) & (new_areas > 0.2 * orig_areas)

        if valid.sum() == 0:
            # Fallback: return the full frame with unmodified annotations
            # (padding was bottom/right only, so original coordinates are
            # still valid) instead of a cropped image with stale boxes.
            return image, boxes, landmarks

        new_boxes = new_boxes[valid]

        # Shift landmarks into the crop frame.
        # NOTE(review): if invalid landmarks use a negative sentinel they
        # stay negative after the shift — confirm the dataset convention.
        new_lmk = landmarks[valid].copy()
        new_lmk[:, 0::2] -= x1
        new_lmk[:, 1::2] -= y1

        return cropped, new_boxes, new_lmk

    def _resize(self, image: np.ndarray, boxes: np.ndarray,
                landmarks: np.ndarray) -> Tuple:
        """Resize to target_size×target_size, scaling annotations in place."""
        h, w = image.shape[:2]
        scale_x = self.target_size / w
        scale_y = self.target_size / h

        image = cv2.resize(image, (self.target_size, self.target_size))

        boxes[:, [0, 2]] *= scale_x
        boxes[:, [1, 3]] *= scale_y

        landmarks[:, 0::2] *= scale_x
        landmarks[:, 1::2] *= scale_y

        return image, boxes, landmarks

    def _photometric_distort(self, image: np.ndarray) -> np.ndarray:
        """Random photometric distortion (brightness, contrast, hue, saturation).

        Returns a float32-compatible array clipped to [0, 255].
        """
        image = image.astype(np.float32)

        # Brightness: additive shift
        if np.random.random() < 0.5:
            delta = np.random.uniform(-32, 32)
            image += delta

        # Contrast: multiplicative gain
        if np.random.random() < 0.5:
            alpha = np.random.uniform(0.5, 1.5)
            image *= alpha

        # Color jitter in HSV
        if np.random.random() < 0.5:
            image_uint8 = np.clip(image, 0, 255).astype(np.uint8)
            hsv = cv2.cvtColor(image_uint8, cv2.COLOR_RGB2HSV).astype(np.float32)

            # Hue is circular with OpenCV 8-bit range [0, 180); wrap with a
            # modulo instead of clipping so reds near 0 shift to ~179
            # rather than piling up at the boundary.
            hsv[:, :, 0] = (hsv[:, :, 0] + np.random.uniform(-18, 18)) % 180

            # Saturation
            hsv[:, :, 1] *= np.random.uniform(0.5, 1.5)
            hsv[:, :, 1] = np.clip(hsv[:, :, 1], 0, 255)

            image = cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2RGB).astype(np.float32)

        return np.clip(image, 0, 255)

    def _hflip(self, image: np.ndarray, boxes: np.ndarray,
               landmarks: np.ndarray) -> Tuple:
        """Horizontal flip with left/right landmark reordering."""
        w = image.shape[1]
        image = image[:, ::-1].copy()

        # Mirror box x-coordinates (x1/x2 swap roles after the flip).
        new_boxes = boxes.copy()
        new_boxes[:, 0] = w - boxes[:, 2]
        new_boxes[:, 2] = w - boxes[:, 0]

        # Mirror landmark x-coordinates.
        new_lmk = landmarks.copy()
        new_lmk[:, 0::2] = w - landmarks[:, 0::2]

        # Reorder landmarks for face symmetry:
        # Standard 5-point: left_eye, right_eye, nose, left_mouth, right_mouth.
        # After mirroring, left↔right pairs must swap so indices keep meaning.
        if new_lmk.shape[0] > 0 and np.any(new_lmk > 0):
            # Swap left_eye ↔ right_eye
            new_lmk[:, [0, 1, 2, 3]] = new_lmk[:, [2, 3, 0, 1]]
            # Swap left_mouth ↔ right_mouth
            new_lmk[:, [6, 7, 8, 9]] = new_lmk[:, [8, 9, 6, 7]]

        return image, new_boxes, new_lmk


class ValAugmentation:
    """Validation-time preprocessing: aspect-preserving resize, pad, normalize.

    No randomness is involved — the same input always produces the same
    output. Boxes and landmarks are scaled in place by the resize ratio.
    """

    def __init__(self, target_size: int = 640,
                 mean: tuple = (104.0, 117.0, 123.0)):
        self.target_size = target_size
        self.mean = np.array(mean, dtype=np.float32)

    def __call__(self, image: np.ndarray, boxes: np.ndarray,
                 landmarks: np.ndarray) -> Dict:
        """Resize the longer side to target_size, zero-pad to a square,
        scale annotations by the same ratio, then subtract the mean."""
        height, width = image.shape[:2]

        # One uniform ratio so the longer side lands exactly on target_size.
        ratio = self.target_size / max(height, width)
        resized_h = int(height * ratio)
        resized_w = int(width * ratio)
        image = cv2.resize(image, (resized_w, resized_h))

        # Zero-pad on the bottom/right up to a target_size square.
        image = cv2.copyMakeBorder(
            image,
            0, self.target_size - resized_h,
            0, self.target_size - resized_w,
            cv2.BORDER_CONSTANT, value=(0, 0, 0))

        # Both axes share the same ratio, so annotations scale uniformly.
        boxes[:, :4] *= ratio
        landmarks[:, :10] *= ratio

        image = image.astype(np.float32) - self.mean

        return {'image': image, 'boxes': boxes, 'landmarks': landmarks}


class RobustnessAugmentation:
    """
    Production-grade robustness augmentations targeting known failure modes.

    Applied with probability during training to make the detector robust to:
    1. Gaussian blur (σ = 0.5–3.0) — camera defocus, motion blur
    2. JPEG compression (Q = 20–80) — streaming/compression artifacts
    3. Low-light gamma (γ = 1.5–3.0) — dark environments
    4. Random occlusion (Cutout) — partial face occlusion
    5. Gaussian noise — sensor noise, low-light grain

    Input: H×W×3 image with pixel values nominally in [0, 255].
    Output: float32 image in [0, 255] (same shape).
    """

    def __init__(self,
                 blur_prob: float = 0.2,
                 jpeg_prob: float = 0.2,
                 lowlight_prob: float = 0.15,
                 occlusion_prob: float = 0.1,
                 noise_prob: float = 0.15):
        # Each degradation is sampled independently per image.
        self.blur_prob = blur_prob
        self.jpeg_prob = jpeg_prob
        self.lowlight_prob = lowlight_prob
        self.occlusion_prob = occlusion_prob
        self.noise_prob = noise_prob

    def __call__(self, image: np.ndarray) -> np.ndarray:
        """Apply each enabled degradation with its own probability."""
        # Gaussian blur — kernel ~6σ wide, forced odd as cv2 requires.
        if np.random.random() < self.blur_prob:
            sigma = np.random.uniform(0.5, 3.0)
            ksize = int(sigma * 6) | 1  # Ensure odd
            image = cv2.GaussianBlur(image, (ksize, ksize), sigma)

        # JPEG compression artifacts. Clip BEFORE the uint8 cast: upstream
        # photometric jitter can leave values outside [0, 255], and a bare
        # astype(np.uint8) wraps around (e.g. 256 → 0) instead of saturating.
        if np.random.random() < self.jpeg_prob:
            quality = np.random.randint(20, 81)  # inclusive of Q=80 per docstring
            encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
            src = np.clip(image, 0, 255).astype(np.uint8)
            _, buf = cv2.imencode('.jpg', src, encode_param)
            image = cv2.imdecode(buf, cv2.IMREAD_COLOR).astype(np.float32)

        # Low-light simulation (gamma > 1 darkens mid-tones).
        if np.random.random() < self.lowlight_prob:
            gamma = np.random.uniform(1.5, 3.0)
            image = np.clip(image, 0, 255)
            image = ((image / 255.0) ** gamma * 255.0)

        # Random occlusion (Cutout): paint a random rectangle with a random
        # solid color. Bounds are guarded with max() so tiny images cannot
        # produce an empty randint range (low >= high raises).
        if np.random.random() < self.occlusion_prob:
            h, w = image.shape[:2]
            rh = np.random.randint(max(h // 10, 1), max(h // 4, 2))
            rw = np.random.randint(max(w // 10, 1), max(w // 4, 2))
            ry = np.random.randint(0, max(h - rh, 1))
            rx = np.random.randint(0, max(w - rw, 1))
            image[ry:ry + rh, rx:rx + rw] = np.random.randint(0, 255, 3)

        # Additive Gaussian sensor noise.
        if np.random.random() < self.noise_prob:
            sigma = np.random.uniform(5, 25)
            noise = np.random.randn(*image.shape) * sigma
            image = np.clip(image + noise, 0, 255)

        return image.astype(np.float32)