"""
Augmentation Pipeline for Face Detection.
Implements SCRFD's "Sample Redistribution" strategy plus production-grade
robustness augmentations for:
- Tiny faces (large-scale crops generate small face positives)
- Blur (Gaussian, motion blur)
- Compression artifacts (JPEG quality degradation)
- Low-light / poor illumination (brightness/gamma jitter)
- Occlusion (random erasing simulating partial occlusion)
Training augmentation pipeline (from SCRFD + TinaFace papers):
1. Random crop with scale [0.3, 2.0] (Sample Redistribution)
2. Resize to target size (640×640)
3. Photometric distortion (brightness, contrast, hue, saturation)
4. Horizontal flip (p=0.5)
5. Random blur / compression / lighting degradation
6. Normalize (ImageNet stats)
"""
import numpy as np
import cv2
from typing import Dict, Tuple, Optional
class TrainAugmentation:
    """
    Full training augmentation with SCRFD Sample Redistribution.

    Pipeline: random crop -> resize -> photometric distortion ->
    horizontal flip -> robustness degradations -> mean subtraction.

    The key insight: using crop scales up to 2.0x generates more
    small-face positive anchors at stride 8 (72K -> 118K per paper).

    Args:
        target_size: output square side length in pixels.
        crop_scales: candidate crop scales relative to min(h, w).
        mean: per-channel mean subtracted at the end. The default
            (104, 117, 123) is the Caffe/BGR ImageNet mean.
        flip_prob: probability of a horizontal flip.
        enable_robustness: apply RobustnessAugmentation degradations.
    """
    def __init__(self,
                 target_size: int = 640,
                 crop_scales: Optional[list] = None,
                 mean: tuple = (104.0, 117.0, 123.0),
                 flip_prob: float = 0.5,
                 enable_robustness: bool = True):
        self.target_size = target_size
        self.crop_scales = crop_scales or [0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]
        self.mean = np.array(mean, dtype=np.float32)
        self.flip_prob = flip_prob
        self.enable_robustness = enable_robustness
        self.robustness_aug = RobustnessAugmentation() if enable_robustness else None

    def __call__(self, image: np.ndarray, boxes: np.ndarray,
                 landmarks: np.ndarray) -> Dict:
        """Apply the full training pipeline to one sample.

        Args:
            image: HxWx3 image with values in [0, 255].
            boxes: (N, 4) face boxes [x1, y1, x2, y2] in pixels.
            landmarks: (N, 10) five (x, y) points per face; negative
                coordinates mark un-annotated points (presumably the
                WIDER FACE -1 convention — verify against the loader).

        Returns:
            Dict with 'image' (float32, mean-subtracted, square
            target_size), plus 'boxes' and 'landmarks' in output
            coordinates.
        """
        # 1. Random crop with Sample Redistribution
        image, boxes, landmarks = self._random_crop(image, boxes, landmarks)
        # 2. Resize to target
        image, boxes, landmarks = self._resize(image, boxes, landmarks)
        # 3. Photometric distortion
        image = self._photometric_distort(image)
        # 4. Horizontal flip
        if np.random.random() < self.flip_prob:
            image, boxes, landmarks = self._hflip(image, boxes, landmarks)
        # 5. Robustness augmentations (blur, compression, lighting)
        if self.enable_robustness and self.robustness_aug:
            image = self.robustness_aug(image)
        # 6. Mean subtraction (SCRFD-style normalization)
        image = image.astype(np.float32) - self.mean
        return {'image': image, 'boxes': boxes, 'landmarks': landmarks}

    def _random_crop(self, image: np.ndarray, boxes: np.ndarray,
                     landmarks: np.ndarray) -> Tuple:
        """Random square crop using the sample-redistribution scales.

        Returns (image, boxes, landmarks) in crop coordinates. Falls
        back to the (possibly padded) full image with the original
        annotations when no face stays sufficiently visible.
        """
        h, w = image.shape[:2]
        scale = np.random.choice(self.crop_scales)
        crop_size = max(int(min(h, w) * scale), 32)
        # Pad bottom/right whenever the crop exceeds EITHER dimension.
        # (The previous check against max(h, w) under-padded the short
        # side of non-square images, producing crops smaller than
        # crop_size while boxes were still clipped to crop_size.)
        if crop_size > h or crop_size > w:
            pad_h = max(crop_size - h, 0)
            pad_w = max(crop_size - w, 0)
            image = cv2.copyMakeBorder(image, 0, pad_h, 0, pad_w,
                                       cv2.BORDER_CONSTANT, value=(0, 0, 0))
            h, w = image.shape[:2]
        # Random top-left corner; upper bound is inclusive so the crop
        # can reach the right/bottom edge (randint's high is exclusive).
        x1 = np.random.randint(0, w - crop_size + 1)
        y1 = np.random.randint(0, h - crop_size + 1)
        cropped = image[y1:y1 + crop_size, x1:x1 + crop_size]
        # Shift boxes into crop coordinates and clip to the crop.
        new_boxes = boxes.copy()
        new_boxes[:, [0, 2]] -= x1
        new_boxes[:, [1, 3]] -= y1
        new_boxes[:, :4] = np.clip(new_boxes[:, :4], 0, crop_size)
        # Keep boxes with at least 20% of their original area visible.
        orig_areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
        vis_w = new_boxes[:, 2] - new_boxes[:, 0]
        vis_h = new_boxes[:, 3] - new_boxes[:, 1]
        valid = (vis_w > 2) & (vis_h > 2) & (vis_w * vis_h > 0.2 * orig_areas)
        if not valid.any():
            # Fallback: return the (possibly padded) image with the
            # ORIGINAL annotations so coordinates remain consistent.
            # (Previously a square slice was returned with un-shifted
            # boxes, desynchronizing image and annotations.)
            return image, boxes, landmarks
        new_boxes = new_boxes[valid]
        new_lmk = landmarks[valid].copy()
        # Shift only annotated landmarks; negative coordinates mark
        # "no landmark" and must survive unchanged — TODO confirm the
        # -1 convention against the dataset loader.
        annotated = (new_lmk >= 0).all(axis=1)
        for i in range(5):
            new_lmk[annotated, i * 2] -= x1
            new_lmk[annotated, i * 2 + 1] -= y1
        return cropped, new_boxes, new_lmk

    def _resize(self, image: np.ndarray, boxes: np.ndarray,
                landmarks: np.ndarray) -> Tuple:
        """Resize to target_size x target_size, scaling annotations in place."""
        h, w = image.shape[:2]
        scale_x = self.target_size / w
        scale_y = self.target_size / h
        image = cv2.resize(image, (self.target_size, self.target_size))
        boxes[:, [0, 2]] *= scale_x
        boxes[:, [1, 3]] *= scale_y
        # Scale only annotated landmarks so negative "missing" markers
        # keep their exact sentinel value.
        annotated = (landmarks >= 0).all(axis=1)
        for i in range(5):
            landmarks[annotated, i * 2] *= scale_x
            landmarks[annotated, i * 2 + 1] *= scale_y
        return image, boxes, landmarks

    def _photometric_distort(self, image: np.ndarray) -> np.ndarray:
        """Random brightness/contrast shift plus HSV hue/saturation jitter.

        Returns a float32 image clipped to [0, 255].
        """
        image = image.astype(np.float32)
        # Brightness: additive delta in [-32, 32]
        if np.random.random() < 0.5:
            image += np.random.uniform(-32, 32)
        # Contrast: multiplicative alpha in [0.5, 1.5]
        if np.random.random() < 0.5:
            image *= np.random.uniform(0.5, 1.5)
        # Color jitter in HSV
        if np.random.random() < 0.5:
            # NOTE(review): the conversion assumes RGB input while the
            # default mean (104, 117, 123) is BGR-ordered — confirm the
            # loader's channel order.
            image_uint8 = np.clip(image, 0, 255).astype(np.uint8)
            hsv = cv2.cvtColor(image_uint8, cv2.COLOR_RGB2HSV).astype(np.float32)
            # Hue is cyclic with 8-bit range [0, 180): wrap instead of
            # clipping (clipping biased the jitter toward the 0/180 ends).
            hsv[:, :, 0] = (hsv[:, :, 0] + np.random.uniform(-18, 18)) % 180
            # Saturation
            hsv[:, :, 1] = np.clip(hsv[:, :, 1] * np.random.uniform(0.5, 1.5), 0, 255)
            image = cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2RGB).astype(np.float32)
        return np.clip(image, 0, 255)

    def _hflip(self, image: np.ndarray, boxes: np.ndarray,
               landmarks: np.ndarray) -> Tuple:
        """Horizontal flip with left/right landmark reordering.

        Standard 5-point order: left_eye, right_eye, nose, left_mouth,
        right_mouth. After mirroring, left and right points are swapped.
        Rows with any negative (un-annotated) coordinate are left intact.
        """
        w = image.shape[1]
        image = image[:, ::-1].copy()
        new_boxes = boxes.copy()
        new_boxes[:, 0] = w - boxes[:, 2]
        new_boxes[:, 2] = w - boxes[:, 0]
        new_lmk = landmarks.copy()
        if new_lmk.shape[0] > 0:
            # Only mirror fully-annotated rows; mirroring a -1 sentinel
            # would turn it into w + 1 and corrupt the marker.
            annotated = (landmarks >= 0).all(axis=1)
            flipped = landmarks[annotated].copy()
            for i in range(5):
                flipped[:, i * 2] = w - landmarks[annotated, i * 2]
            # Swap left_eye <-> right_eye
            flipped[:, [0, 1, 2, 3]] = flipped[:, [2, 3, 0, 1]]
            # Swap left_mouth <-> right_mouth
            flipped[:, [6, 7, 8, 9]] = flipped[:, [8, 9, 6, 7]]
            new_lmk[annotated] = flipped
        return image, new_boxes, new_lmk
class ValAugmentation:
    """Validation-time transform.

    Aspect-preserving resize so the longer side matches target_size,
    zero padding on the bottom/right to a square, then mean subtraction.
    Boxes and landmarks are scaled in place by the same uniform factor.
    """
    def __init__(self, target_size: int = 640,
                 mean: tuple = (104.0, 117.0, 123.0)):
        self.target_size = target_size
        self.mean = np.array(mean, dtype=np.float32)

    def __call__(self, image: np.ndarray, boxes: np.ndarray,
                 landmarks: np.ndarray) -> Dict:
        src_h, src_w = image.shape[:2]
        # Uniform scale factor from the longer side.
        ratio = self.target_size / max(src_h, src_w)
        out_h, out_w = int(src_h * ratio), int(src_w * ratio)
        resized = cv2.resize(image, (out_w, out_h))
        # Zero-pad bottom and right up to the square target.
        resized = cv2.copyMakeBorder(resized,
                                     0, self.target_size - out_h,
                                     0, self.target_size - out_w,
                                     cv2.BORDER_CONSTANT, value=(0, 0, 0))
        # Annotations scale uniformly, in place, matching the resize.
        boxes[:, :4] *= ratio
        landmarks[:, :10] *= ratio
        normalized = resized.astype(np.float32) - self.mean
        return {'image': normalized, 'boxes': boxes, 'landmarks': landmarks}
class RobustnessAugmentation:
    """
    Production-grade robustness augmentations targeting known failure modes.

    Each degradation fires independently with its own probability:
    1. Gaussian blur (sigma = 0.5-3.0) — camera defocus, motion blur
    2. JPEG compression (Q = 20-79) — streaming/compression artifacts
    3. Low-light gamma (gamma = 1.5-3.0) — dark environments
    4. Random occlusion (Cutout) — partial face occlusion
    5. Gaussian noise (sigma = 5-25) — sensor noise, low-light grain

    Input: HxWx3 image with values in [0, 255]; output is float32 in
    the same range.
    """
    def __init__(self,
                 blur_prob: float = 0.2,
                 jpeg_prob: float = 0.2,
                 lowlight_prob: float = 0.15,
                 occlusion_prob: float = 0.1,
                 noise_prob: float = 0.15):
        self.blur_prob = blur_prob
        self.jpeg_prob = jpeg_prob
        self.lowlight_prob = lowlight_prob
        self.occlusion_prob = occlusion_prob
        self.noise_prob = noise_prob

    def __call__(self, image: np.ndarray) -> np.ndarray:
        """Apply the enabled degradations in a fixed order and return float32."""
        # Gaussian blur
        if np.random.random() < self.blur_prob:
            sigma = np.random.uniform(0.5, 3.0)
            ksize = int(sigma * 6) | 1  # odd kernel covering ~±3 sigma
            image = cv2.GaussianBlur(image, (ksize, ksize), sigma)
        # JPEG compression artifacts
        if np.random.random() < self.jpeg_prob:
            quality = int(np.random.randint(20, 80))
            encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
            # Clip before the uint8 cast: out-of-range floats would
            # otherwise wrap around and corrupt the image.
            img8 = np.clip(image, 0, 255).astype(np.uint8)
            _, buf = cv2.imencode('.jpg', img8, encode_param)
            image = cv2.imdecode(buf, cv2.IMREAD_COLOR).astype(np.float32)
        # Low-light simulation (gamma darkening; gamma > 1 darkens)
        if np.random.random() < self.lowlight_prob:
            gamma = np.random.uniform(1.5, 3.0)
            image = np.clip(image, 0, 255)
            image = ((image / 255.0) ** gamma * 255.0)
        # Random occlusion (Cutout)
        if np.random.random() < self.occlusion_prob:
            h, w = image.shape[:2]
            # Lower/upper bounds are guarded so tiny images (where
            # h // 10 == h // 4 == 0) can't make randint raise on
            # low >= high.
            rh = np.random.randint(max(h // 10, 1), max(h // 4, 2))
            rw = np.random.randint(max(w // 10, 1), max(w // 4, 2))
            ry = np.random.randint(0, max(h - rh, 1))
            rx = np.random.randint(0, max(w - rw, 1))
            # Fill with a random color in [0, 255] (high is exclusive,
            # so 256 makes 255 reachable).
            image[ry:ry + rh, rx:rx + rw] = np.random.randint(0, 256, 3)
        # Gaussian noise
        if np.random.random() < self.noise_prob:
            sigma = np.random.uniform(5, 25)
            noise = np.random.randn(*image.shape) * sigma
            image = np.clip(image + noise, 0, 255)
        return image.astype(np.float32)