cledouxluma
/

facedet

Model card Files Files and versions

xet

Community

cledouxluma commited on 15 days ago

Commit

6953619

verified ·

1 Parent(s): afaa2cf

Upload models/detector.py with huggingface_hub

Browse files

Files changed (1) hide show

models/detector.py +419 -0

models/detector.py ADDED Viewed

	@@ -0,0 +1,419 @@

+"""
+SCRFD Full Detector — Backbone + Neck + Head + Loss + Post-processing.
+This is the main model class that ties together all components and provides:
+1. Training forward: returns losses dict
+2. Inference forward: returns detections (boxes, scores, landmarks)
+3. ONNX-exportable inference path
+Model configurations (WiderFace Hard val / GFLOPs / FPS @VGA on V100):
+- SCRFD-34GF:  85.2% / 34 GF / ~80 FPS   (flagship quality)
+- SCRFD-10GF:  83.1% / 10 GF / ~140 FPS  (balanced)
+- SCRFD-2.5GF: 77.9% / 2.5 GF / ~400 FPS (real-time)
+- SCRFD-0.5GF: 68.5% / 0.5 GF / ~1000 FPS (mobile/edge)
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import List, Tuple, Dict, Optional
+import math
+from .backbone import SCRFDBackbone, build_backbone
+from .neck import PAFPN, build_neck
+from .head import SCRFDHead, build_head
+from .anchor import AnchorGenerator, ATSSAssigner
+from .losses import GFocalLoss, DIoULoss, FocalLoss, LandmarkLoss
+class SCRFD(nn.Module):
+    """
+    Sample and Computation Redistribution Face Detector.
+    Complete pipeline: backbone → PAFPN → shared head → anchors → losses/NMS
+    """
+    def __init__(self,
+                 backbone: SCRFDBackbone,
+                 neck: PAFPN,
+                 head: SCRFDHead,
+                 anchor_generator: AnchorGenerator,
+                 assigner: ATSSAssigner,
+                 strides: List[int] = [8, 16, 32],
+                 score_threshold: float = 0.3,
+                 nms_threshold: float = 0.4,
+                 max_detections: int = 750,
+                 use_gfl: bool = True,
+                 cls_weight: float = 1.0,
+                 reg_weight: float = 2.0,
+                 lmk_weight: float = 0.1):
+        super().__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.head = head
+        self.anchor_gen = anchor_generator
+        self.assigner = assigner
+        self.strides = strides
+        self.score_threshold = score_threshold
+        self.nms_threshold = nms_threshold
+        self.max_detections = max_detections
+        self.use_gfl = use_gfl
+        # Loss functions
+        self.cls_loss_fn = GFocalLoss(beta=2.0) if use_gfl else FocalLoss()
+        self.reg_loss_fn = DIoULoss()
+        self.lmk_loss_fn = LandmarkLoss() if head.use_landmarks else None
+        # Loss weights
+        self.cls_weight = cls_weight
+        self.reg_weight = reg_weight
+        self.lmk_weight = lmk_weight
+    def forward(self, images: torch.Tensor,
+                targets: Optional[List[Dict]] = None) -> Dict:
+        """
+        Args:
+            images: [B, 3, H, W] batch of images (normalized)
+            targets: List of dicts with keys:
+                'boxes': [M, 4] face boxes (x1, y1, x2, y2)
+                'labels': [M] labels (all 1)
+                'landmarks': [M, 10] optional landmarks
+                When None, runs inference.
+        Returns:
+            Training: dict of losses
+            Inference: list of dicts with 'boxes', 'scores', 'landmarks'
+        """
+        # Feature extraction
+        features = self.backbone(images)
+        features = self.neck(features)
+        head_out = self.head(features)
+        # Generate anchors
+        feat_sizes = [(f.shape[2], f.shape[3]) for f in features]
+        anchors_per_level = self.anchor_gen.grid_anchors(feat_sizes, images.device)
+        num_anchors_per_level = [a.shape[0] for a in anchors_per_level]
+        if targets is not None:
+            return self._compute_loss(head_out, anchors_per_level,
+                                      num_anchors_per_level, targets, images.shape)
+        else:
+            return self._inference(head_out, anchors_per_level, images.shape)
+    def _compute_loss(self, head_out: Dict, anchors_per_level: List[torch.Tensor],
+                      num_per_level: List[int], targets: List[Dict],
+                      img_shape: Tuple) -> Dict:
+        """Compute training losses."""
+        device = anchors_per_level[0].device
+        batch_size = len(targets)
+        # Flatten predictions across levels
+        all_cls = []
+        all_reg = []
+        all_lmk = []
+        for i in range(len(self.strides)):
+            B, _, H, W = head_out['cls_scores'][i].shape
+            cls = head_out['cls_scores'][i].permute(0, 2, 3, 1).reshape(B, -1, 1)
+            reg = head_out['bbox_preds'][i].permute(0, 2, 3, 1).reshape(B, -1, 4)
+            all_cls.append(cls)
+            all_reg.append(reg)
+            if self.head.use_landmarks and 'lmk_preds' in head_out:
+                lmk = head_out['lmk_preds'][i].permute(0, 2, 3, 1).reshape(B, -1, 10)
+                all_lmk.append(lmk)
+        all_cls = torch.cat(all_cls, dim=1)  # [B, N, 1]
+        all_reg = torch.cat(all_reg, dim=1)  # [B, N, 4]
+        all_anchors = torch.cat(anchors_per_level, dim=0)  # [N, 4]
+        has_lmk = len(all_lmk) > 0
+        if has_lmk:
+            all_lmk = torch.cat(all_lmk, dim=1)
+        total_cls_loss = torch.tensor(0.0, device=device)
+        total_reg_loss = torch.tensor(0.0, device=device)
+        total_lmk_loss = torch.tensor(0.0, device=device)
+        num_pos = 0
+        for b in range(batch_size):
+            gt_boxes = targets[b]['boxes']
+            gt_labels = targets[b].get('labels',
+                                       torch.ones(gt_boxes.shape[0], dtype=torch.long, device=device))
+            # ATSS matching
+            assigned_labels, assigned_gt_inds = self.assigner.assign(
+                all_anchors, gt_boxes, gt_labels, num_per_level
+            )
+            pos_mask = assigned_labels > 0
+            num_pos += pos_mask.sum().item()
+            # Classification loss (all anchors)
+            if self.use_gfl:
+                # GFL: positive target = IoU, negative target = 0
+                cls_targets = torch.zeros(all_anchors.shape[0], device=device)
+                if pos_mask.any():
+                    pos_anchors = all_anchors[pos_mask]
+                    pos_gt = gt_boxes[assigned_gt_inds[pos_mask]]
+                    pos_ious = self._compute_iou_single(pos_anchors, pos_gt)
+                    cls_targets[pos_mask] = pos_ious
+                total_cls_loss += self.cls_loss_fn(
+                    all_cls[b].squeeze(-1), cls_targets
+                )
+            else:
+                total_cls_loss += self.cls_loss_fn(
+                    all_cls[b].squeeze(-1), (assigned_labels > 0).float()
+                )
+            # Box regression loss (positive anchors only)
+            if pos_mask.any():
+                pos_reg = all_reg[b][pos_mask]
+                pos_anchors = all_anchors[pos_mask]
+                pos_gt = gt_boxes[assigned_gt_inds[pos_mask]]
+                # Decode predictions to absolute boxes
+                pred_boxes = self._decode_boxes(pos_anchors, pos_reg)
+                total_reg_loss += self.reg_loss_fn(pred_boxes, pos_gt)
+                # Landmark loss
+                if self.head.use_landmarks and 'landmarks' in targets[b] and has_lmk:
+                    gt_lmk = targets[b]['landmarks']
+                    pos_lmk_pred = all_lmk[b][pos_mask]
+                    pos_lmk_gt = gt_lmk[assigned_gt_inds[pos_mask]]
+                    # Decode landmarks relative to anchors
+                    pred_lmk = self._decode_landmarks(pos_anchors, pos_lmk_pred)
+                    total_lmk_loss += self.lmk_loss_fn(pred_lmk, pos_lmk_gt)
+        num_pos = max(num_pos, 1)
+        losses = {
+            'cls_loss': self.cls_weight * total_cls_loss / batch_size,
+            'reg_loss': self.reg_weight * total_reg_loss / batch_size,
+        }
+        if self.head.use_landmarks:
+            losses['lmk_loss'] = self.lmk_weight * total_lmk_loss / batch_size
+        losses['total_loss'] = sum(losses.values())
+        losses['num_pos'] = torch.tensor(num_pos, dtype=torch.float, device=device)
+        return losses
+    def _inference(self, head_out: Dict, anchors_per_level: List[torch.Tensor],
+                   img_shape: Tuple) -> List[Dict]:
+        """Run inference with NMS."""
+        batch_size = head_out['cls_scores'][0].shape[0]
+        device = head_out['cls_scores'][0].device
+        results = []
+        for b in range(batch_size):
+            all_boxes = []
+            all_scores = []
+            all_lmk = []
+            for i in range(len(self.strides)):
+                cls = head_out['cls_scores'][i][b].permute(1, 2, 0).reshape(-1, 1).sigmoid()
+                reg = head_out['bbox_preds'][i][b].permute(1, 2, 0).reshape(-1, 4)
+                anchors = anchors_per_level[i]
+                # Filter by score threshold
+                scores = cls.squeeze(-1)
+                keep = scores > self.score_threshold
+                if keep.sum() == 0:
+                    continue
+                scores = scores[keep]
+                reg = reg[keep]
+                anc = anchors[keep]
+                # Decode boxes
+                boxes = self._decode_boxes(anc, reg)
+                # Clamp to image boundaries
+                boxes[:, 0].clamp_(min=0)
+                boxes[:, 1].clamp_(min=0)
+                boxes[:, 2].clamp_(max=img_shape[3])
+                boxes[:, 3].clamp_(max=img_shape[2])
+                all_boxes.append(boxes)
+                all_scores.append(scores)
+                if self.head.use_landmarks and 'lmk_preds' in head_out:
+                    lmk = head_out['lmk_preds'][i][b].permute(1, 2, 0).reshape(-1, 10)[keep]
+                    lmk_decoded = self._decode_landmarks(anc, lmk)
+                    all_lmk.append(lmk_decoded)
+            if not all_boxes:
+                results.append({
+                    'boxes': torch.empty(0, 4, device=device),
+                    'scores': torch.empty(0, device=device),
+                })
+                continue
+            all_boxes = torch.cat(all_boxes, dim=0)
+            all_scores = torch.cat(all_scores, dim=0)
+            # NMS
+            keep = self._nms(all_boxes, all_scores, self.nms_threshold)
+            keep = keep[:self.max_detections]
+            result = {
+                'boxes': all_boxes[keep],
+                'scores': all_scores[keep],
+            }
+            if all_lmk:
+                all_lmk = torch.cat(all_lmk, dim=0)
+                result['landmarks'] = all_lmk[keep]
+            results.append(result)
+        return results
+    def _decode_boxes(self, anchors: torch.Tensor, pred: torch.Tensor) -> torch.Tensor:
+        """Decode box predictions relative to anchors (distance-based)."""
+        anchor_cx = (anchors[:, 0] + anchors[:, 2]) / 2
+        anchor_cy = (anchors[:, 1] + anchors[:, 3]) / 2
+        anchor_w = anchors[:, 2] - anchors[:, 0]
+        anchor_h = anchors[:, 3] - anchors[:, 1]
+        x1 = anchor_cx - pred[:, 0] * anchor_w
+        y1 = anchor_cy - pred[:, 1] * anchor_h
+        x2 = anchor_cx + pred[:, 2] * anchor_w
+        y2 = anchor_cy + pred[:, 3] * anchor_h
+        return torch.stack([x1, y1, x2, y2], dim=1)
+    def _decode_landmarks(self, anchors: torch.Tensor, pred: torch.Tensor) -> torch.Tensor:
+        """Decode landmark predictions relative to anchors."""
+        anchor_cx = (anchors[:, 0] + anchors[:, 2]) / 2
+        anchor_cy = (anchors[:, 1] + anchors[:, 3]) / 2
+        anchor_w = anchors[:, 2] - anchors[:, 0]
+        anchor_h = anchors[:, 3] - anchors[:, 1]
+        decoded = pred.clone()
+        for i in range(5):
+            decoded[:, i*2] = anchor_cx + pred[:, i*2] * anchor_w
+            decoded[:, i*2+1] = anchor_cy + pred[:, i*2+1] * anchor_h
+        return decoded
+    @staticmethod
+    def _compute_iou_single(boxes1: torch.Tensor, boxes2: torch.Tensor) -> torch.Tensor:
+        """Compute elementwise IoU between paired boxes. [N,4] × [N,4] → [N]"""
+        inter_x1 = torch.max(boxes1[:, 0], boxes2[:, 0])
+        inter_y1 = torch.max(boxes1[:, 1], boxes2[:, 1])
+        inter_x2 = torch.min(boxes1[:, 2], boxes2[:, 2])
+        inter_y2 = torch.min(boxes1[:, 3], boxes2[:, 3])
+        inter = (inter_x2 - inter_x1).clamp(min=0) * (inter_y2 - inter_y1).clamp(min=0)
+        area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
+        area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
+        union = area1 + area2 - inter
+        return inter / (union + 1e-6)
+    @staticmethod
+    def _nms(boxes: torch.Tensor, scores: torch.Tensor,
+             threshold: float) -> torch.Tensor:
+        """Non-Maximum Suppression. Returns kept indices."""
+        if boxes.shape[0] == 0:
+            return torch.empty(0, dtype=torch.long, device=boxes.device)
+        # Use torchvision NMS if available, else pure PyTorch
+        try:
+            from torchvision.ops import nms
+            return nms(boxes, scores, threshold)
+        except ImportError:
+            pass
+        # Pure PyTorch NMS fallback
+        x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
+        areas = (x2 - x1) * (y2 - y1)
+        order = scores.argsort(descending=True)
+        keep = []
+        while order.numel() > 0:
+            i = order[0].item()
+            keep.append(i)
+            if order.numel() == 1:
+                break
+            xx1 = torch.max(x1[i], x1[order[1:]])
+            yy1 = torch.max(y1[i], y1[order[1:]])
+            xx2 = torch.min(x2[i], x2[order[1:]])
+            yy2 = torch.min(y2[i], y2[order[1:]])
+            inter = (xx2 - xx1).clamp(min=0) * (yy2 - yy1).clamp(min=0)
+            iou = inter / (areas[i] + areas[order[1:]] - inter + 1e-6)
+            mask = iou <= threshold
+            order = order[1:][mask]
+        return torch.tensor(keep, dtype=torch.long, device=boxes.device)
+# ──────────────────────── Model Builder ────────────────────────
+MODEL_CONFIGS = {
+    'scrfd_34g': {
+        'backbone': 'scrfd_34g',
+        'neck_out': 64,
+        'head_feat': 64,
+        'head_convs': 3,
+    },
+    'scrfd_10g': {
+        'backbone': 'scrfd_10g',
+        'neck_out': 56,
+        'head_feat': 56,
+        'head_convs': 2,
+    },
+    'scrfd_2.5g': {
+        'backbone': 'scrfd_2.5g',
+        'neck_out': 40,
+        'head_feat': 40,
+        'head_convs': 2,
+    },
+    'scrfd_0.5g': {
+        'backbone': 'scrfd_0.5g',
+        'neck_out': 16,
+        'head_feat': 16,
+        'head_convs': 2,
+    },
+}
+def build_detector(name: str, use_landmarks: bool = False,
+                   score_threshold: float = 0.3,
+                   nms_threshold: float = 0.4,
+                   **kwargs) -> SCRFD:
+    """
+    Build a complete SCRFD detector by name.
+    Args:
+        name: Model name ('scrfd_34g', 'scrfd_10g', 'scrfd_2.5g', 'scrfd_0.5g')
+        use_landmarks: Enable 5-point landmark prediction
+        score_threshold: Detection confidence threshold
+        nms_threshold: NMS IoU threshold
+    Returns:
+        Complete SCRFD detector ready for training or inference
+    """
+    if name not in MODEL_CONFIGS:
+        raise ValueError(f"Unknown model: {name}. Options: {list(MODEL_CONFIGS.keys())}")
+    cfg = MODEL_CONFIGS[name]
+    backbone = build_backbone(cfg['backbone'])
+    neck = PAFPN(backbone.out_channels, out_channels=cfg['neck_out'])
+    head = SCRFDHead(
+        in_channels=cfg['neck_out'],
+        feat_channels=cfg['head_feat'],
+        stacked_convs=cfg['head_convs'],
+        use_landmarks=use_landmarks,
+    )
+    anchor_gen = AnchorGenerator()
+    assigner = ATSSAssigner(topk=9)
+    model = SCRFD(
+        backbone=backbone,
+        neck=neck,
+        head=head,
+        anchor_generator=anchor_gen,
+        assigner=assigner,
+        score_threshold=score_threshold,
+        nms_threshold=nms_threshold,
+        **kwargs,
+    )
+    return model