Spaces:

facebook
/

sapiens2-pose

Running on Zero

App Files Community

Rawal Khirodkar committed on 14 days ago

Commit

dbdd74a

1 Parent(s): 8aae515

Switch pose detector from mmdet/RTMDet to DETR (transformers, Apache 2.0)

Browse files

Files changed (4) hide show

app.py +26 -22
assets/rtmdet_m_640-8xb32_coco-person_no_nms.py +0 -20
detector_utils.py +0 -196
requirements.txt +3 -5

app.py CHANGED Viewed

@@ -35,8 +35,7 @@ from sapiens.pose.datasets import UDPHeatmap, parse_pose_metainfo
 from sapiens.pose.evaluators import nms
 from sapiens.pose.models import init_model
-from detector_utils import adapt_mmdet_pipeline
-from mmdet.apis import inference_detector, init_detector
 from pose_render_utils import visualize_keypoints
@@ -72,9 +71,7 @@ POSE_MODELS = {
 }
 DEFAULT_SIZE = "1B"
-DETECTOR_REPO = "facebook/sapiens-pose-bbox-detector"
-DETECTOR_CKPT_FILENAME = "rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth"
-DETECTOR_CONFIG = os.path.join(ASSETS_DIR, "rtmdet_m_640-8xb32_coco-person_no_nms.py")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 BBOX_THR = 0.3
@@ -85,7 +82,7 @@ NMS_THR = 0.3
 # Model cache (load once, reuse across requests)
 _pose_model_cache: dict = {}
-_detector_cache = None
 _metainfo_cache = None
@@ -98,13 +95,12 @@ def _get_metainfo():
 def _get_detector():
-    global _detector_cache
-    if _detector_cache is None:
-        ckpt = hf_hub_download(repo_id=DETECTOR_REPO, filename=DETECTOR_CKPT_FILENAME)
-        det = init_detector(DETECTOR_CONFIG, ckpt, device=DEVICE)
-        det.cfg = adapt_mmdet_pipeline(det.cfg)
-        _detector_cache = det
-    return _detector_cache
 def _get_pose_model(size: str):
@@ -133,15 +129,23 @@ print("[startup] ready.")
 # -----------------------------------------------------------------------------
 # Inference
-def _detect_persons(image_bgr: np.ndarray) -> np.ndarray:
-    detector = _get_detector()
-    det = inference_detector(detector, image_bgr)
-    inst = det.pred_instances.cpu().numpy()
-    bboxes = np.concatenate((inst.bboxes, inst.scores[:, None]), axis=1)
-    bboxes = bboxes[(inst.labels == 0) & (inst.scores > BBOX_THR)]
-    bboxes = bboxes[nms(bboxes, NMS_THR), :4]  # x1,y1,x2,y2
     if len(bboxes) == 0:
-        h, w = image_bgr.shape[:2]
         bboxes = np.array([[0, 0, w - 1, h - 1]], dtype=np.float32)
     return bboxes
@@ -181,7 +185,7 @@ def predict(image: Image.Image, size: str, kpt_thr: float):
     image_rgb = np.array(image.convert("RGB"))
     image_bgr = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)
-    bboxes = _detect_persons(image_bgr)
     model = _get_pose_model(size)
     keypoints, scores = _estimate_pose(image_bgr, bboxes, model)

 from sapiens.pose.evaluators import nms
 from sapiens.pose.models import init_model
+from transformers import DetrForObjectDetection, DetrImageProcessor
 from pose_render_utils import visualize_keypoints
 }
 DEFAULT_SIZE = "1B"
+DETECTOR_MODEL_ID = "facebook/detr-resnet-50"  # COCO person = label 1
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 BBOX_THR = 0.3
 # Model cache (load once, reuse across requests)
 _pose_model_cache: dict = {}
+_detector_cache: dict = {}
 _metainfo_cache = None
 def _get_detector():
+    if "model" not in _detector_cache:
+        proc = DetrImageProcessor.from_pretrained(DETECTOR_MODEL_ID)
+        model = DetrForObjectDetection.from_pretrained(DETECTOR_MODEL_ID).eval().to(DEVICE)
+        _detector_cache["proc"] = proc
+        _detector_cache["model"] = model
+    return _detector_cache["proc"], _detector_cache["model"]
 def _get_pose_model(size: str):
 # -----------------------------------------------------------------------------
 # Inference
+def _detect_persons(image_rgb: np.ndarray) -> np.ndarray:
+    proc, model = _get_detector()
+    pil_img = Image.fromarray(image_rgb)
+    inputs = proc(images=pil_img, return_tensors="pt").to(DEVICE)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    target_sizes = torch.tensor([image_rgb.shape[:2]], device=DEVICE)  # (h, w)
+    results = proc.post_process_object_detection(
+        outputs, target_sizes=target_sizes, threshold=BBOX_THR
+    )[0]
+    person_mask = results["labels"] == 1  # COCO person
+    boxes = results["boxes"][person_mask].cpu().numpy()  # (N, 4) x1,y1,x2,y2
+    scores = results["scores"][person_mask].cpu().numpy().reshape(-1, 1)
+    bboxes = np.concatenate([boxes, scores], axis=1)  # (N, 5)
+    bboxes = bboxes[nms(bboxes, NMS_THR), :4]
     if len(bboxes) == 0:
+        h, w = image_rgb.shape[:2]
         bboxes = np.array([[0, 0, w - 1, h - 1]], dtype=np.float32)
     return bboxes
     image_rgb = np.array(image.convert("RGB"))
     image_bgr = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)
+    bboxes = _detect_persons(image_rgb)
     model = _get_pose_model(size)
     keypoints, scores = _estimate_pose(image_bgr, bboxes, model)

assets/rtmdet_m_640-8xb32_coco-person_no_nms.py DELETED Viewed

@@ -1,20 +0,0 @@
-_base_ = 'mmdet::rtmdet/rtmdet_m_8xb32-300e_coco.py'
-checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth'  # noqa
-model = dict(
-    backbone=dict(
-        init_cfg=dict(
-            type='Pretrained', prefix='backbone.', checkpoint=checkpoint)),
-    bbox_head=dict(num_classes=1),
-    test_cfg=dict(
-        nms_pre=1000,
-        min_bbox_size=0,
-        score_thr=0.05,
-        nms=None,
-        max_per_img=100))
-train_dataloader = dict(dataset=dict(metainfo=dict(classes=('person', ))))
-val_dataloader = dict(dataset=dict(metainfo=dict(classes=('person', ))))
-test_dataloader = val_dataloader

detector_utils.py DELETED Viewed

@@ -1,196 +0,0 @@
-from typing import List, Optional, Sequence, Union
-import torch
-import cv2
-import numpy as np
-from mmcv.ops import RoIPool
-from mmengine.dataset import Compose, pseudo_collate
-from mmengine.device import get_device
-from mmengine.registry import init_default_scope
-from mmdet.apis import inference_detector, init_detector
-from mmdet.structures import DetDataSample, SampleList
-from mmdet.utils import get_test_pipeline_cfg
-ImagesType = Union[str, np.ndarray, Sequence[str], Sequence[np.ndarray]]
-def nms(dets: np.ndarray, thr: float):
-    """Greedily select boxes with high confidence and overlap <= thr.
-    Args:
-        dets (np.ndarray): [[x1, y1, x2, y2, score]].
-        thr (float): Retain overlap < thr.
-    Returns:
-        list: Indexes to keep.
-    """
-    if len(dets) == 0:
-        return []
-    x1 = dets[:, 0]
-    y1 = dets[:, 1]
-    x2 = dets[:, 2]
-    y2 = dets[:, 3]
-    scores = dets[:, 4]
-    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
-    order = scores.argsort()[::-1]
-    keep = []
-    while len(order) > 0:
-        i = order[0]
-        keep.append(i)
-        xx1 = np.maximum(x1[i], x1[order[1:]])
-        yy1 = np.maximum(y1[i], y1[order[1:]])
-        xx2 = np.minimum(x2[i], x2[order[1:]])
-        yy2 = np.minimum(y2[i], y2[order[1:]])
-        w = np.maximum(0.0, xx2 - xx1 + 1)
-        h = np.maximum(0.0, yy2 - yy1 + 1)
-        inter = w * h
-        ovr = inter / (areas[i] + areas[order[1:]] - inter)
-        inds = np.where(ovr <= thr)[0]
-        order = order[inds + 1]
-    return keep
-def adapt_mmdet_pipeline(cfg):
-    """Converts pipeline types in MMDetection's test dataloader to use the
-    'mmdet' namespace.
-    Args:
-        cfg (ConfigDict): Configuration dictionary for MMDetection.
-    Returns:
-        ConfigDict: Configuration dictionary with updated pipeline types.
-    """
-    # use lazy import to avoid hard dependence on mmdet
-    from mmdet.datasets import transforms
-    if 'test_dataloader' not in cfg:
-        return cfg
-    pipeline = cfg.test_dataloader.dataset.pipeline
-    for trans in pipeline:
-        if trans['type'] in dir(transforms):
-            trans['type'] = 'mmdet.' + trans['type']
-    return cfg
-def inference_detector(
-    model: torch.nn.Module,
-    imgs: ImagesType,
-    test_pipeline: Optional[Compose] = None,
-    text_prompt: Optional[str] = None,
-    custom_entities: bool = False,
-) -> Union[DetDataSample, SampleList]:
-    """Inference image(s) with the detector.
-    Args:
-        model (nn.Module): The loaded detector.
-        imgs (str, ndarray, Sequence[str/ndarray]):
-           Either image files or loaded images.
-        test_pipeline (:obj:`Compose`): Test pipeline.
-    Returns:
-        :obj:`DetDataSample` or list[:obj:`DetDataSample`]:
-        If imgs is a list or tuple, the same length list type results
-        will be returned, otherwise return the detection results directly.
-    """
-    if isinstance(imgs, torch.Tensor):
-        if imgs.is_cuda:
-            imgs = imgs.cpu()
-        # Remove batch dimension and transpose
-        imgs = imgs.squeeze(0).permute(1, 2, 0).numpy()
-        # Ensure the data type is appropriate (uint8 for most image processing functions)
-        imgs = (imgs * 255).astype(np.uint8)
-    if isinstance(imgs, (list, tuple)) or (isinstance(imgs, np.ndarray) and len(imgs.shape) == 4):
-        is_batch = True
-    else:
-        imgs = [imgs]
-        is_batch = False
-    cfg = model.cfg
-    if test_pipeline is None:
-        cfg = cfg.copy()
-        test_pipeline = get_test_pipeline_cfg(cfg)
-        if isinstance(imgs[0], np.ndarray):
-            # Calling this method across libraries will result
-            # in module unregistered error if not prefixed with mmdet.
-            test_pipeline[0].type = "mmdet.LoadImageFromNDArray"
-        test_pipeline = Compose(test_pipeline)
-    if model.data_preprocessor.device.type == "cpu":
-        for m in model.modules():
-            assert not isinstance(
-                m, RoIPool
-            ), "CPU inference with RoIPool is not supported currently."
-    result_list = []
-    for i, img in enumerate(imgs):
-        # prepare data
-        if isinstance(img, np.ndarray):
-            # TODO: remove img_id.
-            data_ = dict(img=img, img_id=0)
-        else:
-            # TODO: remove img_id.
-            data_ = dict(img_path=img, img_id=0)
-        if text_prompt:
-            data_["text"] = text_prompt
-            data_["custom_entities"] = custom_entities
-        # build the data pipeline
-        data_ = test_pipeline(data_)
-        data_["inputs"] = [data_["inputs"]]
-        data_["data_samples"] = [data_["data_samples"]]
-        # forward the model
-        with torch.no_grad(), torch.autocast(device_type=get_device(), dtype=torch.bfloat16):
-            results = model.test_step(data_)[0]
-        result_list.append(results)
-    if not is_batch:
-        return result_list[0]
-    else:
-        return result_list
-def process_one_image_bbox(pred_instance, det_cat_id, bbox_thr, nms_thr):
-    bboxes = np.concatenate(
-        (pred_instance.bboxes, pred_instance.scores[:, None]), axis=1
-    )
-    bboxes = bboxes[
-        np.logical_and(
-            pred_instance.labels == det_cat_id,
-            pred_instance.scores > bbox_thr,
-        )
-    ]
-    bboxes = bboxes[nms(bboxes, nms_thr), :4]
-    return bboxes
-def process_images_detector(imgs, detector):
-    """Visualize predicted keypoints (and heatmaps) of one image."""
-    # predict bbox
-    det_results = inference_detector(detector, imgs)
-    pred_instances = list(
-        map(lambda det_result: det_result.pred_instances.numpy(), det_results)
-    )
-    bboxes_batch = list(
-        map(
-            lambda pred_instance: process_one_image_bbox(
-                pred_instance, 0, 0.3, 0.3 ## argparse.Namespace(det_cat_id=0, bbox_thr=0.3, nms_thr=0.3),
-            ),
-            pred_instances,
-        )
-    )
-    return bboxes_batch

requirements.txt CHANGED Viewed

@@ -1,7 +1,6 @@
 gradio==4.42.0
 spaces
-# Pinned to versions verified working together (sapiens2 + mmdet stack).
 torch==2.7.1
 torchvision==0.22.1
@@ -21,7 +20,6 @@ termcolor
 accelerate
 rich
-# RTMDet person detector stack (mmcv 2.2.0 builds cleanly on Python 3.12 + torch 2.7).
-mmengine==0.10.7
-mmcv==2.2.0
-mmdet==3.3.0

 gradio==4.42.0
 spaces
 torch==2.7.1
 torchvision==0.22.1
 accelerate
 rich
+# Person bbox detector — DETR via HuggingFace transformers (Apache 2.0, GPU-friendly).
+transformers
+timm