| import sys |
| import os |
| from pathlib import Path |
|
|
| import numpy as np |
| from tqdm import tqdm |
| import torch |
| from ultralytics import YOLO |
|
|
| |
# Make the vendored HaWoR package importable regardless of the working
# directory: resolve <this file>/../../thirdparty/HaWoR and prepend it to
# sys.path (prepend so the vendored copy wins over any installed one).
current_file_dir = os.path.dirname(os.path.abspath(__file__))
hawor_path = os.path.abspath(os.path.join(current_file_dir, '..', '..', 'thirdparty', 'HaWoR'))
if hawor_path not in sys.path:
    sys.path.insert(0, hawor_path)
|
|
| from thirdparty.HaWoR.lib.models.hawor import HAWOR |
| from thirdparty.HaWoR.lib.pipeline.tools import parse_chunks |
| from thirdparty.HaWoR.lib.eval_utils.custom_utils import interpolate_bboxes |
| from thirdparty.HaWoR.hawor.utils.rotation import angle_axis_to_rotation_matrix, rotation_matrix_to_angle_axis |
| from thirdparty.HaWoR.hawor.configs import get_config |
|
|
|
|
def load_hawor(checkpoint_path: str):
    """
    Load the HAWOR model and its configuration from a checkpoint.

    Args:
        checkpoint_path (str): Path to the model checkpoint file or HuggingFace
            repo ID (e.g., 'username/model-name').

    Returns:
        tuple: (HAWOR model instance, model configuration object)
    """
    from huggingface_hub import hf_hub_download

    # A string containing '/' that is not an existing local path is treated
    # as a HuggingFace Hub repo ID.
    is_hf_repo = '/' in checkpoint_path and not os.path.exists(checkpoint_path)

    if is_hf_repo:
        print(f"Downloading model from HuggingFace: {checkpoint_path}")
        checkpoint_file = hf_hub_download(repo_id=checkpoint_path, filename="checkpoints/hawor.ckpt")
        config_file = hf_hub_download(repo_id=checkpoint_path, filename="config.yaml")
        print(f"Downloaded checkpoint to: {checkpoint_file}")
        print(f"Downloaded config to: {config_file}")
        print(f"Checkpoint exists: {os.path.exists(checkpoint_file)}")
        model_cfg_path = Path(config_file)
    else:
        checkpoint_file = checkpoint_path
        # Local layout assumption: <root>/checkpoints/<ckpt> next to
        # <root>/config.yaml — TODO confirm against the HaWoR release layout.
        model_cfg_path = Path(checkpoint_path).parent.parent / 'config.yaml'
        print(f"Using local checkpoint: {checkpoint_file}")
        print(f"Using local config: {model_cfg_path}")

    print(f"Loading config from: {model_cfg_path}")
    model_cfg = get_config(str(model_cfg_path), update_cachedir=True)

    # Older ViT-backbone configs lack BBOX_SHAPE; patch it in (config objects
    # are frozen yacs-style nodes, hence defrost/freeze).
    if (model_cfg.MODEL.BACKBONE.TYPE == 'vit') and ('BBOX_SHAPE' not in model_cfg.MODEL):
        model_cfg.defrost()
        assert model_cfg.MODEL.IMAGE_SIZE == 256, \
            f"MODEL.IMAGE_SIZE ({model_cfg.MODEL.IMAGE_SIZE}) should be 256 for ViT backbone"
        model_cfg.MODEL.BBOX_SHAPE = [192, 256]
        model_cfg.freeze()

    # Load on CPU first; the caller moves the model to its target device.
    print(f"Loading HAWOR model from checkpoint: {checkpoint_file}")
    model = HAWOR.load_from_checkpoint(
        checkpoint_file,
        strict=False,
        cfg=model_cfg,
        map_location='cpu'
    )

    return model, model_cfg
|
|
class HaworPipeline:
    """
    End-to-end pipeline: hand detection + tracking (YOLO) followed by
    HAWOR 3D hand motion estimation.
    """

    def __init__(
        self,
        model_path: str = '',
        detector_path: str = '',
        device: torch.device = torch.device("cuda")
    ):
        """
        Load the HAWOR model and remember the detector weights path.

        Args:
            model_path (str): Path to the HAWOR checkpoint.
            detector_path (str): Path to the hand detector (YOLO) weights.
            device (torch.device): Device to load models onto.
        """
        self.device = device
        self._original_device = device
        self._checkpoint_path = model_path
        self.detector_path = detector_path

        hawor_model, hawor_cfg = load_hawor(model_path)
        self.model = hawor_model.to(device)
        self.model.eval()
        self.model_cfg = hawor_cfg

    def recon(
        self,
        images: list,
        img_focal: float,
        thresh: float = 0.2,
        single_image: bool = False
    ) -> dict:
        """
        Detect, track, and reconstruct hands in 3D for a frame sequence.

        Args:
            images (list): List of consecutive input image frames (cv2/numpy format).
            img_focal (float): Focal length of the camera in pixels.
            thresh (float): Confidence threshold for hand detection.
            single_image (bool): Flag for single-image processing mode.

        Returns:
            dict: Reconstruction results keyed by 'left' and 'right'.
        """
        # The detector is built per call and released afterwards so its
        # weights do not stay resident between reconstructions.
        detector = YOLO(self.detector_path)
        _, tracks = detect_track(images, detector, thresh=thresh)

        recon_results = hawor_motion_estimation(
            images, tracks, self.model, img_focal, single_image=single_image
        )

        del detector
        return recon_results
|
|
| |
def detect_track(imgfiles: list, hand_det_model: YOLO, thresh: float = 0.5) -> tuple:
    """
    Detects and tracks hands across a sequence of images using YOLO.

    Args:
        imgfiles (list): List of image frames (cv2/numpy format).
        hand_det_model (YOLO): The initialized YOLO hand detection model.
        thresh (float): Confidence threshold for detection.

    Returns:
        tuple: (empty list, kept for interface compatibility; dict mapping
        track ID -> list of per-frame detection dicts with keys
        'frame', 'det', 'det_box', 'det_handedness')
    """
    boxes_ = []  # never populated; kept so the return signature stays stable
    tracks = {}

    for t, img_cv2 in enumerate(tqdm(imgfiles)):
        with torch.no_grad():
            # Mixed precision for faster detector inference on CUDA.
            with torch.amp.autocast('cuda'):
                results = hand_det_model.track(img_cv2, conf=thresh, persist=True, verbose=False)

        boxes = results[0].boxes.xyxy.cpu().numpy()
        confs = results[0].boxes.conf.cpu().numpy()
        # Class is used as handedness: 0 = left, >0 = right (inferred from
        # usage below — confirm against the detector's training labels).
        handedness = results[0].boxes.cls.cpu().numpy()
        if results[0].boxes.id is not None:
            track_id = results[0].boxes.id.cpu().numpy()
        else:
            # Tracker assigned no IDs this frame; mark all boxes untracked.
            track_id = [-1] * len(boxes)

        # Append confidence as a 5th column: [x1, y1, x2, y2, conf].
        boxes = np.hstack([boxes, confs[:, None]])

        # Keep at most one right and one left hand per frame (first hit wins).
        find_right = False
        find_left = False

        for idx, box in enumerate(boxes):
            if track_id[idx] == -1:
                # Untracked detections collapse into one shared ID per
                # handedness (10000 = right, 5000 = left).
                tid = 10000 if handedness[idx] > 0 else 5000
            else:
                tid = track_id[idx]

            subj = dict()
            subj['frame'] = t
            subj['det'] = True
            # [[idx]] keeps 2-D/1-D array shapes for later np.concatenate.
            subj['det_box'] = boxes[[idx]]
            subj['det_handedness'] = handedness[[idx]]

            if (not find_right and handedness[idx] > 0) or (not find_left and handedness[idx] == 0):
                tracks.setdefault(tid, []).append(subj)

                if handedness[idx] > 0:
                    find_right = True
                elif handedness[idx] == 0:
                    find_left = True

    return boxes_, tracks
|
|
| |
def hawor_motion_estimation(
    imgfiles: list,
    tracks: dict,
    model: HAWOR,
    img_focal: float,
    single_image: bool = False
) -> dict:
    """
    Performs HAWOR 3D hand reconstruction on detected and tracked hand regions.

    Args:
        imgfiles (list): List of image frames.
        tracks (dict): Dictionary mapping track ID to a list of detection
            objects (dicts with 'frame', 'det', 'det_box', 'det_handedness',
            as produced by detect_track).
        model (HAWOR): The initialized HAWOR model.
        img_focal (float): Camera focal length.
        single_image (bool): Flag for single-image processing mode.

    Returns:
        dict: {'left': {...}, 'right': {...}}; each inner dict maps frame index
        to a result dict with numpy arrays 'beta', 'hand_pose',
        'global_orient', and 'transl'.
    """
    left_results = {}
    right_results = {}

    tid = np.array([tr for tr in tracks])

    # Merge all raw tracklets into exactly two tracks (0 = left, 1 = right);
    # each tracklet is assigned by the majority handedness of its detections.
    left_trk = []
    right_trk = []
    for k, idx in enumerate(tid):
        trk = tracks[idx]

        valid = np.array([t['det'] for t in trk])
        is_right = np.concatenate([t['det_handedness'] for t in trk])[valid]

        if is_right.sum() / len(is_right) < 0.5:
            left_trk.extend(trk)
        else:
            right_trk.extend(trk)
    left_trk = sorted(left_trk, key=lambda x: x['frame'])
    right_trk = sorted(right_trk, key=lambda x: x['frame'])
    final_tracks = {
        0: left_trk,
        1: right_trk
    }
    tid = [0, 1]

    # Principal point is taken as the image center — assumes an undistorted,
    # centered camera model; TODO confirm against the calibration source.
    img = imgfiles[0]
    img_center = [img.shape[1] / 2, img.shape[0] / 2]
    H, W = img.shape[:2]

    for idx in tid:
        print(f"tracklet {idx}:")
        trk = final_tracks[idx]

        # Video mode needs at least 2 detections; single-image mode needs 1.
        valid = np.array([t['det'] for t in trk])
        if not single_image:
            if valid.sum() < 2:
                continue
        else:
            if valid.sum() < 1:
                continue
        boxes = np.concatenate([t['det_box'] for t in trk])
        # Fill gaps (all-zero boxes) between the first and last real detection
        # by interpolation, then mark those frames as valid.
        non_zero_indices = np.where(np.any(boxes != 0, axis=1))[0]
        first_non_zero = non_zero_indices[0]
        last_non_zero = non_zero_indices[-1]
        boxes[first_non_zero:last_non_zero+1] = interpolate_bboxes(boxes[first_non_zero:last_non_zero+1])
        valid[first_non_zero:last_non_zero+1] = True

        # NOTE(review): boxes is sliced by position while is_right/frame are
        # masked by the (now widened) valid mask — these lengths only agree
        # when no entries precede first_non_zero or follow last_non_zero;
        # verify with a track that starts with zero boxes.
        boxes = boxes[first_non_zero:last_non_zero+1]
        is_right = np.concatenate([t['det_handedness'] for t in trk])[valid]
        frame = np.array([t['frame'] for t in trk])[valid]

        # Force a single handedness for the whole track (majority vote).
        if is_right.sum() / len(is_right) < 0.5:
            is_right = np.zeros((len(boxes), 1))
        else:
            is_right = np.ones((len(boxes), 1))

        # Split into temporally contiguous chunks for batched inference.
        frame_chunks, boxes_chunks = parse_chunks(frame, boxes, min_len=1)

        if len(frame_chunks) == 0:
            continue

        for frame_ck, boxes_ck in zip(frame_chunks, boxes_chunks):
            print(f"inference from frame {frame_ck[0]} to {frame_ck[-1]}")
            img_ck = [imgfiles[i] for i in frame_ck]
            # Left hands are mirrored before inference (do_flip=True),
            # presumably because the model is right-hand only — confirm
            # against HAWOR.inference.
            if is_right[0] > 0:
                do_flip = False
            else:
                do_flip = True

            results = model.inference(img_ck, boxes_ck, img_focal=img_focal, img_center=img_center, do_flip=do_flip)

            # Add a leading batch dimension; index 0 of pred_rotmat is the
            # root joint, the rest are finger joints (assumed layout —
            # confirm against HAWOR's output spec).
            data_out = {
                "init_root_orient": results["pred_rotmat"][None, :, 0],
                "init_hand_pose": results["pred_rotmat"][None, :, 1:],
                "init_trans": results["pred_trans"][None, :, 0],
                "init_betas": results["pred_shape"][None, :]
            }

            # For flipped (left) hands, undo the mirroring by negating the
            # y/z components of the root axis-angle, then convert both
            # root and hand pose back to rotation matrices.
            init_root = rotation_matrix_to_angle_axis(data_out["init_root_orient"])
            init_hand_pose = rotation_matrix_to_angle_axis(data_out["init_hand_pose"])
            if do_flip:
                init_root[..., 1] *= -1
                init_root[..., 2] *= -1
            data_out["init_root_orient"] = angle_axis_to_rotation_matrix(init_root)
            data_out["init_hand_pose"] = angle_axis_to_rotation_matrix(init_hand_pose)

            s_frame = frame_ck[0]
            e_frame = frame_ck[-1]

            # Scatter per-frame tensors into numpy result dicts keyed by the
            # absolute frame index; idx 0 collects left, idx 1 right.
            for frame_id in range(s_frame, e_frame+1):
                result = {}
                result['beta'] = data_out['init_betas'][0, frame_id-s_frame].cpu().numpy()
                result['hand_pose'] = data_out['init_hand_pose'][0, frame_id-s_frame].cpu().numpy()
                result['global_orient'] = data_out['init_root_orient'][0, frame_id-s_frame].cpu().numpy()
                result['transl'] = data_out['init_trans'][0, frame_id-s_frame].cpu().numpy()

                if idx == 0:
                    left_results[frame_id] = result
                else:
                    right_results[frame_id] = result

    reformat_results = {'left': left_results, 'right': right_results}

    return reformat_results
|
|
|
|