Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Files Community

MogensR committed on Aug 28, 2025

Commit

28e0f6c

1 Parent(s): e21220a

Create utils/two_stage_processor.py

Browse files

Files changed (1) hide show

utils/two_stage_processor.py +306 -0

utils/two_stage_processor.py ADDED Viewed

	@@ -0,0 +1,306 @@

+"""
+Fixed SAM2 + MatAnyone Integration
+Corrects tensor dimension mismatches and ensures proper model cooperation
+"""
+import torch
+import numpy as np
+import cv2
+from typing import Optional, Tuple, List
+import logging
+logger = logging.getLogger(__name__)
class TwoStageProcessor:
    """Two-stage background removal: SAM2 segmentation, then MatAnyone matting.

    Each frame is first segmented into a coarse binary mask (``sam2``) and
    then refined into an alpha matte (``matanyone``). Every stage degrades
    gracefully: a failing SAM2 call falls back to a centred ellipse, and a
    failing MatAnyone call falls back to a lightly smoothed version of the
    coarse mask.
    """

    def __init__(self, sam2_model, matanyone_model, device='cuda'):
        """Store the two models and the device tensors are moved to.

        Args:
            sam2_model: Object exposing ``generate_mask``; may be ``None``,
                in which case the ellipse fallback mask is used.
            matanyone_model: Callable (or object exposing ``process``); may
                be ``None``, in which case the simple composite is used.
            device: Torch device (string or ``torch.device``) for inference.
        """
        self.sam2 = sam2_model
        self.matanyone = matanyone_model
        self.device = device
        logger.info("TwoStageProcessor initialized on %s", device)

    def _autocast(self):
        """Return a bfloat16 autocast context that is only active on CUDA.

        ``torch.autocast`` replaces the deprecated ``torch.cuda.amp.autocast``
        and, unlike it, can be cleanly disabled on CPU without a warning.
        """
        on_cuda = str(self.device).startswith('cuda')
        return torch.autocast(
            device_type='cuda' if on_cuda else 'cpu',
            dtype=torch.bfloat16,
            enabled=on_cuda,
        )

    def process_frame(self, frame: np.ndarray, prev_mask: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
        """
        Process a single frame through SAM2 + MatAnyone
        Args:
            frame: RGB frame (H, W, 3) as numpy array
            prev_mask: Optional previous frame mask for temporal consistency
        Returns:
            processed_frame: Frame with background removed (H, W, 4) RGBA
            mask: Binary mask (H, W) as uint8
        """
        H, W = frame.shape[:2]
        try:
            # Step 1: coarse mask from SAM2 (or the ellipse fallback)
            mask = self._get_sam2_mask(frame, prev_mask)
            # Step 2: refine with MatAnyone when available
            if self.matanyone is not None and mask is not None:
                processed = self._process_with_matanyone(frame, mask)
                if processed is not None:
                    return processed, mask
            # Fallback: use the coarse mask directly as alpha
            return self._simple_composite(frame, mask), mask
        except Exception as e:
            logger.error("Frame processing failed: %s", e)
            # Last resort: pass the frame through fully opaque
            rgba = np.zeros((H, W, 4), dtype=np.uint8)
            rgba[:, :, :3] = frame
            rgba[:, :, 3] = 255
            return rgba, np.ones((H, W), dtype=np.uint8) * 255

    def _get_sam2_mask(self, frame: np.ndarray, prev_mask: Optional[np.ndarray]) -> np.ndarray:
        """Return a 0/255 uint8 mask from SAM2, or the ellipse fallback.

        The fallback is used when ``self.sam2`` has no ``generate_mask``
        attribute or when the call raises.
        """
        H, W = frame.shape[:2]
        if not hasattr(self.sam2, 'generate_mask'):
            logger.warning("Using fallback mask generation")
            return self._generate_center_mask(H, W)
        try:
            # no_grad: outputs of a model with trainable parameters would
            # otherwise require grad and refuse conversion to numpy.
            with torch.no_grad(), self._autocast():
                frame_tensor = torch.from_numpy(frame).to(self.device).float() / 255.0
                frame_tensor = frame_tensor.permute(2, 0, 1).unsqueeze(0)  # (1, 3, H, W)
                if prev_mask is not None:
                    prev_mask_tensor = torch.from_numpy(prev_mask).to(self.device).float() / 255.0
                    prev_mask_tensor = prev_mask_tensor.unsqueeze(0).unsqueeze(0)  # (1, 1, H, W)
                    mask_logits = self.sam2.generate_mask(frame_tensor, prev_mask_tensor)
                else:
                    mask_logits = self.sam2.generate_mask(frame_tensor)
            # .float(): numpy has no bfloat16, which autocast may produce.
            logits = mask_logits.detach().squeeze().float().cpu().numpy()
            return (logits > 0).astype(np.uint8) * 255
        except Exception as e:
            logger.error("SAM2 mask generation failed: %s", e)
            return self._generate_center_mask(H, W)

    def _generate_center_mask(self, H: int, W: int) -> np.ndarray:
        """Generate a centred elliptical 0/255 mask as fallback."""
        mask = np.zeros((H, W), dtype=np.uint8)
        center_x, center_y = W // 2, H // 2
        # Clamp to 1 so frames narrower/shorter than 3 px do not divide by zero
        axes_x, axes_y = max(W // 3, 1), max(H // 3, 1)
        y, x = np.ogrid[:H, :W]
        mask_area = ((x - center_x) / axes_x) ** 2 + ((y - center_y) / axes_y) ** 2 <= 1
        mask[mask_area] = 255
        # Blur then re-threshold to round off the pixelated ellipse boundary
        mask = cv2.GaussianBlur(mask, (21, 21), 10)
        mask = (mask > 128).astype(np.uint8) * 255
        return mask

    def _process_with_matanyone(self, frame: np.ndarray, mask: np.ndarray) -> Optional[np.ndarray]:
        """Refine ``mask`` into an alpha matte with MatAnyone.

        Returns:
            An (H, W, 4) uint8 RGBA array when the callable API is used,
            whatever ``matanyone.process`` returns when only that API exists,
            or ``None`` when MatAnyone is unusable or raises.
        """
        try:
            H, W = frame.shape[:2]
            # Frame is handed over as (H, W, 3) uint8
            if frame.dtype != np.uint8:
                frame = (frame * 255).astype(np.uint8) if frame.max() <= 1 else frame.astype(np.uint8)
            # Mask is handed over as (H, W, 1) float32 in [0, 1]
            mask_input = mask.astype(np.float32) / 255.0
            if mask_input.ndim == 2:
                mask_input = mask_input[:, :, np.newaxis]

            if callable(self.matanyone):
                with torch.no_grad(), self._autocast():
                    frame_tensor = torch.from_numpy(frame).to(self.device).float() / 255.0
                    frame_tensor = frame_tensor.permute(2, 0, 1).unsqueeze(0)  # (1, 3, H, W)
                    mask_tensor = torch.from_numpy(mask_input).to(self.device).float()
                    mask_tensor = mask_tensor.permute(2, 0, 1).unsqueeze(0)  # (1, 1, H, W)
                    result = self.matanyone(frame_tensor, mask_tensor)
                if result is None:
                    return None
                # Assume the first tuple element is the alpha matte
                alpha = result[0] if isinstance(result, tuple) else result
                # .float(): numpy cannot represent bfloat16 tensors
                alpha = alpha.detach().squeeze(0).squeeze(0).float().cpu().numpy()  # (H, W)
                # Clip before the uint8 cast so slight overshoot cannot wrap around
                alpha = (np.clip(alpha, 0.0, 1.0) * 255).astype(np.uint8)
                rgba = np.zeros((H, W, 4), dtype=np.uint8)
                rgba[:, :, :3] = frame
                rgba[:, :, 3] = alpha
                return rgba

            if hasattr(self.matanyone, 'process'):
                # Alternative MatAnyone API operating on numpy inputs
                with torch.no_grad(), self._autocast():
                    return self.matanyone.process(frame, mask_input)
            return None
        except Exception as e:
            logger.warning("MatAnyone processing failed: %s", e)
            return None

    def _simple_composite(self, frame: np.ndarray, mask: np.ndarray) -> np.ndarray:
        """Build an RGBA frame using a lightly cleaned ``mask`` as alpha."""
        H, W = frame.shape[:2]
        # Morphological open removes speckles; the blur feathers the edge
        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
        mask = cv2.GaussianBlur(mask, (5, 5), 1)
        rgba = np.zeros((H, W, 4), dtype=np.uint8)
        rgba[:, :, :3] = frame
        rgba[:, :, 3] = mask
        return rgba

    @staticmethod
    def _to_bgr_on_green(processed: np.ndarray) -> np.ndarray:
        """Flatten an RGB(A) frame to BGR, compositing RGBA onto pure green."""
        if processed.shape[2] == 4:
            alpha = processed[:, :, 3:4] / 255.0
            rgb = processed[:, :, :3]
            # Broadcasting a single green pixel avoids a full-frame allocation
            green = np.array([0.0, 255.0, 0.0])
            composited = (rgb * alpha + green * (1 - alpha)).astype(np.uint8)
            return cv2.cvtColor(composited, cv2.COLOR_RGB2BGR)
        return cv2.cvtColor(processed, cv2.COLOR_RGB2BGR)

    def process_video(self, video_path: str, output_path: str, progress_callback=None):
        """Process an entire video and write a green-screen composite.

        Args:
            video_path: Path of the input video.
            output_path: Path of the mp4v-encoded output video.
            progress_callback: Optional callable receiving the completed
                fraction in [0, 1] after each frame. It is not called when
                the container does not report a positive frame count.

        Raises:
            IOError: If the input cannot be opened or the writer cannot be
                created.
        """
        cap = cv2.VideoCapture(video_path)
        out = None
        try:
            if not cap.isOpened():
                raise IOError(f"Could not open input video: {video_path}")
            # Keep the fractional rate (e.g. 29.97) so duration is preserved
            fps = cap.get(cv2.CAP_PROP_FPS)
            if not fps > 0:  # also catches NaN
                logger.warning("Input reports no valid FPS; assuming 30")
                fps = 30.0
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            # mp4v carries no alpha channel, so frames are flattened onto green
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter(output_path, fourcc, fps, (W, H), True)
            if not out.isOpened():
                raise IOError(f"Could not open output video for writing: {output_path}")

            prev_mask = None
            frame_idx = 0
            logger.info("Processing %s frames at %.2ffps", total_frames, fps)
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                # OpenCV decodes to BGR; the models expect RGB
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                processed, mask = self.process_frame(frame_rgb, prev_mask)
                prev_mask = mask  # Use for temporal consistency
                out.write(self._to_bgr_on_green(processed))
                frame_idx += 1
                # Frame-count metadata can be missing (<= 0) or undercount
                if progress_callback and total_frames > 0:
                    progress_callback(min(frame_idx / total_frames, 1.0))
                if frame_idx % 30 == 0:
                    logger.info("Processed %s/%s frames", frame_idx, total_frames)
            logger.info("Video processing complete: %s", output_path)
        finally:
            cap.release()
            if out is not None:
                out.release()
+# Fix for the current MatAnyone loader issue
class MatAnyoneLoaderFix:
    """Fixes for the MatAnyone dimension mismatch issues"""

    # Marker attribute set on already-wrapped models to keep wrapping idempotent
    _PATCH_FLAG = '_matanyone_dims_fixed'

    @staticmethod
    def _normalize_inputs(image, mask):
        """Coerce tensors to image (1, 3, H, W) and mask (1, 1, H, W).

        Handles unbatched and channels-last layouts, and resizes the mask to
        the image's spatial size when they differ.
        """
        if image.dim() == 3:
            image = image.unsqueeze(0)
        # Only treat the tensor as channels-last when the last axis really is RGB
        if image.dim() == 4 and image.shape[1] != 3 and image.shape[-1] == 3:
            image = image.permute(0, 3, 1, 2)

        if mask.dim() == 2:
            mask = mask.unsqueeze(0).unsqueeze(0)
        elif mask.dim() == 3:
            if mask.shape[0] != 1 and mask.shape[-1] == 1:
                mask = mask.permute(2, 0, 1)  # (H, W, 1) -> (1, H, W)
            mask = mask.unsqueeze(0)  # (1, H, W) -> (1, 1, H, W)
        elif mask.dim() == 4 and mask.shape[1] != 1 and mask.shape[-1] == 1:
            mask = mask.permute(0, 3, 1, 2)  # (N, H, W, 1) -> (N, 1, H, W)

        if image.shape[-2:] != mask.shape[-2:]:
            # Bilinear interpolation is only defined for floating-point input
            if not mask.is_floating_point():
                mask = mask.float()
            mask = torch.nn.functional.interpolate(
                mask, size=image.shape[-2:], mode='bilinear', align_corners=False
            )
        return image, mask

    @staticmethod
    def fix_matanyone_call(matanyone_model):
        """Wrap a MatAnyone model so calls get shape-normalized inputs.

        Python resolves ``model(...)`` through ``type(model).__call__``, so
        assigning ``__call__`` on the instance has no effect. Instead the
        instance is switched to a dynamically created subclass that overrides
        ``__call__``; ``isinstance`` checks and all attributes keep working.
        Objects whose class cannot be swapped (plain functions, bound
        methods) are wrapped in a closure instead.

        Args:
            matanyone_model: The model object; returned untouched when it is
                not callable or has already been wrapped.

        Returns:
            The same object with patched call behaviour, or a wrapper
            function. The wrapped call returns ``None`` (and logs) when
            normalization or the underlying model raises.
        """
        flag = MatAnyoneLoaderFix._PATCH_FLAG
        if not callable(matanyone_model) or getattr(matanyone_model, flag, False):
            return matanyone_model

        def guarded(call, image, mask, args, kwargs):
            try:
                image, mask = MatAnyoneLoaderFix._normalize_inputs(image, mask)
                return call(image, mask, *args, **kwargs)
            except Exception as e:
                logger.error(f"MatAnyone call fix failed: {e}")
                return None

        base_cls = type(matanyone_model)

        def patched_call(self, image, mask, *args, **kwargs):
            def call(*a, **k):
                # Dispatch to the original class to avoid recursing into the patch
                return base_cls.__call__(self, *a, **k)
            return guarded(call, image, mask, args, kwargs)

        try:
            # NOTE: a dynamic class cannot be pickled by reference; save
            # weights via state_dict rather than pickling the whole model.
            matanyone_model.__class__ = type(
                base_cls.__name__,
                (base_cls,),
                {
                    '__call__': patched_call,
                    '__module__': base_cls.__module__,
                    '__slots__': (),  # keep the instance layout compatible
                    flag: True,
                },
            )
            return matanyone_model
        except (TypeError, AttributeError):
            def wrapper(image, mask, *args, **kwargs):
                return guarded(matanyone_model, image, mask, args, kwargs)
            setattr(wrapper, flag, True)
            return wrapper
+# Integration with existing code
def initialize_two_stage_processor(sam2_loader, matanyone_loader, device='cuda'):
    """Build a TwoStageProcessor from two loader objects.

    Each loader is expected to expose its model as a ``model`` attribute. A
    loader that is ``None`` or has no ``model`` contributes ``None``, which
    makes the processor use its built-in fallbacks for that stage.

    Args:
        sam2_loader: Loader holding the SAM2 model, or ``None``.
        matanyone_loader: Loader holding the MatAnyone model, or ``None``.
            When it holds a model, the shape-fixing wrapper is applied and
            stored back on ``matanyone_loader.model``.
        device: Torch device passed through to the processor.

    Returns:
        The configured TwoStageProcessor.
    """
    # getattr with a default covers both a missing loader and a missing attribute
    sam2_model = getattr(sam2_loader, 'model', None)
    matanyone_model = getattr(matanyone_loader, 'model', None)
    if matanyone_model is not None:
        # Apply MatAnyone fixes and keep the loader pointing at the wrapped model
        matanyone_model = MatAnyoneLoaderFix.fix_matanyone_call(matanyone_model)
        matanyone_loader.model = matanyone_model
    return TwoStageProcessor(
        sam2_model=sam2_model,
        matanyone_model=matanyone_model,
        device=device,
    )