Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Community

MogensR committed on Aug 26, 2025

Commit

e94d263

1 Parent(s): 19a2b07

Update utils/refinement.py

Browse files

Files changed (1) hide show

utils/refinement.py +33 -5

utils/refinement.py CHANGED Viewed

@@ -141,16 +141,29 @@ def _refine_with_matanyone(
         image_tensor = torch.from_numpy(image_rgb).permute(2, 0, 1).float() / 255.0
         image_tensor = image_tensor.unsqueeze(0).to(device)  # Add batch dimension and move to GPU
         # Ensure mask is binary uint8
         if mask.dtype != np.uint8:
             mask = (mask * 255).astype(np.uint8) if mask.max() <= 1 else mask.astype(np.uint8)
-        if mask.ndim == 3:
-            mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
         # Convert mask to tensor and move to GPU
         mask_tensor = torch.from_numpy(mask).float() / 255.0
         mask_tensor = mask_tensor.unsqueeze(0).unsqueeze(0).to(device)  # (1, 1, H, W) on GPU
         # Try different methods on InferenceCore
         result = None
@@ -224,10 +237,18 @@ def _refine_batch_with_matanyone(
         # Prepare first mask for initialization
         first_mask = masks[0]
-        if first_mask.dtype != np.uint8:
-            first_mask = (first_mask * 255).astype(np.uint8)
         if first_mask.ndim == 3:
-            first_mask = cv2.cvtColor(first_mask, cv2.COLOR_BGR2GRAY)
         # Convert first mask to tensor and move to GPU
         first_mask_tensor = torch.from_numpy(first_mask).float() / 255.0
@@ -262,6 +283,13 @@ def _refine_batch_with_matanyone(
                 # Fallback to processing each frame with its mask
                 log.warning("MatAnyone batch processing not available, using frame-by-frame")
                 for frame_tensor, mask in zip(frame_tensors, masks):
                     mask_tensor = torch.from_numpy(mask).float() / 255.0
                     mask_tensor = mask_tensor.unsqueeze(0).unsqueeze(0).to(device)
                     frame_on_device = frame_tensor.unsqueeze(0).to(device)

         image_tensor = torch.from_numpy(image_rgb).permute(2, 0, 1).float() / 255.0
         image_tensor = image_tensor.unsqueeze(0).to(device)  # Add batch dimension and move to GPU
+        # CRITICAL: Ensure mask is 2D before processing
+        if mask.ndim == 3:
+            # Convert multi-channel to single channel
+            if mask.shape[2] == 3:
+                mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
+            else:
+                mask = mask[:, :, 0]
         # Ensure mask is binary uint8
         if mask.dtype != np.uint8:
             mask = (mask * 255).astype(np.uint8) if mask.max() <= 1 else mask.astype(np.uint8)
+        # Final verification that mask is 2D
+        assert mask.ndim == 2, f"Mask must be 2D after conversion, got shape {mask.shape}"
+        assert mask.shape == (h, w), f"Mask shape {mask.shape} doesn't match image shape ({h}, {w})"
         # Convert mask to tensor and move to GPU
         mask_tensor = torch.from_numpy(mask).float() / 255.0
         mask_tensor = mask_tensor.unsqueeze(0).unsqueeze(0).to(device)  # (1, 1, H, W) on GPU
+        # Verify tensor dimensions
+        assert mask_tensor.shape == (1, 1, h, w), f"Mask tensor wrong shape: {mask_tensor.shape}, expected (1, 1, {h}, {w})"
         # Try different methods on InferenceCore
         result = None
         # Prepare first mask for initialization
         first_mask = masks[0]
+        # CRITICAL: Ensure first mask is 2D
         if first_mask.ndim == 3:
+            if first_mask.shape[2] == 3:
+                first_mask = cv2.cvtColor(first_mask, cv2.COLOR_BGR2GRAY)
+            else:
+                first_mask = first_mask[:, :, 0]
+        if first_mask.dtype != np.uint8:
+            first_mask = (first_mask * 255).astype(np.uint8) if first_mask.max() <= 1 else first_mask.astype(np.uint8)
+        assert first_mask.ndim == 2, f"First mask must be 2D, got shape {first_mask.shape}"
         # Convert first mask to tensor and move to GPU
         first_mask_tensor = torch.from_numpy(first_mask).float() / 255.0
                 # Fallback to processing each frame with its mask
                 log.warning("MatAnyone batch processing not available, using frame-by-frame")
                 for frame_tensor, mask in zip(frame_tensors, masks):
+                    # Ensure each mask is 2D
+                    if mask.ndim == 3:
+                        if mask.shape[2] == 3:
+                            mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
+                        else:
+                            mask = mask[:, :, 0]
                     mask_tensor = torch.from_numpy(mask).float() / 255.0
                     mask_tensor = mask_tensor.unsqueeze(0).unsqueeze(0).to(device)
                     frame_on_device = frame_tensor.unsqueeze(0).to(device)