sam-vit-base-with-handler

@@ -5,7 +5,7 @@ import base64
 import numpy as np
 from PIL import Image
 import torch
-from transformers import SamModel, SamImageProcessor
 from typing import Dict, List, Any
 import torch.nn.functional as F
@@ -20,20 +20,84 @@ class EndpointHandler():
         """
         try:
             # Load the model and processor from the local path
-            self.model = SamModel.from_pretrained(path).to(device)
-            self.processor = SamImageProcessor.from_pretrained(path, do_resize=False)
         except Exception as e:
             # Fallback to loading from a known SAM model if local loading fails
-            print("Failed to load from local path: {}".format(e))
             print("Attempting to load from facebook/sam-vit-base")
-            self.model = SamModel.from_pretrained("facebook/sam-vit-base").to(device)
-            self.processor = SamImageProcessor.from_pretrained("facebook/sam-vit-base", do_resize=False)
     def __call__(self, data):
         """
         Called on every HTTP request.
         Handles both base64-encoded images and PIL images.
-        Returns a PIL Image object.
         """
         # 1. Parse and decode the input image
         inputs = data.pop("inputs", None)
@@ -51,69 +115,117 @@ class EndpointHandler():
         else:
             raise TypeError("Unsupported input type. Expected a PIL Image or a base64 encoded string.")
-        # 2. Prepare prompts and process the image
-        # height, width = img.size[1], img.size[0]
-        # input_points = [[[width // 2, height // 2]]]
-        # input_labels = [[1]]
-        inputs = self.processor(img, return_tensors="pt").to(device)
-        # 3. Generate masks
-        with torch.no_grad():
-            outputs = self.model(**inputs)
-        # 4. Process and select the best mask
         try:
             # Get predicted masks and scores
-            predicted_masks = outputs.pred_masks.cpu()
-            iou_scores = outputs.iou_scores.cpu()[0]
-            # Handle different tensor dimensions
-            if predicted_masks.ndim == 5:
-                predicted_masks = predicted_masks.squeeze(1)
-            # Select the best mask
-            best_mask_idx = torch.argmax(iou_scores)
-            best_mask = predicted_masks[0, best_mask_idx, :, :]
-            # Convert to binary mask (no resizing needed since processor doesn't resize)
-            mask_binary = (best_mask > 0.0).numpy().astype(np.uint8) * 255
         except Exception as e:
-            print("Error processing masks: {}".format(e))
-            # Fallback: create a simple mask
-            height, width = img.size[1], img.size[0]
             mask_binary = np.zeros((height, width), dtype=np.uint8)
             center_x, center_y = width // 2, height // 2
             size = min(width, height) // 8
-            mask_binary[center_y-size:center_y+size, center_x-size:center_x+size] = 255
-        # 5. Create and return the output PIL Image
-        output_img = Image.fromarray(mask_binary)
-        return [{'score': None, 'label': 'everything', 'mask': output_img}]
 def main():
     # This main function shows how a client would call the endpoint locally.
     input_path = "/Users/rp7/Downloads/test.jpeg"
-    output_path = "output.png"
     # 1. Prepare the payload with a base64-encoded image string
     with open(input_path, "rb") as f:
         img_bytes = f.read()
     img_b64 = base64.b64encode(img_bytes).decode("utf-8")
-    payload = {"inputs": "data:image/jpeg;base64,{}".format(img_b64)}
-    # 2. Instantiate handler and get the PIL Image result
     handler = EndpointHandler(path=".")
-    result = handler(payload)
-    # 3. Extract the image from the result and save it
-    if result and isinstance(result, list) and 'mask' in result[0]:
-        result_img = result[0]['mask']
-        result_img.save(output_path)
-        print("Wrote mask to {}".format(output_path))
     else:
-        print("Failed to get a valid mask from the handler.")
 if __name__ == "__main__":
     main()

 import numpy as np
 from PIL import Image
 import torch
+from transformers import SamModel, SamProcessor
 from typing import Dict, List, Any
 import torch.nn.functional as F
         """
         try:
             # Load the model and processor from the local path
+            self.model = SamModel.from_pretrained(path).to(device).eval()
+            # Load processor with do_resize=False to avoid resizing
+            self.processor = SamProcessor.from_pretrained(path)
+            # Override the processor's image processor to disable resizing
+            self.processor.image_processor.do_resize = False
+            self.processor.image_processor.do_rescale = True
+            self.processor.image_processor.do_normalize = True
         except Exception as e:
             # Fallback to loading from a known SAM model if local loading fails
+            print(f"Failed to load from local path: {e}")
             print("Attempting to load from facebook/sam-vit-base")
+            self.model = SamModel.from_pretrained("facebook/sam-vit-base").to(device).eval()
+            self.processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
+            # Override the processor's image processor to disable resizing
+            self.processor.image_processor.do_resize = False
+            self.processor.image_processor.do_rescale = True
+            self.processor.image_processor.do_normalize = True
+    def generate_grid_points(self, width, height, points_per_side=32):
+        """Build an evenly spaced grid of foreground prompt points over the image.
+
+        Args:
+            width: Image width in pixels; x coordinates span ``0 .. width - 1``.
+            height: Image height in pixels; y coordinates span ``0 .. height - 1``.
+            points_per_side: Number of grid positions along each axis, so the
+                grid holds ``points_per_side ** 2`` points.
+
+        Returns:
+            A ``([points], [labels])`` tuple. ``points`` is a row-major list of
+            ``[x, y]`` pairs of plain ``int`` values and ``labels`` holds one
+            ``1`` (foreground) per point. Each list is wrapped in a
+            one-element outer list.
+
+        Raises:
+            ValueError: If ``points_per_side`` is smaller than 1.
+        """
+        if points_per_side < 1:
+            raise ValueError(f"points_per_side must be at least 1, got {points_per_side}")
+        # .tolist() converts the NumPy integers into plain Python ints so the
+        # returned structure contains only built-in types.
+        x_coords = np.linspace(0, width - 1, points_per_side, dtype=int).tolist()
+        y_coords = np.linspace(0, height - 1, points_per_side, dtype=int).tolist()
+        # Row-major order: x varies fastest within each y row.
+        points = [[x, y] for y in y_coords for x in x_coords]
+        labels = [1] * len(points)  # 1 marks a foreground point
+        # NOTE(review): the whole grid sits under a single outer element; confirm
+        # against the SamProcessor docs whether that is read as one multi-point
+        # prompt or as one prompt per point.
+        return [points], [labels]
+    def filter_masks(self, masks, iou_scores, score_threshold=0.88, stability_score_threshold=0.95):
+        """Keep the masks whose IoU score and stability score both pass.
+
+        No duplicate removal is performed here; masks are only dropped for
+        failing one of the two thresholds.
+
+        Args:
+            masks: Iterable of mask tensors; each is binarized with ``> 0.0``
+                before its stability score is computed.
+            iou_scores: Iterable of scores paired with ``masks`` by position.
+                Single-element tensors and plain Python numbers are accepted.
+            score_threshold: A mask is kept only if its score is strictly
+                greater than this value.
+            stability_score_threshold: A mask is kept only if
+                ``calculate_stability_score`` returns strictly more than this.
+
+        Returns:
+            A ``(filtered_masks, filtered_scores)`` tuple: the surviving masks
+            (unchanged, not binarized) and their scores as Python floats.
+        """
+        filtered_masks = []
+        filtered_scores = []
+        for mask, score in zip(masks, iou_scores):
+            if not (score > score_threshold):
+                continue
+            stability_score = self.calculate_stability_score(mask > 0.0)
+            if stability_score > stability_score_threshold:
+                filtered_masks.append(mask)
+                # float() handles tensors and plain numbers alike, whereas
+                # .item() exists only on tensors.
+                filtered_scores.append(float(score))
+        return filtered_masks, filtered_scores
+    def calculate_stability_score(self, mask):
+        """Score a mask by how much of its bounding box it fills.
+
+        The result is the sum of the mask's values (the pixel count for a
+        boolean mask) divided by the area of the axis-aligned box enclosing
+        its non-zero entries.
+
+        Args:
+            mask: Tensor whose non-zero entries mark the segment. The
+                bounding-box step unpacks exactly two coordinate columns, so
+                a 2-D (rows, cols) tensor is expected.
+
+        Returns:
+            A Python float; ``0.0`` when the mask values sum to zero.
+        """
+        values = mask.float()
+        area = values.sum()
+        # Nothing set: there is no bounding box to measure.
+        if area == 0:
+            return 0.0
+        nonzero_idx = torch.nonzero(values)
+        if nonzero_idx.shape[0] == 0:
+            return 0.0
+        top, left = nonzero_idx.min(dim=0).values
+        bottom, right = nonzero_idx.max(dim=0).values
+        # Inclusive extents, so a single pixel yields a 1x1 box.
+        box_height = bottom - top + 1
+        box_width = right - left + 1
+        box_area = box_height * box_width
+        fill_ratio = area / box_area if box_area > 0 else 0.0
+        return fill_ratio.item()
     def __call__(self, data):
         """
         Called on every HTTP request.
         Handles both base64-encoded images and PIL images.
+        Returns a list of segment masks.
         """
         # 1. Parse and decode the input image
         inputs = data.pop("inputs", None)
         else:
             raise TypeError("Unsupported input type. Expected a PIL Image or a base64 encoded string.")
+        # 2. Get image dimensions
+        width, height = img.size
+        # 3. Generate grid points for comprehensive segmentation
+        input_points, input_labels = self.generate_grid_points(width, height, points_per_side=16)
+        # 4. Process the image and points
+        inputs = self.processor(
+            img,
+            input_points=input_points,
+            input_labels=input_labels,
+            return_tensors="pt"
+        ).to(device)
+        # 5. Generate masks
+        all_masks = []
+        all_scores = []
         try:
+            with torch.no_grad():
+                outputs = self.model(**inputs)
             # Get predicted masks and scores
+            predicted_masks = outputs.pred_masks.cpu()  # Shape: [batch, num_queries, num_masks_per_query, H, W]
+            iou_scores = outputs.iou_scores.cpu()       # Shape: [batch, num_queries, num_masks_per_query]
+            # Process masks from all queries
+            batch_size, num_queries, num_masks_per_query = predicted_masks.shape[:3]
+            for query_idx in range(num_queries):
+                query_masks = predicted_masks[0, query_idx]  # [num_masks_per_query, H, W]
+                query_scores = iou_scores[0, query_idx]      # [num_masks_per_query]
+                # Select best mask for this query
+                best_mask_idx = torch.argmax(query_scores)
+                if query_scores[best_mask_idx] > 0.5:  # Only keep high-quality masks
+                    best_mask = query_masks[best_mask_idx]
+                    all_masks.append(best_mask)
+                    all_scores.append(query_scores[best_mask_idx])
+            # Filter and deduplicate masks
+            if all_masks:
+                filtered_masks, filtered_scores = self.filter_masks(all_masks, all_scores)
+            else:
+                filtered_masks, filtered_scores = [], []
         except Exception as e:
+            print(f"Error processing masks: {e}")
+            # Fallback: create a simple center mask
             mask_binary = np.zeros((height, width), dtype=np.uint8)
             center_x, center_y = width // 2, height // 2
             size = min(width, height) // 8
+            y_start, y_end = max(0, center_y-size), min(height, center_y+size)
+            x_start, x_end = max(0, center_x-size), min(width, center_x+size)
+            mask_binary[y_start:y_end, x_start:x_end] = 255
+            output_img = Image.fromarray(mask_binary)
+            return [{'score': 0.5, 'label': 'fallback_segment', 'mask': output_img}]
+        # 6. Convert masks to PIL Images and prepare results
+        results = []
+        for i, (mask, score) in enumerate(zip(filtered_masks, filtered_scores)):
+            # Convert to binary mask
+            mask_binary = (mask > 0.0).numpy().astype(np.uint8) * 255
+            # Create PIL Image
+            output_img = Image.fromarray(mask_binary)
+            results.append({
+                'score': float(score),
+                'label': f'segment_{i}',
+                'mask': output_img
+            })
+        # If no segments found, return a fallback
+        if not results:
+            mask_binary = np.zeros((height, width), dtype=np.uint8)
+            output_img = Image.fromarray(mask_binary)
+            results.append({'score': 0.0, 'label': 'no_segments', 'mask': output_img})
+        return results
 def main():
+    """Run the handler locally on one image file and save every returned mask.
+
+    Reads the hard-coded ``input_path``, passes it to an ``EndpointHandler``
+    as a base64 data URI, and saves each result's ``mask`` under
+    ``output_dir`` with a ``.png`` file name.
+    """
     # This main function shows how a client would call the endpoint locally.
     input_path = "/Users/rp7/Downloads/test.jpeg"
+    output_dir = "output_masks"
+    # Create the output directory; exist_ok=True makes this a no-op if it already exists
+    import os
+    os.makedirs(output_dir, exist_ok=True)
     # 1. Prepare the payload with a base64-encoded image string
     with open(input_path, "rb") as f:
         img_bytes = f.read()
     img_b64 = base64.b64encode(img_bytes).decode("utf-8")
+    payload = {"inputs": f"data:image/jpeg;base64,{img_b64}"}
+    # 2. Instantiate the handler (model files looked up in the current directory) and get the result
     handler = EndpointHandler(path=".")
+    results = handler(payload)
+    # 3. Save every entry that carries a mask; the file name embeds the index and
+    #    the score, so each result's 'score' must be numeric for the :.3f format
+    if results and isinstance(results, list):
+        print(f"Found {len(results)} segments")
+        for i, result in enumerate(results):
+            if 'mask' in result:
+                output_path = os.path.join(output_dir, f"segment_{i}_score_{result['score']:.3f}.png")
+                result['mask'].save(output_path)
+                print(f"Saved {result['label']} (score: {result['score']:.3f}) to {output_path}")
     else:
+        print("Failed to get valid masks from the handler.")
+# Run the local demo only when this file is executed as a script, not on import.
 if __name__ == "__main__":
     main()