ASethi04 commited on
Commit
1472aed
·
verified ·
1 Parent(s): 298a67b

Initial upload of the VINE model architecture and weights

Browse files
Files changed (5) hide show
  1. config.json +6 -2
  2. flattening.py +124 -0
  3. model.safetensors +3 -0
  4. vine_model.py +702 -0
  5. vis_utils.py +941 -0
config.json CHANGED
@@ -1,9 +1,12 @@
1
  {
2
- "_attn_implementation_autoset": true,
3
  "_device": "cuda",
4
  "alpha": 0.5,
 
 
 
5
  "auto_map": {
6
- "AutoConfig": "vine_config.VineConfig"
 
7
  },
8
  "bbox_min_dim": 5,
9
  "box_threshold": 0.35,
@@ -26,6 +29,7 @@
26
  "target_fps": 1,
27
  "text_threshold": 0.25,
28
  "topk_cate": 3,
 
29
  "transformers_version": "4.46.2",
30
  "use_hf_repo": true,
31
  "visualization_dir": null,
 
1
  {
 
2
  "_device": "cuda",
3
  "alpha": 0.5,
4
+ "architectures": [
5
+ "VineModel"
6
+ ],
7
  "auto_map": {
8
+ "AutoConfig": "vine_config.VineConfig",
9
+ "AutoModel": "vine_model.VineModel"
10
  },
11
  "bbox_min_dim": 5,
12
  "box_threshold": 0.35,
 
29
  "target_fps": 1,
30
  "text_threshold": 0.25,
31
  "topk_cate": 3,
32
+ "torch_dtype": "float32",
33
  "transformers_version": "4.46.2",
34
  "use_hf_repo": true,
35
  "visualization_dir": null,
flattening.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from collections import defaultdict
4
+ from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+
9
+
10
+ MaskType = Union[np.ndarray, torch.Tensor]
11
+
12
+
13
+ def _to_numpy_mask(mask: MaskType) -> np.ndarray:
14
+ """
15
+ Convert assorted mask formats to a 2D numpy boolean array.
16
+ """
17
+ if isinstance(mask, torch.Tensor):
18
+ mask_np = mask.detach().cpu().numpy()
19
+ else:
20
+ mask_np = np.asarray(mask)
21
+
22
+ # Remove singleton dimensions at the front/back
23
+ while mask_np.ndim > 2 and mask_np.shape[0] == 1:
24
+ mask_np = np.squeeze(mask_np, axis=0)
25
+ if mask_np.ndim > 2 and mask_np.shape[-1] == 1:
26
+ mask_np = np.squeeze(mask_np, axis=-1)
27
+
28
+ if mask_np.ndim != 2:
29
+ raise ValueError(f"Expected mask to be 2D after squeezing, got shape {mask_np.shape}")
30
+
31
+ return mask_np.astype(bool)
32
+
33
+
34
+ def _mask_to_bbox(mask: np.ndarray) -> Optional[Tuple[int, int, int, int]]:
35
+ """
36
+ Compute a bounding box for a 2D boolean mask.
37
+ """
38
+ if not mask.any():
39
+ return None
40
+ rows, cols = np.nonzero(mask)
41
+ y_min, y_max = rows.min(), rows.max()
42
+ x_min, x_max = cols.min(), cols.max()
43
+ return x_min, y_min, x_max, y_max
44
+
45
+
46
def flatten_segments_for_batch(
    video_id: int,
    segments: Dict[int, Dict[int, MaskType]],
    bbox_min_dim: int = 5,
) -> Dict[str, List]:
    """
    Flatten nested segmentation data into batched lists suitable for predicate
    models or downstream visualizations.

    Objects with an empty mask, or a bounding box narrower/shorter than
    ``bbox_min_dim`` pixels, are dropped. For each frame, every ordered pair
    (i, j) with i != j of the surviving objects is emitted.

    Returns a dict with keys "object_ids", "masks", "bboxes" and "pairs".
    """
    batched_object_ids: List[Tuple[int, int, int]] = []
    batched_masks: List[np.ndarray] = []
    batched_bboxes: List[Tuple[int, int, int, int]] = []
    frame_pairs: List[Tuple[int, int, Tuple[int, int]]] = []

    for frame_id, frame_objects in segments.items():
        kept_ids: List[int] = []
        for object_id, raw_mask in frame_objects.items():
            # Normalize the mask to a 2D boolean array (tensor or array in).
            if isinstance(raw_mask, torch.Tensor):
                arr = raw_mask.detach().cpu().numpy()
            else:
                arr = np.asarray(raw_mask)
            while arr.ndim > 2 and arr.shape[0] == 1:
                arr = np.squeeze(arr, axis=0)
            if arr.ndim > 2 and arr.shape[-1] == 1:
                arr = np.squeeze(arr, axis=-1)
            if arr.ndim != 2:
                raise ValueError(f"Expected mask to be 2D after squeezing, got shape {arr.shape}")
            mask = arr.astype(bool)

            rows, cols = np.nonzero(mask)
            if rows.size == 0:
                continue  # empty mask -> no bounding box
            x_min, y_min, x_max, y_max = cols.min(), rows.min(), cols.max(), rows.max()
            if abs(y_max - y_min) < bbox_min_dim or abs(x_max - x_min) < bbox_min_dim:
                continue  # box too small to be useful

            kept_ids.append(object_id)
            batched_object_ids.append((video_id, frame_id, object_id))
            batched_masks.append(mask)
            batched_bboxes.append((x_min, y_min, x_max, y_max))

        # All ordered pairs of surviving objects within this frame.
        frame_pairs.extend(
            (video_id, frame_id, (i, j))
            for i in kept_ids
            for j in kept_ids
            if i != j
        )

    return {
        "object_ids": batched_object_ids,
        "masks": batched_masks,
        "bboxes": batched_bboxes,
        "pairs": frame_pairs,
    }
92
def extract_valid_object_pairs(
    batched_object_ids: Sequence[Tuple[int, int, int]],
    interested_object_pairs: Optional[Iterable[Tuple[int, int]]] = None,
) -> List[Tuple[int, int, Tuple[int, int]]]:
    """
    Filter object pairs per frame.

    When a non-empty `interested_object_pairs` is given, only those (src, dst)
    combinations are emitted, and only for frames where both objects appear.
    Otherwise (None *or* an empty iterable), all ordered permutations (i, j)
    with i != j are emitted per frame.
    """
    # Group object ids by their (video, frame) key.
    per_frame: Dict[Tuple[int, int], set] = defaultdict(set)
    for video_id, frame_id, object_id in batched_object_ids:
        per_frame[(video_id, frame_id)].add(object_id)

    wanted = None if interested_object_pairs is None else list(interested_object_pairs)

    valid_pairs: List[Tuple[int, int, Tuple[int, int]]] = []
    for (video_id, frame_id), objects in per_frame.items():
        if wanted:
            # Truthiness on purpose: an empty list behaves like None.
            valid_pairs.extend(
                (video_id, frame_id, (src, dst))
                for src, dst in wanted
                if src in objects and dst in objects
            )
        else:
            valid_pairs.extend(
                (video_id, frame_id, (src, dst))
                for src in objects
                for dst in objects
                if src != dst
            )

    return valid_pairs
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c91c273c5f61b7f17fc6cc265e14bb78ed134c71d7b54611208420fcbe4f81de
3
+ size 1815491340
vine_model.py ADDED
@@ -0,0 +1,702 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flax import config
2
+ import torch
3
+ from torch import nn
4
+ import torch.nn.functional as F
5
+ import torch.utils.checkpoint as cp
6
+ from transformers import PreTrainedModel, AutoTokenizer, AutoModel, AutoProcessor
7
+ from typing import Dict, List, Tuple, Optional, Any, Union
8
+ import numpy as np
9
+ import os
10
+ import cv2
11
+ from collections import defaultdict
12
+ import builtins
13
+ import sys
14
+ from laser.models import llava_clip_model_v3
15
+ sys.modules["llava_clip_model_v3"] = llava_clip_model_v3
16
+ from safetensors.torch import load_file
17
+
18
+ import inspect
19
+ from transformers.models.clip import modeling_clip
20
+ import transformers
21
+ from huggingface_hub import snapshot_download
22
+
23
+
24
+
25
+
26
+ from .vine_config import VineConfig
27
+ from laser.models.model_utils import (
28
+ extract_single_object,
29
+ extract_object_subject,
30
+ crop_image_contain_bboxes,
31
+ segment_list
32
+ )
33
+ from .flattening import (
34
+ extract_valid_object_pairs,
35
+ flatten_segments_for_batch,
36
+ )
37
+
38
+ from .vis_utils import save_mask_one_image
39
+
40
+ class VineModel(PreTrainedModel):
41
+ """
42
+ VINE (Video Understanding with Natural Language) Model
43
+
44
+ This model processes videos along with categorical, unary, and binary keywords
45
+ to return probability distributions over those keywords for detected objects
46
+ and their relationships in the video.
47
+ """
48
+
49
+ config_class = VineConfig
50
+
51
def __init__(self, config: VineConfig):
    """Build the VINE model.

    Creates a CLIP tokenizer/processor plus three CLIP backbones (one each
    for categorical, unary and binary predicates), then overlays pretrained
    VINE weights from the HuggingFace Hub or from a local checkpoint.

    Args:
        config: VineConfig carrying model names, weight locations and flags.
    """
    super().__init__(config)

    self.config = config
    self.visualize = getattr(config, "visualize", False)
    self.visualization_dir = getattr(config, "visualization_dir", None)
    self.debug_visualizations = getattr(config, "debug_visualizations", False)
    # Fall back to CPU when the config carries no device; the previous
    # getattr without a default raised AttributeError instead.
    self._device = getattr(config, "_device", "cpu")

    # Initialize CLIP components.
    self.clip_tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    if self.clip_tokenizer.pad_token is None:
        # Prefer the unk token as padding; fall back to EOS.
        self.clip_tokenizer.pad_token = (
            self.clip_tokenizer.unk_token
            if self.clip_tokenizer.unk_token
            else self.clip_tokenizer.eos_token
        )
    self.clip_processor = AutoProcessor.from_pretrained(config.model_name)
    self.clip_cate_model = AutoModel.from_pretrained(config.model_name)
    self.clip_unary_model = AutoModel.from_pretrained(config.model_name)
    self.clip_binary_model = AutoModel.from_pretrained(config.model_name)

    # Overlay pretrained VINE weights on top of the base CLIP models.
    if config.use_hf_repo:
        self._load_huggingface_vine_weights(config.model_repo, config.model_file)
    else:
        self._load_local_pretrained_vine_weights(config.local_dir, config.local_filename)

    # Move all sub-modules to the configured device.
    self.to(self._device)
85
def _load_huggingface_vine_weights(self, model_repo: str, model_file: Optional[str] = None):
    """Fetch and load pretrained VINE weights from the HuggingFace Hub.

    NOTE(review): ``model_file`` is passed as the snapshot ``revision``
    (a git ref), not as a filename — confirm this is intentional.

    Returns:
        True when the weights were loaded; False on any failure, in which
        case the base CLIP weights are kept.
    """
    try:
        print(f"Loading VINE weights from HuggingFace repo: {model_repo}")
        repo_path = snapshot_download(model_repo, revision=model_file or "main")
        state_dict = load_file(os.path.join(repo_path, "model.safetensors"))
        # strict=False: tolerate keys present on only one side.
        self.load_state_dict(state_dict, strict=False)
        print("✓ Successfully loaded VINE weights from HuggingFace Hub")
        return True
    except Exception as exc:
        print(f"✗ Error loading VINE weights from HuggingFace Hub: {exc}")
        print("Using base CLIP models instead")
        return False
101
def _load_local_pretrained_vine_weights(self, local_dir: str, local_filename: Optional[str] = None, epoch: int = 0):
    """Load pretrained VINE weights from a local checkpoint.

    Supported on-disk formats:
      * ``.pkl``          — a pickled model-like object whose CLIP sub-models
                            are copied over.
      * ``.pt`` / ``.pth`` — a plain state dict loaded into ``self``.
      * a directory        — containing ``*.{epoch}.model`` ensemble files.

    Args:
        local_dir: Directory (or full path when ``local_filename`` is None).
        local_filename: Optional filename joined onto ``local_dir``.
        epoch: Epoch suffix used to select an ensemble file in a directory.

    Returns:
        True when weights were loaded, False otherwise.
    """
    full_path = os.path.join(local_dir, local_filename) if local_filename else local_dir

    if full_path.endswith(".pkl"):
        print(f"Loading VINE weights from: {full_path}")
        # SECURITY: weights_only=False unpickles arbitrary objects — only
        # load checkpoints from trusted sources.
        loaded_vine_model = torch.load(full_path, map_location=self._device, weights_only=False)
        print(f"Loaded state type: {type(loaded_vine_model)}")
        if not isinstance(loaded_vine_model, dict):
            self._copy_clip_submodels(loaded_vine_model)
            return True
        # NOTE(review): a .pkl holding a plain dict falls through to the
        # "unsupported format" path below — confirm this is intended.

    elif full_path.endswith(".pt") or full_path.endswith(".pth"):
        state = torch.load(full_path, map_location=self._device, weights_only=True)
        print(f"Loaded state type: {type(state)}")
        self.load_state_dict(state)
        return True

    # Directory + epoch ensemble format.
    if os.path.isdir(full_path):
        model_files = [f for f in os.listdir(full_path) if f.endswith(f'.{epoch}.model')]
        if model_files:
            model_file = os.path.join(full_path, model_files[0])
            print(f"Loading VINE weights from: {model_file}")
            # Conversion from a PredicateModel-like object to VineModel:
            # copy only the sub-models the source actually defines.
            pretrained_model = torch.load(model_file, map_location="cpu")
            self._copy_clip_submodels(pretrained_model)
            print("✓ Loaded all sub-model weights from ensemble format")
            return True
        print(f"No model file found for epoch {epoch} in {full_path}")
        return False

    print("Unsupported format for pretrained_vine_path")
    return False

def _copy_clip_submodels(self, source) -> None:
    """Copy CLIP sub-model weights from a loaded model-like object,
    skipping any sub-model the source does not define."""
    for attr in ("clip_cate_model", "clip_unary_model", "clip_binary_model"):
        if hasattr(source, attr):
            getattr(self, attr).load_state_dict(getattr(source, attr).state_dict())
248
@classmethod
def from_pretrained_vine(
    cls,
    model_path: str,
    config: Optional[VineConfig] = None,
    epoch: int = 0,
    **kwargs
):
    """
    Create a VineModel from pretrained VINE weights.

    Args:
        model_path: HF repo id or local file/directory of the weights.
        config: Optional config; a default one is created when None.
        epoch: Epoch number (currently unused by this method — kept for
            interface compatibility).
        **kwargs: Extra arguments forwarded to the constructor.

    Returns:
        VineModel instance with loaded weights.
    """
    # Heuristic: a path containing "/" that does not exist on disk is
    # treated as a HuggingFace repo id; everything else is local.
    looks_like_repo = bool(model_path) and "/" in model_path and not os.path.exists(model_path)

    if config is None:
        if looks_like_repo:
            config = VineConfig(use_hf_repo=True, model_repo=model_path)
        elif os.path.isdir(model_path):
            config = VineConfig(use_hf_repo=False, local_dir=model_path)
        else:
            config = VineConfig(
                use_hf_repo=False,
                local_dir=os.path.dirname(model_path) or None,
                local_filename=os.path.basename(model_path) or None,
            )
    elif looks_like_repo:
        # Rewrite the provided config to point at the requested repo.
        config.use_hf_repo = True
        config.model_repo = model_path
        config.model_file = None
        config.local_dir = None
        config.local_filename = None
    else:
        # Rewrite the provided config to point at the local path.
        config.use_hf_repo = False
        if os.path.isdir(model_path):
            config.local_dir = model_path
            config.local_filename = None
        else:
            config.local_dir = os.path.dirname(model_path) or None
            config.local_filename = os.path.basename(model_path) or None

    # The constructor loads the weights as a side effect.
    return cls(config, **kwargs)
306
+ def _text_features_checkpoint(self, model, tokens):
307
+ """Extract text features with gradient checkpointing."""
308
+ token_keys = list(tokens.keys())
309
+
310
+ def get_text_features_wrapped(*inputs):
311
+ kwargs = {key: value for key, value in zip(token_keys, inputs)}
312
+ return model.get_text_features(**kwargs)
313
+
314
+ token_values = [tokens[key] for key in token_keys]
315
+ return cp.checkpoint(get_text_features_wrapped, *token_values, use_reentrant=False)
316
+
317
+ def _image_features_checkpoint(self, model, images):
318
+ """Extract image features with gradient checkpointing."""
319
+ return cp.checkpoint(model.get_image_features, images, use_reentrant=False)
320
+
321
def clip_sim(self, model, nl_feat, img_feat):
    """Cosine-similarity logits between image and text features.

    Both feature matrices are L2-normalized along the last axis, then
    multiplied (images x texts). When the model exposes a ``logit_scale``
    parameter, the logits are scaled by its exp(), as in CLIP.
    """
    # Plain division (no eps clamp) to match CLIP's own normalization.
    img_unit = img_feat / img_feat.norm(p=2, dim=-1, keepdim=True)
    txt_unit = nl_feat / nl_feat.norm(p=2, dim=-1, keepdim=True)
    logits = img_unit @ txt_unit.T
    if hasattr(model, "logit_scale"):
        logits = logits * model.logit_scale.exp()
    return logits
329
def forward(
    self,
    video_frames: torch.Tensor,
    masks: Dict[int, Dict[int, torch.Tensor]],
    bboxes: Dict[int, Dict[int, List]],
    categorical_keywords: List[str],
    unary_keywords: Optional[List[str]] = None,
    binary_keywords: Optional[List[str]] = None,
    object_pairs: Optional[List[Tuple[int, int]]] = None,
    return_flattened_segments: Optional[bool] = None,
    return_valid_pairs: Optional[bool] = None,
    interested_object_pairs: Optional[List[Tuple[int, int]]] = None,
    debug_visualizations: Optional[bool] = None,
    **kwargs
) -> Dict[str, Any]:
    """
    Forward pass of the VINE model.

    Args:
        video_frames: Tensor of shape (num_frames, height, width, 3).
        masks: Dict mapping frame_id -> object_id -> mask tensor.
        bboxes: Dict mapping frame_id -> object_id -> [x1, y1, x2, y2].
        categorical_keywords: Category names to classify objects against.
        unary_keywords: Optional unary predicates (single-object actions).
        binary_keywords: Optional binary predicates (object relations).
        object_pairs: Optional (obj1_id, obj2_id) pairs for binary scoring.
        return_flattened_segments / return_valid_pairs: Override the
            corresponding config flags when not None.
        interested_object_pairs: Optional subset of pairs to keep when
            computing valid pairs.
        debug_visualizations: Resolved against config but not otherwise
            used by this method.

    Returns:
        Dict with probability maps for categorical, unary and binary
        predictions (plus optional flattened segments / valid pairs).
    """
    # Resolve optional arguments against config-level defaults.
    if unary_keywords is None:
        unary_keywords = []
    if binary_keywords is None:
        binary_keywords = []
    if object_pairs is None:
        object_pairs = []
    if return_flattened_segments is None:
        return_flattened_segments = self.config.return_flattened_segments
    if return_valid_pairs is None:
        return_valid_pairs = self.config.return_valid_pairs
    if interested_object_pairs is None or len(interested_object_pairs) == 0:
        interested_object_pairs = getattr(self.config, "interested_object_pairs", []) or []
    if debug_visualizations is None:
        debug_visualizations = self.debug_visualizations

    # CLIP cannot embed an empty keyword list, so substitute a placeholder
    # that is filtered back out of the results below.
    dummy_str = ""
    categorical_keywords = categorical_keywords or [dummy_str]
    unary_keywords = unary_keywords or [dummy_str]
    binary_keywords = binary_keywords or [dummy_str]

    # One text-feature bank per predicate family.
    categorical_features = self._extract_text_features(self.clip_cate_model, categorical_keywords)
    unary_features = self._extract_text_features(self.clip_unary_model, unary_keywords)
    binary_features = self._extract_text_features(self.clip_binary_model, binary_keywords)

    categorical_probs: Dict = {}
    unary_probs: Dict = {}
    binary_probs: Dict = {}

    # ---- per-object categorical / unary predictions ---------------------
    for frame_id, frame_masks in masks.items():
        if frame_id >= len(video_frames):
            continue  # mask refers to a frame we were not given

        frame_np = self._frame_to_numpy(video_frames[frame_id])
        frame_bboxes = bboxes.get(frame_id, {})

        for obj_id, raw_mask in frame_masks.items():
            if obj_id not in frame_bboxes:
                continue  # no bounding box for this object in this frame

            obj_image = extract_single_object(
                frame_np, self._mask_to_numpy(raw_mask), alpha=self.config.alpha
            )
            obj_features = self._extract_image_features(self.clip_cate_model, obj_image)

            # NOTE(review): categorical results are keyed by object only,
            # so a later frame overwrites earlier frames for the same object.
            cat_probs = F.softmax(
                self.clip_sim(self.clip_cate_model, categorical_features, obj_features),
                dim=-1,
            )
            for idx, keyword in enumerate(categorical_keywords):
                if keyword != dummy_str:
                    categorical_probs[(obj_id, keyword)] = cat_probs[0, idx].item()

            if unary_keywords[0] != dummy_str:
                un_probs = F.softmax(
                    self.clip_sim(self.clip_unary_model, unary_features, obj_features),
                    dim=-1,
                )
                for idx, keyword in enumerate(unary_keywords):
                    if keyword != dummy_str:
                        unary_probs[(frame_id, obj_id, keyword)] = un_probs[0, idx].item()

    # ---- pairwise binary predictions ------------------------------------
    if binary_keywords[0] != dummy_str and object_pairs:
        for obj1_id, obj2_id in object_pairs:
            for frame_id, frame_masks in masks.items():
                if frame_id >= len(video_frames):
                    continue
                frame_bboxes = bboxes.get(frame_id, {})
                if not (obj1_id in frame_masks and obj2_id in frame_masks
                        and obj1_id in frame_bboxes and obj2_id in frame_bboxes):
                    continue

                frame_np = self._frame_to_numpy(video_frames[frame_id])
                mask1_np = self._mask_to_numpy(frame_masks[obj1_id])
                mask2_np = self._mask_to_numpy(frame_masks[obj2_id])

                # Composite image highlighting subject and object together.
                pair_image = extract_object_subject(
                    frame_np, mask1_np[..., None], mask2_np[..., None],
                    alpha=self.config.alpha,
                    white_alpha=self.config.white_alpha
                )

                bbox1 = frame_bboxes[obj1_id]
                bbox2 = frame_bboxes[obj2_id]

                # Skip pairs whose bounding boxes do not overlap at all.
                if bbox1[0] >= bbox2[2] or bbox2[1] >= bbox1[3] or \
                        bbox2[0] >= bbox1[2] or bbox1[1] >= bbox2[3]:
                    continue

                cropped_image = crop_image_contain_bboxes(
                    pair_image, [bbox1, bbox2], f"frame_{frame_id}"
                )
                pair_features = self._extract_image_features(self.clip_binary_model, cropped_image)
                bin_probs = F.softmax(
                    self.clip_sim(self.clip_binary_model, binary_features, pair_features),
                    dim=-1,
                )
                for idx, keyword in enumerate(binary_keywords):
                    if keyword != dummy_str:
                        binary_probs[(frame_id, (obj1_id, obj2_id), keyword)] = bin_probs[0, idx].item()

    # Uniform placeholder probability kept for backward compatibility.
    dummy_prob = 1.0 / max(len(categorical_keywords), len(unary_keywords), len(binary_keywords))

    result: Dict[str, Any] = {
        "categorical_probs": {0: categorical_probs},  # single video -> id 0
        "unary_probs": {0: unary_probs},
        "binary_probs": [binary_probs],  # list format for compatibility
        "dummy_prob": dummy_prob,
    }

    if return_flattened_segments or return_valid_pairs:
        flattened = flatten_segments_for_batch(
            video_id=0,
            segments=masks,
            bbox_min_dim=self.config.bbox_min_dim,
        )
        if return_flattened_segments:
            result["flattened_segments"] = flattened
        if return_valid_pairs:
            wanted = interested_object_pairs or None
            result["valid_pairs"] = extract_valid_object_pairs(flattened["object_ids"], wanted)
            if wanted is None:
                # All pairs were auto-generated rather than user-requested.
                result["valid_pairs_metadata"] = {"pair_source": "all_pairs"}
            else:
                result["valid_pairs_metadata"] = {"pair_source": "filtered", "requested_pairs": wanted}

    return result
533
+ def _frame_to_numpy(self, frame: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
534
+ """Convert a frame tensor/array to a contiguous numpy array."""
535
+ if torch.is_tensor(frame):
536
+ frame_np = frame.detach().cpu().numpy()
537
+ else:
538
+ frame_np = np.asarray(frame)
539
+ return np.ascontiguousarray(frame_np)
540
+
541
+ def _mask_to_numpy(self, mask: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
542
+ """Convert a mask tensor/array to a 2D boolean numpy array."""
543
+ if torch.is_tensor(mask):
544
+ mask_np = mask.detach().cpu().numpy()
545
+ else:
546
+ mask_np = np.asarray(mask)
547
+
548
+ if mask_np.ndim == 3:
549
+ if mask_np.shape[0] == 1:
550
+ mask_np = mask_np.squeeze(0)
551
+ elif mask_np.shape[2] == 1:
552
+ mask_np = mask_np.squeeze(2)
553
+
554
+ if mask_np.ndim != 2:
555
+ raise ValueError(f"Mask must be 2D after squeezing, got shape {mask_np.shape}")
556
+
557
+ return mask_np.astype(bool, copy=False)
558
+
559
def _extract_text_features(self, model, keywords):
    """Tokenize *keywords* and return checkpointed text features from *model*.

    Tokens are padded/truncated to a fixed length of 75 (presumably CLIP's
    77-token context minus special tokens — TODO confirm) and moved to the
    model device before encoding.
    """
    tokens = self.clip_tokenizer(
        keywords,
        return_tensors="pt",
        max_length=75,
        truncation=True,
        padding='max_length',
    ).to(self._device)
    return self._text_features_checkpoint(model, tokens)
571
def _extract_image_features(self, model, image):
    """Preprocess *image* with the CLIP processor and return checkpointed
    image features from *model*.

    NOTE(review): any uint8-coerced 3-channel numpy input is unconditionally
    converted BGR -> RGB, i.e. callers are expected to supply OpenCV-style
    BGR frames — confirm against the frame producers.
    """
    if isinstance(image, np.ndarray):
        if image.dtype != np.uint8:
            image = image.astype(np.uint8)
        if image.ndim == 3 and image.shape[2] == 3:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # CLIP processor handles resize/normalize and returns pixel_values.
    inputs = self.clip_processor(images=image, return_tensors="pt").to(self._device)
    return self._image_features_checkpoint(model, inputs['pixel_values'])
588
# TODO: return masks and bboxes and their corresponding index
def predict(
    self,
    video_frames: torch.Tensor,
    masks: Dict[int, Dict[int, torch.Tensor]],
    bboxes: Dict[int, Dict[int, List]],
    categorical_keywords: List[str],
    unary_keywords: Optional[List[str]] = None,
    binary_keywords: Optional[List[str]] = None,
    object_pairs: Optional[List[Tuple[int, int]]] = None,
    return_top_k: int = 3,
    return_flattened_segments: Optional[bool] = None,
    return_valid_pairs: Optional[bool] = None,
    interested_object_pairs: Optional[List[Tuple[int, int]]] = None,
    debug_visualizations: Optional[bool] = None,
) -> Dict[str, Any]:
    """
    High-level prediction method that returns formatted results.

    Runs ``forward`` under ``torch.no_grad()`` and regroups its raw
    probability maps into per-object / per-frame top-k lists.

    Args:
        video_frames: Tensor of shape (num_frames, height, width, 3).
        masks: Dict mapping frame_id -> object_id -> mask tensor.
        bboxes: Dict mapping frame_id -> object_id -> [x1, y1, x2, y2].
        categorical_keywords: Category names.
        unary_keywords: Optional unary predicates.
        binary_keywords: Optional binary predicates.
        object_pairs: Optional object pairs for binary relations.
        return_top_k: Number of top predictions to keep per key.
        return_flattened_segments: Include flattened mask/bbox lists.
        return_valid_pairs: Compute valid object pairs per frame.
        interested_object_pairs: Optional subset of pairs to track.
        debug_visualizations: Forwarded to ``forward``.

    Returns:
        Dict with formatted top-k predictions and confidence scores.
    """
    with torch.no_grad():
        outputs = self.forward(
            video_frames=video_frames,
            masks=masks,
            bboxes=bboxes,
            categorical_keywords=categorical_keywords,
            unary_keywords=unary_keywords,
            binary_keywords=binary_keywords,
            object_pairs=object_pairs,
            return_flattened_segments=return_flattened_segments,
            return_valid_pairs=return_valid_pairs,
            interested_object_pairs=interested_object_pairs,
            debug_visualizations=debug_visualizations,
        )

    def _top_k(grouped):
        # Sort each group's (prob, label) tuples descending, keep top-k.
        return {key: sorted(vals, reverse=True)[:return_top_k] for key, vals in grouped.items()}

    # Categorical: group by object id.
    cat_grouped = defaultdict(list)
    for (obj_id, category), prob in outputs["categorical_probs"][0].items():
        cat_grouped[obj_id].append((prob, category))
    formatted_categorical = _top_k(cat_grouped)

    # Unary: group by (frame, object).
    unary_grouped = defaultdict(list)
    for (frame_id, obj_id, predicate), prob in outputs["unary_probs"][0].items():
        unary_grouped[(frame_id, obj_id)].append((prob, predicate))
    formatted_unary = _top_k(unary_grouped)

    # Binary: group by (frame, object pair).
    binary_grouped = defaultdict(list)
    if len(outputs["binary_probs"]) > 0:
        for (frame_id, obj_pair, predicate), prob in outputs["binary_probs"][0].items():
            binary_grouped[(frame_id, obj_pair)].append((prob, predicate))
    formatted_binary = _top_k(binary_grouped)

    def _best(preds_by_key):
        # Highest probability across all groups, 0.0 when empty.
        return max(
            (max((p for p, _ in preds), default=0.0) for preds in preds_by_key.values()),
            default=0.0,
        )

    result: Dict[str, Any] = {
        "categorical_predictions": formatted_categorical,
        "unary_predictions": formatted_unary,
        "binary_predictions": formatted_binary,
        "confidence_scores": {
            "categorical": _best(formatted_categorical),
            "unary": _best(formatted_unary),
            "binary": _best(formatted_binary),
        },
    }

    # Pass through optional forward outputs untouched.
    for key in ("flattened_segments", "valid_pairs", "valid_pairs_metadata"):
        if key in outputs:
            result[key] = outputs[key]

    return result
vis_utils.py ADDED
@@ -0,0 +1,941 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import torch
6
+ import random
7
+ import math
8
+ from matplotlib.patches import Rectangle
9
+ import itertools
10
+ from typing import Any, Dict, List, Tuple, Optional, Union
11
+
12
+ from laser.preprocess.mask_generation_grounding_dino import mask_to_bbox
13
+
14
+ ########################################################################################
15
+ ########## Visualization Library ########
16
+ ########################################################################################
17
+ # This module renders SAM masks, GroundingDINO boxes, and VINE predictions.
18
+ #
19
+ # Conventions (RGB frames, pixel coords):
20
+ # - Frames: list[np.ndarray] with shape (H, W, 3) in RGB, or np.ndarray with shape (T, H, W, 3).
21
+ # - Masks: 2D boolean arrays (H, W) or tensors convertible to that; (H, W, 1) is also accepted.
22
+ # - BBoxes: (x1, y1, x2, y2) integer pixel coordinates with x2 > x1 and y2 > y1.
23
+ #
24
+ # Per-frame stores use one of:
25
+ # - Dict[int(frame_id) -> Dict[int(obj_id) -> value]]
26
+ # - List indexed by frame_id (each item may be a dict of obj_id->value or a list in order)
27
+ #
28
+ # Renderer inputs/outputs:
29
+ # 1) render_sam_frames(frames, sam_masks, dino_labels=None) -> List[np.ndarray]
30
+ # - sam_masks: Dict[frame_id, Dict[obj_id, Mask]] or a list; Mask can be np.ndarray or torch.Tensor.
31
+ # - dino_labels: Optional Dict[obj_id, str] to annotate boxes derived from masks.
32
+ #
33
+ # 2) render_dino_frames(frames, bboxes, dino_labels=None) -> List[np.ndarray]
34
+ # - bboxes: Dict[frame_id, Dict[obj_id, Sequence[float]]] or a list; each bbox as [x1, y1, x2, y2].
35
+ #
36
+ # 3) render_vine_frames(frames, bboxes, cat_label_lookup, unary_lookup, binary_lookup, masks=None)
37
+ # -> List[np.ndarray] (the "all" view)
38
+ # - cat_label_lookup: Dict[obj_id, (label: str, prob: float)]
39
+ # - unary_lookup: Dict[frame_id, Dict[obj_id, List[(prob: float, label: str)]]]
40
+ # - binary_lookup: Dict[frame_id, List[((sub_id: int, obj_id: int), List[(prob: float, relation: str)])]]
41
+ # - masks: Optional; same structure as sam_masks, used for translucent overlays when unary labels exist.
42
+ #
43
+ # Ground-truth helpers used by plotting utilities:
44
+ # - For a single frame, gt_relations is represented as List[(subject_label, object_label, relation_label)].
45
+ #
46
+ # All rendered frames returned by functions are RGB np.ndarray images suitable for saving or video writing.
47
+ ########################################################################################
48
+
49
def clean_label(label):
    """Normalize a label by mapping underscores and slashes to spaces."""
    return label.translate(str.maketrans({"_": " ", "/": " "}))
52
+
53
+ # Should be performed somewhere else I believe
54
def format_cate_preds(cate_preds):
    """Group (obj_id, label) -> prob model outputs per object.

    Returns a dict obj_id -> [(cleaned_label, prob), ...] sorted by
    probability, highest first.
    """
    grouped = {}
    for (oid, label), prob in cate_preds.items():
        # Normalize the predicted label text before grouping.
        grouped.setdefault(oid, []).append((clean_label(label), prob))
    for preds in grouped.values():
        preds.sort(key=lambda pair: pair[1], reverse=True)
    return grouped
66
+
67
def format_binary_cate_preds(binary_preds):
    """Flatten binary predictions into (frame_id, subj, obj, relation, score) tuples.

    Keys are expected as (frame_id, (subject, object), predicted_relation);
    malformed keys are reported and skipped. The result is sorted by score,
    highest first.

    BUG FIX: the list was previously sorted by index 3 (the relation string)
    instead of index 4 (the confidence score). Also narrowed the broad
    ``except Exception`` to the unpacking errors that can actually occur.
    """
    frame_binary_preds = []
    for key, score in binary_preds.items():
        # Expect key format: (frame_id, (subject, object), predicted_relation)
        try:
            f_id, (subj, obj), pred_rel = key
        except (TypeError, ValueError):
            print("Skipping key with unexpected format:", key)
            continue
        frame_binary_preds.append((f_id, subj, obj, pred_rel, score))
    # Sort by confidence score, descending.
    frame_binary_preds.sort(key=lambda x: x[4], reverse=True)
    return frame_binary_preds
79
+
80
# Shared OpenCV font used by every text-drawing helper in this module.
_FONT = cv2.FONT_HERSHEY_SIMPLEX
81
+
82
+
83
+ def _to_numpy_mask(mask: Union[np.ndarray, torch.Tensor, None]) -> Optional[np.ndarray]:
84
+ if mask is None:
85
+ return None
86
+ if isinstance(mask, torch.Tensor):
87
+ mask_np = mask.detach().cpu().numpy()
88
+ else:
89
+ mask_np = np.asarray(mask)
90
+ if mask_np.ndim == 0:
91
+ return None
92
+ if mask_np.ndim == 3:
93
+ mask_np = np.squeeze(mask_np)
94
+ if mask_np.ndim != 2:
95
+ return None
96
+ if mask_np.dtype == bool:
97
+ return mask_np
98
+ return mask_np > 0
99
+
100
+
101
+ def _sanitize_bbox(bbox: Union[List[float], Tuple[float, ...], None], width: int, height: int) -> Optional[Tuple[int, int, int, int]]:
102
+ if bbox is None:
103
+ return None
104
+ if isinstance(bbox, (list, tuple)) and len(bbox) >= 4:
105
+ x1, y1, x2, y2 = [float(b) for b in bbox[:4]]
106
+ elif isinstance(bbox, np.ndarray) and bbox.size >= 4:
107
+ x1, y1, x2, y2 = [float(b) for b in bbox.flat[:4]]
108
+ else:
109
+ return None
110
+ x1 = int(np.clip(round(x1), 0, width - 1))
111
+ y1 = int(np.clip(round(y1), 0, height - 1))
112
+ x2 = int(np.clip(round(x2), 0, width - 1))
113
+ y2 = int(np.clip(round(y2), 0, height - 1))
114
+ if x2 <= x1 or y2 <= y1:
115
+ return None
116
+ return (x1, y1, x2, y2)
117
+
118
+
119
def _object_color_bgr(obj_id: int) -> Tuple[int, int, int]:
    """Return a stable per-object color in BGR order for OpenCV drawing.

    Delegates to ``get_color`` (defined elsewhere in the project); assumes it
    yields float channels in [0, 1] — TODO confirm against its definition.
    """
    color = get_color(obj_id)
    rgb = [int(np.clip(c, 0.0, 1.0) * 255) for c in color[:3]]
    # Swap RGB -> BGR for OpenCV.
    return (rgb[2], rgb[1], rgb[0])
123
+
124
+
125
+ def _background_color(color: Tuple[int, int, int]) -> Tuple[int, int, int]:
126
+ return tuple(int(0.25 * 255 + 0.75 * channel) for channel in color)
127
+
128
+
129
def _draw_label_block(
    image: np.ndarray,
    lines: List[str],
    anchor: Tuple[int, int],
    color: Tuple[int, int, int],
    font_scale: float = 0.5,
    thickness: int = 1,
    direction: str = "up",
) -> None:
    """Draw a stack of text lines with filled backgrounds onto ``image`` in place.

    The block grows upward from ``anchor`` by default, or downward when
    ``direction == "down"``. All coordinates are clamped to the image bounds;
    ``color`` is the object's BGR color, lightened for the text background.
    """
    if not lines:
        return
    img_h, img_w = image.shape[:2]
    x, y = anchor
    x = int(np.clip(x, 0, img_w - 1))
    y_cursor = int(np.clip(y, 0, img_h - 1))
    bg_color = _background_color(color)

    if direction == "down":
        # Stack lines below the anchor, advancing the cursor after each row.
        for text in lines:
            text = str(text)
            (tw, th), baseline = cv2.getTextSize(text, _FONT, font_scale, thickness)
            left_x = x
            right_x = min(left_x + tw + 8, img_w - 1)
            top_y = int(np.clip(y_cursor + 6, 0, img_h - 1))
            bottom_y = int(np.clip(top_y + th + baseline + 6, 0, img_h - 1))
            if bottom_y <= top_y:
                # Ran off the bottom of the image; stop drawing further lines.
                break
            cv2.rectangle(image, (left_x, top_y), (right_x, bottom_y), bg_color, -1)
            text_x = left_x + 4
            text_y = min(bottom_y - baseline - 2, img_h - 1)
            cv2.putText(image, text, (text_x, text_y), _FONT, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)
            y_cursor = bottom_y
    else:
        # Default: stack lines above the anchor, moving the cursor upward.
        for text in lines:
            text = str(text)
            (tw, th), baseline = cv2.getTextSize(text, _FONT, font_scale, thickness)
            top_y = max(y_cursor - th - baseline - 6, 0)
            left_x = x
            right_x = min(left_x + tw + 8, img_w - 1)
            bottom_y = min(top_y + th + baseline + 6, img_h - 1)
            cv2.rectangle(image, (left_x, top_y), (right_x, bottom_y), bg_color, -1)
            text_x = left_x + 4
            text_y = min(bottom_y - baseline - 2, img_h - 1)
            cv2.putText(image, text, (text_x, text_y), _FONT, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)
            y_cursor = top_y
174
+
175
+
176
def _draw_centered_label(
    image: np.ndarray,
    text: str,
    center: Tuple[int, int],
    color: Tuple[int, int, int],
    font_scale: float = 0.5,
    thickness: int = 1,
) -> None:
    """Draw a single text label centered at ``center`` with a filled background.

    Used for relation labels placed at the midpoint of a line between two
    boxes. Coordinates are clamped so the label stays within the image.
    """
    text = str(text)
    img_h, img_w = image.shape[:2]
    (tw, th), baseline = cv2.getTextSize(text, _FONT, font_scale, thickness)
    cx = int(np.clip(center[0], 0, img_w - 1))
    cy = int(np.clip(center[1], 0, img_h - 1))
    # Background rectangle centered on (cx, cy) with a small padding margin.
    left_x = int(np.clip(cx - tw // 2 - 4, 0, img_w - 1))
    top_y = int(np.clip(cy - th // 2 - baseline - 4, 0, img_h - 1))
    right_x = int(np.clip(left_x + tw + 8, 0, img_w - 1))
    bottom_y = int(np.clip(top_y + th + baseline + 6, 0, img_h - 1))
    cv2.rectangle(image, (left_x, top_y), (right_x, bottom_y), _background_color(color), -1)
    text_x = left_x + 4
    text_y = min(bottom_y - baseline - 2, img_h - 1)
    cv2.putText(image, text, (text_x, text_y), _FONT, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)
197
+
198
+
199
+ def _extract_frame_entities(store: Union[Dict[int, Dict[int, Any]], List, None], frame_idx: int) -> Dict[int, Any]:
200
+ if isinstance(store, dict):
201
+ frame_entry = store.get(frame_idx, {})
202
+ elif isinstance(store, list) and 0 <= frame_idx < len(store):
203
+ frame_entry = store[frame_idx]
204
+ else:
205
+ frame_entry = {}
206
+ if isinstance(frame_entry, dict):
207
+ return frame_entry
208
+ if isinstance(frame_entry, list):
209
+ return {i: value for i, value in enumerate(frame_entry)}
210
+ return {}
211
+
212
+
213
+ def _label_anchor_and_direction(
214
+ bbox: Tuple[int, int, int, int],
215
+ position: str,
216
+ ) -> Tuple[Tuple[int, int], str]:
217
+ x1, y1, x2, y2 = bbox
218
+ if position == "bottom":
219
+ return (x1, y2), "down"
220
+ return (x1, y1), "up"
221
+
222
+
223
def _draw_bbox_with_label(
    image: np.ndarray,
    bbox: Tuple[int, int, int, int],
    obj_id: int,
    title: Optional[str] = None,
    sub_lines: Optional[List[str]] = None,
    label_position: str = "top",
) -> None:
    """Draw a colored bbox on ``image`` in place with a "#<id> <title>" header.

    Extra ``sub_lines`` are stacked with the header; ``label_position``
    selects whether the label block sits above ("top") or below ("bottom")
    the box.
    """
    color = _object_color_bgr(obj_id)
    cv2.rectangle(image, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)
    head = title if title else f"#{obj_id}"
    # Prepend the object id unless the caller's title already carries a "#" prefix.
    if not head.startswith("#"):
        head = f"#{obj_id} {head}"
    lines = [head]
    if sub_lines:
        lines.extend(sub_lines)
    anchor, direction = _label_anchor_and_direction(bbox, label_position)
    _draw_label_block(image, lines, anchor, color, direction=direction)
241
+
242
+
243
def render_sam_frames(
    frames: Union[np.ndarray, List[np.ndarray]],
    sam_masks: Union[Dict[int, Dict[int, Union[np.ndarray, torch.Tensor]]], List, None],
    dino_labels: Optional[Dict[int, str]] = None,
) -> List[np.ndarray]:
    """Overlay SAM masks (and mask-derived boxes) on each frame.

    Two passes per frame: first every mask is alpha-blended onto the image,
    then boxes/labels are drawn so they are not washed out by the blending.
    Frames that are None are skipped, so the output can be shorter than the
    input. Returns new RGB frames; inputs are not modified.
    """
    results: List[np.ndarray] = []
    frames_iterable = frames if isinstance(frames, list) else list(frames)
    dino_labels = dino_labels or {}

    for frame_idx, frame in enumerate(frames_iterable):
        if frame is None:
            continue
        frame_rgb = np.asarray(frame)
        frame_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
        # Blend in float to avoid uint8 clipping artifacts during accumulation.
        overlay = frame_bgr.astype(np.float32)
        masks_for_frame = _extract_frame_entities(sam_masks, frame_idx)

        # Pass 1: translucent mask fills.
        for obj_id, mask in masks_for_frame.items():
            mask_np = _to_numpy_mask(mask)
            if mask_np is None or not np.any(mask_np):
                continue
            color = _object_color_bgr(obj_id)
            alpha = 0.45
            overlay[mask_np] = (1.0 - alpha) * overlay[mask_np] + alpha * np.array(color, dtype=np.float32)

        annotated = np.clip(overlay, 0, 255).astype(np.uint8)
        frame_h, frame_w = annotated.shape[:2]

        # Pass 2: boxes derived from the masks, plus optional DINO labels.
        for obj_id, mask in masks_for_frame.items():
            mask_np = _to_numpy_mask(mask)
            if mask_np is None or not np.any(mask_np):
                continue
            bbox = mask_to_bbox(mask_np)
            bbox = _sanitize_bbox(bbox, frame_w, frame_h)
            if not bbox:
                continue
            label = dino_labels.get(obj_id)
            title = f"{label}" if label else None
            _draw_bbox_with_label(annotated, bbox, obj_id, title=title)

        results.append(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))

    return results
286
+
287
+
288
def render_dino_frames(
    frames: Union[np.ndarray, List[np.ndarray]],
    bboxes: Union[Dict[int, Dict[int, Union[List[float], np.ndarray]]], List, None],
    dino_labels: Optional[Dict[int, str]] = None,
) -> List[np.ndarray]:
    """Draw GroundingDINO boxes (and optional labels) onto each frame.

    Frames that are None are skipped. Returns new RGB frames; inputs are
    not modified.
    """
    labels = dino_labels or {}
    frame_list = frames if isinstance(frames, list) else list(frames)
    rendered: List[np.ndarray] = []

    for idx, raw_frame in enumerate(frame_list):
        if raw_frame is None:
            continue
        canvas = cv2.cvtColor(np.asarray(raw_frame), cv2.COLOR_RGB2BGR)
        canvas_h, canvas_w = canvas.shape[:2]

        for obj_id, raw_bbox in _extract_frame_entities(bboxes, idx).items():
            box = _sanitize_bbox(raw_bbox, canvas_w, canvas_h)
            if not box:
                continue
            name = labels.get(obj_id)
            _draw_bbox_with_label(canvas, box, obj_id, title=f"{name}" if name else None)

        rendered.append(cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB))

    return rendered
316
+
317
+
318
def render_vine_frame_sets(
    frames: Union[np.ndarray, List[np.ndarray]],
    bboxes: Union[Dict[int, Dict[int, Union[List[float], np.ndarray]]], List, None],
    cat_label_lookup: Dict[int, Tuple[str, float]],
    unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]],
    binary_lookup: Dict[int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]],
    masks: Union[Dict[int, Dict[int, Union[np.ndarray, torch.Tensor]]], List, None] = None,
) -> Dict[str, List[np.ndarray]]:
    """Render four parallel views of VINE predictions for every frame.

    Returns a dict with keys "object" (boxes + category labels), "unary"
    (plus per-object unary labels and mask overlays), "binary" (plus
    relation lines between object pairs), and "all" (everything combined).
    Each value is a list of RGB frames. Frames that are None are skipped.
    """
    frame_groups: Dict[str, List[np.ndarray]] = {
        "object": [],
        "unary": [],
        "binary": [],
        "all": [],
    }
    frames_iterable = frames if isinstance(frames, list) else list(frames)

    for frame_idx, frame in enumerate(frames_iterable):
        if frame is None:
            continue
        frame_rgb = np.asarray(frame)
        base_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
        frame_h, frame_w = base_bgr.shape[:2]
        frame_bboxes = _extract_frame_entities(bboxes, frame_idx)
        frame_masks = _extract_frame_entities(masks, frame_idx) if masks is not None else {}

        # One independent canvas per view.
        objects_bgr = base_bgr.copy()
        unary_bgr = base_bgr.copy()
        binary_bgr = base_bgr.copy()
        all_bgr = base_bgr.copy()

        bbox_lookup: Dict[int, Tuple[int, int, int, int]] = {}
        unary_lines_lookup: Dict[int, List[str]] = {}
        titles_lookup: Dict[int, Optional[str]] = {}

        # Stage 1: collect sanitized boxes, category titles and unary label
        # strings per object (no drawing yet).
        for obj_id, bbox_values in frame_bboxes.items():
            bbox = _sanitize_bbox(bbox_values, frame_w, frame_h)
            if not bbox:
                continue
            bbox_lookup[obj_id] = bbox
            cat_label, cat_prob = cat_label_lookup.get(obj_id, (None, None))
            title_parts = []
            if cat_label:
                if cat_prob is not None:
                    title_parts.append(f"{cat_label} {cat_prob:.2f}")
                else:
                    title_parts.append(cat_label)
            titles_lookup[obj_id] = " ".join(title_parts) if title_parts else None
            unary_preds = unary_lookup.get(frame_idx, {}).get(obj_id, [])
            unary_lines = [f"{label} {prob:.2f}" for prob, label in unary_preds]
            unary_lines_lookup[obj_id] = unary_lines

        # Stage 2: translucent mask overlays, only for objects that have
        # unary predictions, on the "unary" and "all" views.
        for obj_id, bbox in bbox_lookup.items():
            unary_lines = unary_lines_lookup.get(obj_id, [])
            if not unary_lines:
                continue
            mask_raw = frame_masks.get(obj_id)
            mask_np = _to_numpy_mask(mask_raw)
            if mask_np is None or not np.any(mask_np):
                continue
            color = np.array(_object_color_bgr(obj_id), dtype=np.float32)
            alpha = 0.45
            for target in (unary_bgr, all_bgr):
                target_vals = target[mask_np].astype(np.float32)
                blended = (1.0 - alpha) * target_vals + alpha * color
                target[mask_np] = np.clip(blended, 0, 255).astype(np.uint8)

        # Stage 3: boxes + labels on every view (after blending so text
        # stays crisp); unary label blocks hang below the box.
        for obj_id, bbox in bbox_lookup.items():
            title = titles_lookup.get(obj_id)
            unary_lines = unary_lines_lookup.get(obj_id, [])
            _draw_bbox_with_label(objects_bgr, bbox, obj_id, title=title, label_position="top")
            _draw_bbox_with_label(unary_bgr, bbox, obj_id, title=title, label_position="top")
            if unary_lines:
                anchor, direction = _label_anchor_and_direction(bbox, "bottom")
                _draw_label_block(unary_bgr, unary_lines, anchor, _object_color_bgr(obj_id), direction=direction)
            _draw_bbox_with_label(binary_bgr, bbox, obj_id, title=title, label_position="top")
            _draw_bbox_with_label(all_bgr, bbox, obj_id, title=title, label_position="top")
            if unary_lines:
                anchor, direction = _label_anchor_and_direction(bbox, "bottom")
                _draw_label_block(all_bgr, unary_lines, anchor, _object_color_bgr(obj_id), direction=direction)

        # Stage 4: relation lines between subject/object pairs, labelled with
        # the top-1 relation, on the "binary" and "all" views.
        for obj_pair, relation_preds in binary_lookup.get(frame_idx, []):
            if len(obj_pair) != 2 or not relation_preds:
                continue
            subj_id, obj_id = obj_pair
            subj_bbox = bbox_lookup.get(subj_id)
            obj_bbox = bbox_lookup.get(obj_id)
            if not subj_bbox or not obj_bbox:
                continue
            start, end = relation_line(subj_bbox, obj_bbox)
            # Line color: average of the two objects' colors.
            color = tuple(int(c) for c in np.clip(
                (np.array(_object_color_bgr(subj_id), dtype=np.float32) +
                 np.array(_object_color_bgr(obj_id), dtype=np.float32)) / 2.0,
                0, 255
            ))
            prob, relation = relation_preds[0]
            label_text = f"{relation} {prob:.2f}"
            mid_point = (int((start[0] + end[0]) / 2), int((start[1] + end[1]) / 2))
            cv2.line(binary_bgr, start, end, color, 6, cv2.LINE_AA)
            cv2.line(all_bgr, start, end, color, 6, cv2.LINE_AA)
            _draw_centered_label(binary_bgr, label_text, mid_point, color)
            _draw_centered_label(all_bgr, label_text, mid_point, color)

        frame_groups["object"].append(cv2.cvtColor(objects_bgr, cv2.COLOR_BGR2RGB))
        frame_groups["unary"].append(cv2.cvtColor(unary_bgr, cv2.COLOR_BGR2RGB))
        frame_groups["binary"].append(cv2.cvtColor(binary_bgr, cv2.COLOR_BGR2RGB))
        frame_groups["all"].append(cv2.cvtColor(all_bgr, cv2.COLOR_BGR2RGB))

    return frame_groups
426
+
427
+
428
def render_vine_frames(
    frames: Union[np.ndarray, List[np.ndarray]],
    bboxes: Union[Dict[int, Dict[int, Union[List[float], np.ndarray]]], List, None],
    cat_label_lookup: Dict[int, Tuple[str, float]],
    unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]],
    binary_lookup: Dict[int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]],
    masks: Union[Dict[int, Dict[int, Union[np.ndarray, torch.Tensor]]], List, None] = None,
) -> List[np.ndarray]:
    """Convenience wrapper returning only the combined ("all") VINE view."""
    frame_groups = render_vine_frame_sets(
        frames,
        bboxes,
        cat_label_lookup,
        unary_lookup,
        binary_lookup,
        masks,
    )
    return frame_groups.get("all", [])
444
+
445
def color_for_cate_correctness(obj_pred_dict, gt_labels, topk_object):
    """Choose a box color and caption per object from prediction correctness.

    Green = top-1 matches ground truth, orange = ground truth appears within
    the top-k predictions, red = miss or no prediction. Returns parallel
    lists of BGR colors and "ID:<id>/P:<pred>/GT:<gt>" strings.
    """
    all_colors = []
    all_texts = []
    for obj_id, _bbox, gt_label in gt_labels:
        preds = obj_pred_dict.get(obj_id, [])
        if not preds:
            top1 = "N/A"
            box_color = (0, 0, 255)  # bright red if no prediction
        else:
            top1 = preds[0][0]
            gt_lower = gt_label.lower()
            topk_lower = [label.lower() for label, _ in preds[:topk_object]]
            if top1.lower() == gt_lower:
                box_color = (0, 255, 0)  # bright green for correct
            elif gt_lower in topk_lower:
                box_color = (0, 165, 255)  # bright orange for partial match
            else:
                box_color = (0, 0, 255)  # bright red for incorrect
        all_colors.append(box_color)
        all_texts.append(f"ID:{obj_id}/P:{top1}/GT:{gt_label}")
    return all_colors, all_texts
468
+
469
def plot_unary(frame_img, gt_labels, all_colors, all_texts):
    """Draw one colored box plus caption banner per object onto ``frame_img``.

    Modifies and returns ``frame_img`` (BGR). ``gt_labels`` supplies the
    (obj_id, bbox, gt_label) triples; colors/texts come from
    ``color_for_cate_correctness`` and are index-aligned.
    """
    for (obj_id, bbox, gt_label), box_color, label_text in zip(gt_labels, all_colors, all_texts):
        x1, y1, x2, y2 = (int(coord) for coord in bbox)
        cv2.rectangle(frame_img, (x1, y1), (x2, y2), color=box_color, thickness=2)
        # Filled banner sized to the caption, drawn just above the box.
        (tw, th), baseline = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        cv2.rectangle(frame_img, (x1, y1 - th - baseline - 4), (x1 + tw, y1), box_color, -1)
        cv2.putText(frame_img, label_text, (x1, y1 - 2), cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, (0, 0, 0), 1, cv2.LINE_AA)

    return frame_img
480
+
481
def get_white_pane(pane_height,
                   pane_width=600,
                   header_height=50,
                   header_font=cv2.FONT_HERSHEY_SIMPLEX,
                   header_font_scale=0.7,
                   header_thickness=2,
                   header_color=(0, 0, 0)):
    """Create a white side pane with "Binary Predictions" / "Ground Truth" headers.

    The left (predictions) column takes 60% of the width, the right 40%.

    BUG FIX: the headers were previously drawn onto ``.copy()`` slices that
    were then discarded, so the returned pane never contained any text. They
    are now drawn directly onto the returned pane, with the right-hand header
    offset by the column split.
    """
    white_pane = 255 * np.ones((pane_height, pane_width, 3), dtype=np.uint8)

    # Predictions column is wider (60% vs. 40%).
    left_width = int(pane_width * 0.6)

    cv2.putText(white_pane, "Binary Predictions", (10, header_height - 30),
                header_font, header_font_scale, header_color, header_thickness, cv2.LINE_AA)
    cv2.putText(white_pane, "Ground Truth", (left_width + 10, header_height - 30),
                header_font, header_font_scale, header_color, header_thickness, cv2.LINE_AA)

    return white_pane
503
+
504
+ # This is for ploting binary prediction results with frame-based scene graphs
505
def plot_binary_sg(frame_img,
                   white_pane,
                   bin_preds,
                   gt_relations,
                   topk_binary,
                   header_height=50,
                   indicator_size=20,
                   pane_width=600):
    """Render top-k binary predictions (left) and ground truth (right) beside the frame.

    ``bin_preds`` comes from ``format_binary_cate_preds`` as
    (frame_id, subj, obj, relation, score) 5-tuples; legacy 4-tuples
    (subj, relation, obj, score) are still accepted. ``gt_relations`` are
    (subject, object, relation) triples. Returns the frame and text pane
    stacked horizontally.

    BUG FIX: the loop previously unpacked 4-tuples and raised ValueError on
    the 5-tuples actually produced by ``format_binary_cate_preds``.
    """
    line_height = 30  # vertical spacing per line
    x_text = 10  # left margin for text
    y_text_left = header_height + 10  # starting y for left pane text
    y_text_right = header_height + 10  # starting y for right pane text

    # Left section: top-k binary predictions.
    left_width = int(pane_width * 0.6)
    left_pane = white_pane[:, :left_width, :].copy()
    right_pane = white_pane[:, left_width:, :].copy()

    for pred in bin_preds[:topk_binary]:
        if len(pred) == 5:
            _f_id, subj, obj, pred_rel, score = pred
        else:
            subj, pred_rel, obj, score = pred
        # Green indicator when the triplet matches any ground-truth relation.
        correct = any((subj == gt[0] and pred_rel.lower() == gt[2].lower() and obj == gt[1])
                      for gt in gt_relations)
        indicator_color = (0, 255, 0) if correct else (0, 0, 255)
        cv2.rectangle(left_pane, (x_text, y_text_left - indicator_size + 5),
                      (x_text + indicator_size, y_text_left + 5), indicator_color, -1)
        text = f"{subj} - {pred_rel} - {obj} :: {score:.2f}"
        cv2.putText(left_pane, text, (x_text + indicator_size + 5, y_text_left + 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 1, cv2.LINE_AA)
        y_text_left += line_height

    # Right section: ground truth binary relations.
    for gt in gt_relations:
        if len(gt) != 3:
            continue
        text = f"{gt[0]} - {gt[2]} - {gt[1]}"
        cv2.putText(right_pane, text, (x_text, y_text_right + 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 1, cv2.LINE_AA)
        y_text_right += line_height

    # Combine the two text panes and then with the frame image.
    combined_pane = np.hstack((left_pane, right_pane))
    combined_image = np.hstack((frame_img, combined_pane))
    return combined_image
549
+
550
def visualized_frame(frame_img,
                     bboxes,
                     object_ids,
                     gt_labels,
                     cate_preds,
                     binary_preds,
                     gt_relations,
                     topk_object,
                     topk_binary,
                     phase="unary"):
    """Return the combined annotated frame as an image (in BGR).

    ``phase == "unary"`` draws per-object boxes colored by prediction
    correctness; otherwise a binary-relation text pane is appended.

    BUG FIX (unary phase): the (obj_id, bbox, gt_label) triples assembled in
    ``objs`` were built but never used; the raw ``gt_labels`` strings were
    passed downstream, which does not match the triple shape expected by
    ``color_for_cate_correctness`` and ``plot_unary``.
    """
    if phase == "unary":
        # --- Process Object Predictions (for overlaying bboxes) ---
        objs = []
        for ((_, f_id, obj_id), bbox, gt_label) in zip(object_ids, bboxes, gt_labels):
            objs.append((obj_id, bbox, clean_label(gt_label)))

        formatted_cate_preds = format_cate_preds(cate_preds)
        all_colors, all_texts = color_for_cate_correctness(formatted_cate_preds, objs, topk_object)
        return plot_unary(frame_img, objs, all_colors, all_texts)

    # --- Process Binary Predictions & Ground Truth for the Text Pane ---
    formatted_binary_preds = format_binary_cate_preds(binary_preds)

    # Clean ground truth relations for display/comparison.
    gt_relations = [(clean_label(str(s)), clean_label(str(o)), clean_label(rel))
                    for s, o, rel in gt_relations]

    pane_width = 600  # widened pane for more horizontal space
    pane_height = frame_img.shape[0]
    header_height = 50  # extra room for the column headers
    white_pane = get_white_pane(pane_height, pane_width, header_height=header_height)

    return plot_binary_sg(frame_img, white_pane, formatted_binary_preds, gt_relations, topk_binary)
594
+
595
def show_mask(mask, ax, obj_id=None, det_class=None, random_color=False):
    """Overlay a single translucent object mask on a matplotlib axis.

    Accepts (H, W), (1, H, W) or (H, W, 1) masks. When ``det_class`` is
    given, a bounding box around the mask's nonzero area is drawn with the
    class text. Colors are deterministic per ``obj_id`` unless
    ``random_color`` is set.
    """
    # Ensure mask is a numpy array
    mask = np.array(mask)
    # Handle different mask shapes
    if mask.ndim == 3:
        # (1, H, W) -> (H, W)
        if mask.shape[0] == 1:
            mask = mask.squeeze(0)
        # (H, W, 1) -> (H, W)
        elif mask.shape[2] == 1:
            mask = mask.squeeze(2)
    # Now mask should be (H, W)
    assert mask.ndim == 2, f"Mask must be 2D after squeezing, got shape {mask.shape}"

    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.8])], axis=0)
    else:
        # Deterministic color: stride the rainbow colormap by obj_id.
        cmap = plt.get_cmap("gist_rainbow")
        cmap_idx = 0 if obj_id is None else obj_id
        color = list(cmap((cmap_idx * 47) % 256))
        color[3] = 0.5
        color = np.array(color)

    # Expand mask to (H, W, 1) for broadcasting
    mask_expanded = mask[..., None]
    mask_image = mask_expanded * color.reshape(1, 1, -1)

    # draw a box around the mask with the det_class as the label
    if not det_class is None:
        # Find the bounding box coordinates
        y_indices, x_indices = np.where(mask > 0)
        if y_indices.size > 0 and x_indices.size > 0:
            x_min, x_max = x_indices.min(), x_indices.max()
            y_min, y_max = y_indices.min(), y_indices.max()
            rect = Rectangle(
                (x_min, y_min),
                x_max - x_min,
                y_max - y_min,
                linewidth=1.5,
                edgecolor=color[:3],
                facecolor="none",
                alpha=color[3]
            )
            ax.add_patch(rect)
            ax.text(
                x_min,
                y_min - 5,
                f"{det_class}",
                color="white",
                fontsize=6,
                backgroundcolor=np.array(color),
                alpha=1
            )
    ax.imshow(mask_image)
649
+
650
def save_mask_one_image(frame_image, masks, save_path):
    """Render masks on top of a frame and store the visualization on disk.

    ``frame_image`` may be a tensor or array-like; ``masks`` may be a dict
    of obj_id -> mask or a list (index used as obj_id). Returns
    ``save_path`` after writing the figure and closing it.
    """
    fig, ax = plt.subplots(1, figsize=(6, 6))

    # Move tensors off-device and into numpy for matplotlib.
    frame_np = (
        frame_image.detach().cpu().numpy()
        if torch.is_tensor(frame_image)
        else np.asarray(frame_image)
    )
    frame_np = np.ascontiguousarray(frame_np)

    if isinstance(masks, dict):
        mask_iter = masks.items()
    else:
        # Lists are treated as implicitly keyed by index.
        mask_iter = enumerate(masks)

    prepared_masks = {
        obj_id: (
            mask.detach().cpu().numpy()
            if torch.is_tensor(mask)
            else np.asarray(mask)
        )
        for obj_id, mask in mask_iter
    }

    ax.imshow(frame_np)
    ax.axis("off")

    for obj_id, mask_np in prepared_masks.items():
        show_mask(mask_np, ax, obj_id=obj_id, det_class=None, random_color=False)

    fig.savefig(save_path, bbox_inches="tight", pad_inches=0)
    plt.close(fig)
    return save_path
684
+
685
def get_video_masks_visualization(video_tensor,
                                  video_masks,
                                  video_id,
                                  video_save_base_dir,
                                  oid_class_pred=None,
                                  sample_rate=1):
    """Save a per-frame mask visualization for a video under <base_dir>/<video_id>/.

    BUG FIX: the figure produced for each frame was never written to the
    computed ``save_path`` (and never closed, leaking matplotlib figures).
    Frames are now saved and closed. The previously-unused ``sample_rate``
    argument now subsamples frames (every Nth frame); the default of 1
    keeps the old every-frame behavior.
    """
    video_save_dir = os.path.join(video_save_base_dir, video_id)
    os.makedirs(video_save_dir, exist_ok=True)

    for frame_id, image in enumerate(video_tensor):
        if sample_rate > 1 and frame_id % sample_rate != 0:
            continue
        if frame_id not in video_masks:
            print("No mask for Frame", frame_id)
            continue

        masks = video_masks[frame_id]
        save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
        fig, _ax = get_mask_one_image(image, masks, oid_class_pred)
        fig.savefig(save_path, bbox_inches="tight", pad_inches=0)
        plt.close(fig)
704
+
705
def get_mask_one_image(frame_image, masks, oid_class_pred=None):
    """Build a matplotlib figure overlaying each object's mask on the frame.

    Returns (fig, ax); the caller is responsible for saving and closing the
    figure. When ``oid_class_pred`` is given, each mask is labelled with
    "<obj_id>. <predicted class>".
    """
    fig, ax = plt.subplots(1, figsize=(6, 6))
    ax.imshow(frame_image)
    ax.axis('off')

    # Accept a plain list of masks by treating the index as the object id.
    if type(masks) == list:
        masks = dict(enumerate(masks))

    for obj_id, mask in masks.items():
        det_class = None if oid_class_pred is None else f"{obj_id}. {oid_class_pred[obj_id]}"
        show_mask(mask, ax, obj_id=obj_id, det_class=det_class, random_color=False)

    return fig, ax
723
+
724
def save_video(frames, output_filename, output_fps):
    """Write a sequence of frames to ``output_filename`` as an H.264 video.

    BUG FIXES: the frame size was read as ``frames.shape[:2]``, which for a
    stacked (T, H, W, 3) array is (T, H) rather than (H, W) and fails
    entirely for lists; and the loop called an undefined
    ``get_visualized_frame``. The function now measures the first frame and
    writes the provided frames directly, converting RGB -> BGR for OpenCV
    (this module's renderers return RGB frames — confirm if callers pass
    BGR).
    """
    num_frames = len(frames)
    if num_frames == 0:
        print("No frames to write; skipping", output_filename)
        return
    frame_h, frame_w = np.asarray(frames[0]).shape[:2]

    # Use a codec supported by VS Code (H.264 via 'avc1').
    fourcc = cv2.VideoWriter_fourcc(*'avc1')
    out = cv2.VideoWriter(output_filename, fourcc, output_fps, (frame_w, frame_h))

    print(f"Processing {num_frames} frames...")
    for i in range(num_frames):
        vis_frame = cv2.cvtColor(np.asarray(frames[i]), cv2.COLOR_RGB2BGR)
        out.write(vis_frame)
        if i % 10 == 0:
            print(f"Processed frame {i+1}/{num_frames}")

    out.release()
    print(f"Video saved as {output_filename}")
743
+
744
+
745
def list_depth(lst):
    """Return the nesting depth of a (possibly tensor-valued) nested list.

    Non-list, non-tensor values are depth 0; an empty list or a 0-dim
    tensor counts as depth 1; otherwise 1 + the deepest element.
    """
    if not isinstance(lst, (list, torch.Tensor)):
        return 0
    empty_list = isinstance(lst, list) and len(lst) == 0
    scalar_tensor = isinstance(lst, torch.Tensor) and lst.shape == torch.Size([])
    if empty_list or scalar_tensor:
        return 1
    return 1 + max(list_depth(item) for item in lst)
753
+
754
def normalize_prompt(points, labels):
    """Lift depth-3 point/label prompts to depth 4 by adding a per-object axis.

    Prompts already at depth 4 (or any other depth) pass through unchanged.
    """
    if list_depth(points) == 3:
        points = torch.stack([pt.unsqueeze(0) for pt in points])
        labels = torch.stack([lb.unsqueeze(0) for lb in labels])
    return points, labels
759
+
760
+
761
def show_box(box, ax, object_id):
    """Draw one [x1, y1, x2, y2] bounding box on a matplotlib axis.

    The edge color is deterministic per ``object_id``; empty boxes are
    ignored.
    """
    if len(box) == 0:
        return

    cmap = plt.get_cmap("gist_rainbow")
    color_idx = 0 if object_id is None else object_id
    color = list(cmap((color_idx * 47) % 256))

    x0, y0 = box[0], box[1]
    width, height = box[2] - box[0], box[3] - box[1]
    ax.add_patch(plt.Rectangle((x0, y0), width, height,
                               edgecolor=color, facecolor=(0, 0, 0, 0), lw=2))
772
+
773
def show_points(coords, labels, ax, object_id=None, marker_size=375):
    """Scatter point prompts: green plus = positive, red square = negative.

    Marker edges are tinted by ``object_id`` so points can be matched to
    their object's mask color. No-op when there are no labels.
    """
    if len(labels) == 0:
        return

    positive = coords[labels == 1]
    negative = coords[labels == 0]

    colormap = plt.get_cmap("gist_rainbow")
    idx = object_id if object_id is not None else 0
    edge_color = list(colormap((idx * 47) % 256))

    ax.scatter(positive[:, 0], positive[:, 1], color='green', marker='P',
               s=marker_size, edgecolor=edge_color, linewidth=1.25)
    ax.scatter(negative[:, 0], negative[:, 1], color='red', marker='s',
               s=marker_size, edgecolor=edge_color, linewidth=1.25)
+
787
def save_prompts_one_image(frame_image, boxes, points, labels, save_path):
    """Render box/point prompts over one frame and save the figure to disk.

    Args:
        frame_image: Image array displayable by matplotlib.
        boxes: Tensor of boxes indexed by object id, dict of
            object id -> box, or an empty list when there are no boxes.
        points: Per-object point prompts (normalized via normalize_prompt).
        labels: Per-object positive/negative labels for the points.
        save_path: Destination image file path.

    Raises:
        TypeError: If ``boxes`` is of an unsupported type.
    """
    # Create a figure and axis
    fig, ax = plt.subplots(1, figsize=(6, 6))

    # Display the frame image
    ax.imshow(frame_image)
    ax.axis('off')

    points, labels = normalize_prompt(points, labels)

    # Draw bounding boxes; object id comes from position (tensor) or key (dict).
    if isinstance(boxes, torch.Tensor):
        for object_id, box in enumerate(boxes):
            if box is not None:
                show_box(box.cpu(), ax, object_id=object_id)
    elif isinstance(boxes, dict):
        for object_id, box in boxes.items():
            if box is not None:
                show_box(box.cpu(), ax, object_id=object_id)
    elif isinstance(boxes, list) and len(boxes) == 0:
        pass
    else:
        # Previously a bare ``raise Exception()``; still an Exception
        # subclass, but now says what actually went wrong.
        raise TypeError(f"Unsupported boxes type: {type(boxes)!r}")

    # Draw the point prompts per object.
    for object_id, (point_ls, label_ls) in enumerate(zip(points, labels)):
        if len(point_ls) != 0:
            show_points(point_ls.cpu(), label_ls.cpu(), ax, object_id=object_id)

    # Save and release the figure.
    plt.savefig(save_path)
    plt.close()
+
819
def save_video_prompts_visualization(video_tensor, video_boxes, video_points, video_labels, video_id, video_save_base_dir):
    """Save one prompt-visualization image per frame of a video.

    Frames with no recorded boxes/points/labels are rendered with empty
    prompt lists. Output files are ``<base_dir>/<video_id>/<frame_id>.jpg``.
    """
    video_save_dir = os.path.join(video_save_base_dir, video_id)
    # makedirs with exist_ok=True already tolerates an existing directory.
    os.makedirs(video_save_dir, exist_ok=True)

    for frame_id, image in enumerate(video_tensor):
        frame_boxes = video_boxes[frame_id] if frame_id in video_boxes else []
        frame_points = video_points[frame_id] if frame_id in video_points else []
        frame_labels = video_labels[frame_id] if frame_id in video_labels else []

        save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
        save_prompts_one_image(image, frame_boxes, frame_points, frame_labels, save_path)
+
838
+
839
def save_video_masks_visualization(video_tensor, video_masks, video_id, video_save_base_dir, oid_class_pred=None, sample_rate = 1):
    """Save a mask-overlay image for (a random sample of) video frames.

    Args:
        video_tensor: Iterable of frames.
        video_masks: Dict of frame_id -> masks for that frame.
        video_id: Sub-directory name for this video's outputs.
        video_save_base_dir: Root output directory.
        oid_class_pred: Accepted but not used in this body —
            NOTE(review): possibly meant to be forwarded to
            save_mask_one_image; confirm against its signature.
        sample_rate: Probability of visualizing each frame (1 = all frames).
    """
    video_save_dir = os.path.join(video_save_base_dir, video_id)
    os.makedirs(video_save_dir, exist_ok=True)

    for frame_id, image in enumerate(video_tensor):
        # Randomly skip frames to honor the sampling rate.
        if random.random() > sample_rate:
            continue
        if frame_id not in video_masks:
            print("No mask for Frame", frame_id)
            continue
        target_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
        save_mask_one_image(image, video_masks[frame_id], target_path)
+
854
+
855
+
856
def get_color(obj_id, cmap_name="gist_rainbow", alpha=0.5):
    """Return a stable RGBA color (numpy array) for an object id.

    Args:
        obj_id: Object id used to pick a hue; ``None`` maps to id 0.
        cmap_name: Matplotlib colormap name.
        alpha: Opacity of the returned color.

    Returns:
        np.ndarray of shape (4,): RGBA with the requested alpha.
    """
    cmap = plt.get_cmap(cmap_name)
    cmap_idx = 0 if obj_id is None else obj_id
    color = list(cmap((cmap_idx * 47) % 256))
    # BUG FIX: the alpha parameter was accepted but ignored (hard-coded 0.5);
    # the default keeps behavior identical for existing callers.
    color[3] = alpha
    color = np.array(color)
    return color
+
864
+
865
+ def _bbox_center(bbox: Tuple[int, int, int, int]) -> Tuple[float, float]:
866
+ return ((bbox[0] + bbox[2]) / 2.0, (bbox[1] + bbox[3]) / 2.0)
867
+
868
+
869
def relation_line(
    bbox1: Tuple[int, int, int, int],
    bbox2: Tuple[int, int, int, int],
) -> Tuple[Tuple[int, int], Tuple[int, int]]:
    """
    Compute integer endpoints for a line joining two box centers.

    When the two centers coincide (within 1e-3), the second endpoint is
    shifted right so the resulting segment always has a nonzero span.
    """
    # Centers of each box (helper inlined for self-containment).
    cx1, cy1 = (bbox1[0] + bbox1[2]) / 2.0, (bbox1[1] + bbox1[3]) / 2.0
    cx2, cy2 = (bbox2[0] + bbox2[2]) / 2.0, (bbox2[1] + bbox2[3]) / 2.0

    same_x = math.isclose(cx1, cx2, abs_tol=1e-3)
    same_y = math.isclose(cy1, cy2, abs_tol=1e-3)
    if same_x and same_y:
        # Nudge by 5% of the target box width, at least one pixel.
        nudge = max(1.0, (bbox2[2] - bbox2[0]) * 0.05)
        cx2 += nudge

    start = (int(round(cx1)), int(round(cy1)))
    end = (int(round(cx2)), int(round(cy2)))
    if start == end:
        end = (end[0] + 1, end[1])
    return start, end
+
888
def get_binary_mask_one_image(frame_image, masks, rel_pred_ls=None):
    """Overlay relation-annotated masks on one frame.

    Only objects participating in at least one predicted relation are drawn;
    each relation is rendered as a line between the two objects' boxes with
    the relation text placed at the line's midpoint.

    Args:
        frame_image: Image array displayable by matplotlib.
        masks: Mapping of object id -> binary mask.
        rel_pred_ls: Mapping of (from_obj_id, to_obj_id) -> relation text.
            ``None`` is treated as "no relations".

    Returns:
        (fig, ax): The matplotlib figure and axis.
    """
    # Create a figure and axis
    fig, ax = plt.subplots(1, figsize=(6, 6))

    # Display the frame image
    ax.imshow(frame_image)
    ax.axis('off')

    # BUG FIX: the default rel_pred_ls=None previously crashed on .items().
    if rel_pred_ls is None:
        rel_pred_ls = {}

    all_objs_to_show = set()
    all_lines_to_show = []

    # Collect, per predicted relation, the connecting segment plus the
    # colors of its endpoint objects and the relation text.
    for (from_obj_id, to_obj_id), rel_text in rel_pred_ls.items():
        all_objs_to_show.add(from_obj_id)
        all_objs_to_show.add(to_obj_id)

        from_mask = masks[from_obj_id]
        bbox1 = mask_to_bbox(from_mask)
        to_mask = masks[to_obj_id]
        bbox2 = mask_to_bbox(to_mask)

        c1, c2 = shortest_line_between_bboxes(bbox1, bbox2)

        line_color = get_color(from_obj_id)
        face_color = get_color(to_obj_id)
        all_lines_to_show.append((c1, c2, face_color, line_color, rel_text))

    # Draw only the masks that participate in some relation.
    masks_to_show = {oid: masks[oid] for oid in all_objs_to_show}
    for obj_id, mask in masks_to_show.items():
        show_mask(mask, ax, obj_id=obj_id, random_color=False)

    # Draw relation lines with their labels at each midpoint.
    for (from_pt_x, from_pt_y), (to_pt_x, to_pt_y), face_color, line_color, rel_text in all_lines_to_show:
        plt.plot([from_pt_x, to_pt_x], [from_pt_y, to_pt_y], color=line_color, linestyle='-', linewidth=3)
        mid_pt_x = (from_pt_x + to_pt_x) / 2
        mid_pt_y = (from_pt_y + to_pt_y) / 2
        ax.text(
            mid_pt_x - 5,
            mid_pt_y,
            rel_text,
            color="white",
            fontsize=6,
            backgroundcolor=np.array(line_color),
            bbox=dict(facecolor=face_color, edgecolor=line_color, boxstyle='round,pad=1'),
            alpha=1
        )

    return fig, ax