Spaces:

sajith-0701
/

SentinelAI

Sleeping

File size: 4,895 Bytes

71c1ad2

# app/models/clip_model.py
# CLIP model for multimodal text-image alignment (deep analysis only)

from PIL import Image
import numpy as np
from app.config import get_settings
from app.observability.logging import get_logger

logger = get_logger(__name__)


class CLIPModel:
    """
    CLIP (Contrastive Language-Image Pre-Training) model.

    Used in the deep analysis path to compute semantic alignment
    between text descriptions and image content. This helps detect
    subtle multimodal threats (e.g., threatening text overlaid on images).
    """

    def __init__(self):
        self.settings = get_settings()
        self.model = None
        self.preprocess = None
        self.tokenizer = None
        self._loaded = False
        self.device = None

    def load(self) -> None:
        """Load the CLIP model and preprocessor."""
        import torch
        try:
            import open_clip

            model_name = self.settings.clip_model_name
            cache_dir = self.settings.model_cache_path / "clip"
            cache_dir.mkdir(parents=True, exist_ok=True)

            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

            logger.info("loading_clip_model", model=model_name)

            # Use OpenCLIP for flexibility
            self.model, _, self.preprocess = open_clip.create_model_and_transforms(
                "ViT-B-32",
                pretrained="laion2b_s34b_b79k",
            )
            self.model = self.model.to(self.device)
            self.model.eval()

            self.tokenizer = open_clip.get_tokenizer("ViT-B-32")

            self._loaded = True
            logger.info("clip_model_loaded")

        except ImportError:
            logger.warning("clip_not_available", reason="open_clip not installed")
            self._loaded = False
        except Exception as e:
            logger.error("clip_load_failed", error=str(e))
            self._loaded = False

    def compute_similarity(self, image: Image.Image, texts: list[str]) -> dict:
        """
        Compute cosine similarity between an image and a list of text descriptions.

        Args:
            image: PIL Image.
            texts: List of text descriptions to compare against.

        Returns:
            Dict with similarities, best_match, and best_score.
        """
        if not self._loaded:
            return {"error": "CLIP model not loaded", "similarities": []}

        import torch

        # Preprocess image
        image_input = self.preprocess(image).unsqueeze(0).to(self.device)

        # Tokenize texts
        text_tokens = self.tokenizer(texts).to(self.device)

        with torch.no_grad():
            image_features = self.model.encode_image(image_input)
            text_features = self.model.encode_text(text_tokens)

            # Normalize
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)

            # Cosine similarity
            similarities = (image_features @ text_features.T).squeeze(0).cpu().numpy()

        sim_list = similarities.tolist()
        best_idx = int(np.argmax(sim_list))

        return {
            "similarities": dict(zip(texts, sim_list)),
            "best_match": texts[best_idx],
            "best_score": sim_list[best_idx],
        }

    def align_content(self, image: Image.Image, context_text: str | None = None) -> dict:
        """
        Analyze image alignment with harmful content categories.

        Args:
            image: Image to analyze.
            context_text: Optional surrounding text context.

        Returns:
            Dict with category alignment scores.
        """
        harmful_descriptions = [
            "a photo containing violence, fighting, or physical harm",
            "a photo containing nudity or sexual content",
            "a photo containing self-harm or suicide imagery",
            "a photo containing hate symbols or extremist content",
            "a photo containing drugs or substance abuse",
            "a safe and appropriate photo for children",
        ]

        result = self.compute_similarity(image, harmful_descriptions)

        if "error" in result:
            return result

        # Also check text-image alignment if context provided
        text_alignment = None
        if context_text:
            text_result = self.compute_similarity(image, [context_text, "unrelated content"])
            text_alignment = text_result["similarities"].get(context_text, 0.0)

        return {
            "category_scores": result["similarities"],
            "most_aligned": result["best_match"],
            "alignment_score": result["best_score"],
            "text_image_alignment": text_alignment,
        }

    @property
    def is_loaded(self) -> bool:
        return self._loaded