Upload models/hsl_feature_extractor.py with huggingface_hub
models/hsl_feature_extractor.py  ADDED  (+102 -0)
@@ -0,0 +1,102 @@
"""
HSL Feature Extractor

Replaces PaletteFeatureExtractor (which uses nn.Embedding for token IDs)
for the HSL color pipeline.

Input: (B, H, W, 3) FloatTensor - HSL palette with channels [h, s, l] in [0, 1]
Output: (B, H, W, D) FloatTensor - spatial features

Architecture:
    1. Circular hue encoding: h -> (sin(2*pi*h), cos(2*pi*h))
    2. Stack: [sin_h, cos_h, s, l] -> 4D tensor
    3. Linear projection: nn.Linear(4, hidden_dim)
    4. VisionTransformer: reuse existing VisionTransformer from models.vit
"""

import math
import torch
import torch.nn as nn

from .vit import VisionTransformer, trunc_normal_init_


class HSLFeatureExtractor(nn.Module):
    """
    Feature extractor for HSL color palettes.

    Uses circular hue encoding (sin/cos) to handle hue's circular nature
    (hue 0 ≈ hue 1), then projects the 4D encoded features through a linear
    layer and a VisionTransformer for spatial feature extraction.

    Args:
        hidden_dim: Transformer hidden dimension (default: 768)
        num_layers: Number of transformer layers (default: 6)
        num_heads: Number of attention heads (default: 8)
        patch_size: Patch size for ViT patchification (default: 4)
        dropout: Dropout probability (default: 0.1)
    """

    def __init__(
        self,
        hidden_dim: int = 768,
        num_layers: int = 6,
        num_heads: int = 8,
        patch_size: int = 4,
        dropout: float = 0.1,
    ):
        super().__init__()

        self.hidden_dim = hidden_dim

        # Project 4D circular-encoded HSL to hidden_dim
        self.hsl_proj = nn.Linear(4, hidden_dim, bias=True)

        # Vision Transformer for spatial feature extraction
        self.vit = VisionTransformer(
            hidden_dim=hidden_dim,
            num_layers=num_layers,
            num_heads=num_heads,
            patch_size=patch_size,
            dropout=dropout,
        )

        # Initialize hsl_proj weights with truncated normal
        self._init_weights()

    def _init_weights(self):
        """Initialize hsl_proj weights with truncated normal."""
        std = 1.0 / math.sqrt(self.hsl_proj.in_features)
        trunc_normal_init_(self.hsl_proj.weight, std=std)
        if self.hsl_proj.bias is not None:
            self.hsl_proj.bias.data.zero_()

    def forward(self, palette_hsl: torch.Tensor) -> torch.Tensor:
        """
        Extract spatial features from an HSL palette.

        Args:
            palette_hsl: (B, H, W, 3) FloatTensor with channels [h, s, l] in [0, 1]

        Returns:
            (B, H, W, D) FloatTensor spatial features
        """
        # Split channels
        h = palette_hsl[..., 0]  # (B, H, W)
        s = palette_hsl[..., 1]  # (B, H, W)
        l = palette_hsl[..., 2]  # (B, H, W)

        # Circular hue encoding - handles wraparound: hue 0 ≈ hue 1
        sin_h = torch.sin(2 * math.pi * h)  # (B, H, W)
        cos_h = torch.cos(2 * math.pi * h)  # (B, H, W)

        # Stack into 4-channel tensor
        encoded = torch.stack([sin_h, cos_h, s, l], dim=-1)  # (B, H, W, 4)

        # Project to hidden_dim
        embedded = self.hsl_proj(encoded)  # (B, H, W, D)

        # Apply VisionTransformer for spatial feature extraction
        features = self.vit(embedded)  # (B, H, W, D)

        return features
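
A quick standalone sanity check of the circular hue encoding above (illustrative only, not part of the uploaded file): two hues just on either side of the 0/1 wrap point look maximally far apart as raw values but nearly identical once encoded as (sin, cos), which is exactly why the extractor encodes hue this way.

import math
import torch

def encode_hue(h: torch.Tensor) -> torch.Tensor:
    # Same encoding as HSLFeatureExtractor.forward: h -> (sin(2*pi*h), cos(2*pi*h))
    return torch.stack([torch.sin(2 * math.pi * h), torch.cos(2 * math.pi * h)], dim=-1)

h_a = torch.tensor([0.999])  # just below the wrap point
h_b = torch.tensor([0.001])  # just above the wrap point
print((h_a - h_b).abs().item())                           # 0.998 -> raw hues look far apart
print((encode_hue(h_a) - encode_hue(h_b)).norm().item())  # ~0.013 -> encoded hues are close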
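
And a minimal usage sketch under stated assumptions: the constructor arguments are those defined above, and models.vit.VisionTransformer consumes and returns (B, H, W, D) tensors as the forward pass implies. The 2x16x16 palette size is a made-up example; with patch_size=4, H and W presumably need to be divisible by 4.

import torch
from models.hsl_feature_extractor import HSLFeatureExtractor

extractor = HSLFeatureExtractor(hidden_dim=768, num_layers=6, num_heads=8, patch_size=4)
palette = torch.rand(2, 16, 16, 3)  # (B, H, W, 3), channels [h, s, l] in [0, 1]
features = extractor(palette)       # (B, H, W, 768) per the module docstring
print(features.shape)               # expected: torch.Size([2, 16, 16, 768])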