| """ |
| HSL Feature Extractor |
| |
| Replaces PaletteFeatureExtractor (which uses nn.Embedding for token IDs) |
| for the HSL color pipeline. |
| |
| Input: (B, H, W, 3) FloatTensor β HSL palette with channels [h, s, l] in [0, 1] |
| Output: (B, H, W, D) FloatTensor β spatial features |
| |
| Architecture: |
| 1. Circular hue encoding: h -> (sin(2*pi*h), cos(2*pi*h)) |
| 2. Stack: [sin_h, cos_h, s, l] -> 4D tensor |
| 3. Linear projection: nn.Linear(4, hidden_dim) |
| 4. VisionTransformer: reuse existing VisionTransformer from models.vit |
| """ |
|
|
| import math |
| import torch |
| import torch.nn as nn |
|
|
| from .vit import VisionTransformer, trunc_normal_init_ |
|
|
|
|
class HSLFeatureExtractor(nn.Module):
    """
    Feature extractor for HSL color palettes.

    Hue is circular (hue 0 and hue 1 are the same color), so it is encoded
    as (sin(2*pi*h), cos(2*pi*h)) before being stacked with saturation and
    lightness.  The resulting 4-channel grid is linearly projected to the
    transformer width and passed through a VisionTransformer to produce
    spatial features.

    Args:
        hidden_dim: Transformer hidden dimension (default: 768)
        num_layers: Number of transformer layers (default: 6)
        num_heads: Number of attention heads (default: 8)
        patch_size: Patch size for ViT patchification (default: 4)
        dropout: Dropout probability (default: 0.1)
    """

    def __init__(
        self,
        hidden_dim: int = 768,
        num_layers: int = 6,
        num_heads: int = 8,
        patch_size: int = 4,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.hidden_dim = hidden_dim

        # 4 input channels: [sin(h), cos(h), s, l] -> hidden_dim.
        self.hsl_proj = nn.Linear(4, hidden_dim, bias=True)

        # Spatial backbone; reused as-is from models.vit.
        self.vit = VisionTransformer(
            hidden_dim=hidden_dim,
            num_layers=num_layers,
            num_heads=num_heads,
            patch_size=patch_size,
            dropout=dropout,
        )

        self._init_weights()

    def _init_weights(self):
        """Initialize hsl_proj weights with truncated normal."""
        # Scale by 1/sqrt(fan_in), matching the projection's input width.
        scale = 1.0 / math.sqrt(self.hsl_proj.in_features)
        trunc_normal_init_(self.hsl_proj.weight, std=scale)
        bias = self.hsl_proj.bias
        if bias is not None:
            bias.data.zero_()

    def forward(self, palette_hsl: torch.Tensor) -> torch.Tensor:
        """
        Extract spatial features from an HSL palette.

        Args:
            palette_hsl: (B, H, W, 3) FloatTensor with channels [h, s, l] in [0, 1]

        Returns:
            (B, H, W, D) FloatTensor spatial features
        """
        hue, sat, light = torch.unbind(palette_hsl, dim=-1)

        # Map hue onto the unit circle so that hue 0 and hue 1 coincide.
        angle = (2 * math.pi) * hue
        encoded = torch.stack(
            [torch.sin(angle), torch.cos(angle), sat, light], dim=-1
        )

        # (B, H, W, 4) -> (B, H, W, hidden_dim) -> (B, H, W, D)
        return self.vit(self.hsl_proj(encoded))
|
|