BcantCode committed
Commit e97351b · verified · 1 Parent(s): c07e4db

Upload models/teacher.py

Files changed (1)
  1. models/teacher.py +214 -0
models/teacher.py ADDED
@@ -0,0 +1,214 @@
+ """
+ PriviGaze Teacher Model - Siamese Multi-Input Gaze Estimation Network
+
+ Architecture:
+ - Takes 3 inputs: left eye RGB, right eye RGB, blurred grayscale face
+ - Uses ConvNeXtV2-Atto as shared backbone for eye streams
+ - Uses ConvNeXtV2-Nano for face stream
+ - Fuses multi-modal features via cross-attention
+ - Outputs: pitch and yaw gaze angles (degrees)
+
+ This teacher has access to privileged information (RGB eye crops, high-res face)
+ that the student does NOT have at inference time.
+ """
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import ConvNextV2Model, ConvNextV2Config
+
+
+ class ConvNextV2FeatureExtractor(nn.Module):
+     """Wrapper around ConvNeXtV2 for feature extraction (no classification head)."""
+
+     def __init__(self, model_name: str, output_dim: int = 256):
+         super().__init__()
+         self.backbone = ConvNextV2Model.from_pretrained(model_name)
+         # Enable gradient checkpointing to reduce activation memory during fine-tuning
+         self._setup_gradient_checkpointing()
+
+         # Determine feature dimension from backbone
+         hidden_size = self.backbone.config.hidden_sizes[-1]
+         self.projection = nn.Sequential(
+             nn.LayerNorm(hidden_size),
+             nn.Linear(hidden_size, output_dim),
+             nn.GELU(),
+         )
+
+     def _setup_gradient_checkpointing(self):
+         self.backbone.gradient_checkpointing_enable()
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         """Extract features from input image.
+
+         Args:
+             x: [B, 3, H, W] RGB image tensor
+
+         Returns:
+             features: [B, output_dim]
+         """
+         outputs = self.backbone(x)
+         # Use pooled output (avg pool over spatial dims)
+         pooled = outputs.pooler_output  # [B, hidden_size]
+         return self.projection(pooled)
+
+
+ class CrossAttentionFusion(nn.Module):
+     """Cross-attention fusion module for multi-modal features."""
+
+     def __init__(self, dim: int = 256, num_heads: int = 4):
+         super().__init__()
+         self.cross_attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
+         self.norm1 = nn.LayerNorm(dim)
+         self.norm2 = nn.LayerNorm(dim)
+         self.ffn = nn.Sequential(
+             nn.Linear(dim, dim * 4),
+             nn.GELU(),
+             nn.Linear(dim * 4, dim),
+         )
+
+     def forward(self, face_feat: torch.Tensor, eye_feats: torch.Tensor) -> torch.Tensor:
+         """Fuse face and eye features via cross-attention.
+
+         Args:
+             face_feat: [B, dim]
+             eye_feats: [B, 2, dim] - left and right eye features stacked along dim 1
+
+         Returns:
+             fused: [B, dim]
+         """
+         # Reshape for attention: [B, 1, dim] for face, [B, 2, dim] for eyes
+         face_seq = face_feat.unsqueeze(1)  # [B, 1, dim]
+         eye_seq = eye_feats  # [B, 2, dim]
+
+         # Cross-attention: face attends to eye features
+         attn_out, _ = self.cross_attn(face_seq, eye_seq, eye_seq)
+         out = self.norm1(face_seq + attn_out)
+         out = self.norm2(out + self.ffn(out))
+
+         return out.squeeze(1)  # [B, dim]
+
+
+ class PriviGazeTeacher(nn.Module):
+     """Siamese teacher model with privileged multi-modal inputs.
+
+     Inputs:
+     - left_eye: [B, 3, 112, 112] RGB left eye crop
+     - right_eye: [B, 3, 112, 112] RGB right eye crop
+     - face_blurred_gray: [B, 1, 224, 224] Blurred grayscale face (only geometric info)
+
+     Outputs:
+     - pitch: [B] gaze pitch angle in degrees
+     - yaw: [B] gaze yaw angle in degrees
+     - features: [B, 256] fused feature representation for distillation
+     """
+
+     def __init__(
+         self,
+         eye_backbone: str = "facebook/convnextv2-atto-1k-224",
+         face_backbone: str = "facebook/convnextv2-nano-22k-384",
+         feature_dim: int = 256,
+         gaze_bins: int = 90,  # -90 to +90 degrees, binned
+     ):
+         super().__init__()
+
+         # Eye feature extractor (shared weights for left and right eyes)
+         self.eye_extractor = ConvNextV2FeatureExtractor(eye_backbone, feature_dim)
+
+         # Face feature extractor (grayscale input is replicated to 3 channels in _adapt_face_input)
+         self.face_extractor = ConvNextV2FeatureExtractor(face_backbone, feature_dim)
+
+         # Eye fusion via an MLP over the concatenated left/right eye features
+         self.eye_fusion = nn.Sequential(
+             nn.Linear(feature_dim * 2, feature_dim),
+             nn.GELU(),
+             nn.LayerNorm(feature_dim),
+         )
+
+         # Cross-modal fusion
+         self.cross_fusion = CrossAttentionFusion(feature_dim, num_heads=4)
+
+         # Gaze regression heads (one per angle - L2CS-Net style)
+         self.pitch_head = nn.Sequential(
+             nn.Linear(feature_dim, feature_dim // 2),
+             nn.GELU(),
+             nn.Dropout(0.1),
+             nn.Linear(feature_dim // 2, gaze_bins),  # Binned classification
+         )
+
+         self.yaw_head = nn.Sequential(
+             nn.Linear(feature_dim, feature_dim // 2),
+             nn.GELU(),
+             nn.Dropout(0.1),
+             nn.Linear(feature_dim // 2, gaze_bins),  # Binned classification
+         )
+
+         # Bin centers for expectation-based regression
+         self.register_buffer(
+             'bin_centers',
+             torch.linspace(-90.0, 90.0, gaze_bins)
+         )
+
+         self.feature_dim = feature_dim
+         self.gaze_bins = gaze_bins
+
+     def _adapt_face_input(self, x: torch.Tensor) -> torch.Tensor:
+         """Adapt 1-channel grayscale face input to 3-channel for ConvNeXtV2.
+
+         The first conv layer expects 3 channels. We replicate the grayscale
+         channel 3 times. The model learns to treat this as a geometric-only signal
+         since high-frequency texture/color is removed by blurring.
+         """
+         if x.shape[1] == 1:
+             x = x.repeat(1, 3, 1, 1)
+         return x
+
+     def forward(self, left_eye, right_eye, face_blurred_gray):
+         """
+         Args:
+             left_eye: [B, 3, 112, 112]
+             right_eye: [B, 3, 112, 112]
+             face_blurred_gray: [B, 1, 224, 224]
+
+         Returns:
+             pitch_pred: [B] gaze pitch in degrees
+             yaw_pred: [B] gaze yaw in degrees
+             fused_features: [B, feature_dim]
+         """
+         # Extract features from each modality
+         left_feat = self.eye_extractor(left_eye)  # [B, dim]
+         right_feat = self.eye_extractor(right_eye)  # [B, dim]
+
+         face_input = self._adapt_face_input(face_blurred_gray)
+         face_feat = self.face_extractor(face_input)  # [B, dim]
+
+         # Fuse eye features
+         eye_combined = torch.cat([left_feat, right_feat], dim=-1)  # [B, 2*dim]
+         eye_fused = self.eye_fusion(eye_combined)  # [B, dim]
+
+         # Stack eye features for cross-attention
+         eye_stacked = torch.stack([left_feat, right_feat], dim=1)  # [B, 2, dim]
+
+         # Cross-modal fusion: face attends to eye features
+         fused = self.cross_fusion(face_feat, eye_stacked)  # [B, dim]
+
+         # Add eye fused features
+         fused = fused + eye_fused  # residual connection
+
+         # Predict gaze angles using L2CS-Net style binned regression
+         pitch_logits = self.pitch_head(fused)  # [B, gaze_bins]
+         yaw_logits = self.yaw_head(fused)  # [B, gaze_bins]
+
+         # Softmax + expectation for fine-grained regression
+         pitch_probs = F.softmax(pitch_logits, dim=-1)
+         yaw_probs = F.softmax(yaw_logits, dim=-1)
+
+         pitch_pred = (pitch_probs * self.bin_centers).sum(dim=-1)  # [B]
+         yaw_pred = (yaw_probs * self.bin_centers).sum(dim=-1)  # [B]
+
+         return pitch_pred, yaw_pred, fused
+
+     def get_penultimate_features(self, left_eye, right_eye, face_blurred_gray):
+         """Return features before the regression heads for distillation."""
+         _, _, fused = self.forward(left_eye, right_eye, face_blurred_gray)
+         return fused
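
For reference, a minimal usage sketch (not part of this commit): it assumes the file is importable as models.teacher, that the facebook/convnextv2-* checkpoints can be downloaded from the Hugging Face Hub, and uses dummy tensors with the shapes documented above. The student encoder stand-in and the distillation loss at the end are hypothetical, included only to illustrate how the fused features might be consumed.

# Illustrative sketch only -- not part of the committed file.
import torch
import torch.nn.functional as F

from models.teacher import PriviGazeTeacher  # assumed import path

teacher = PriviGazeTeacher()
teacher.eval()

B = 2
left_eye = torch.randn(B, 3, 112, 112)           # RGB left eye crops
right_eye = torch.randn(B, 3, 112, 112)          # RGB right eye crops
face_blurred_gray = torch.randn(B, 1, 224, 224)  # blurred grayscale faces

with torch.no_grad():
    pitch, yaw, feats = teacher(left_eye, right_eye, face_blurred_gray)

print(pitch.shape, yaw.shape, feats.shape)  # torch.Size([2]) torch.Size([2]) torch.Size([2, 256])

# Hypothetical feature-distillation use: match a student embedding to the
# teacher's fused features (the student would see only non-privileged inputs).
teacher_feats = teacher.get_penultimate_features(left_eye, right_eye, face_blurred_gray)
student_feats = torch.randn(B, 256, requires_grad=True)  # stand-in for a student encoder output
distill_loss = F.mse_loss(student_feats, teacher_feats.detach())

The expectation decode in forward (softmax over 90 bins multiplied by bin_centers) keeps predictions differentiable and bounded to roughly [-90, +90] degrees, which is why the sketch reads angles directly from the returned tensors rather than taking an argmax over bins.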