Upload models/teacher.py

models/teacher.py (CHANGED, +29 −93)
@@ -15,7 +15,7 @@ that the student does NOT have at inference time.
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers import ConvNextV2Model
+from transformers import ConvNextV2Model
 
 
 class ConvNextV2FeatureExtractor(nn.Module):
@@ -24,10 +24,8 @@ class ConvNextV2FeatureExtractor(nn.Module):
     def __init__(self, model_name: str, output_dim: int = 256):
         super().__init__()
         self.backbone = ConvNextV2Model.from_pretrained(model_name)
-
-        self._setup_gradient_checkpointing()
+        self.backbone.gradient_checkpointing_enable()
 
-        # Determine feature dimension from backbone
         hidden_size = self.backbone.config.hidden_sizes[-1]
         self.projection = nn.Sequential(
             nn.LayerNorm(hidden_size),
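The inlined `self.backbone.gradient_checkpointing_enable()` call replaces the deleted `_setup_gradient_checkpointing` helper; it is the standard `transformers` `PreTrainedModel` API and trades recomputation in the backward pass for activation memory. A minimal sketch of how this interacts with train/eval mode (the `extractor` variable is illustrative):

```python
# Checkpointing recomputes activations during backward, so it only
# pays off while training; it can be switched off for inference.
extractor = ConvNextV2FeatureExtractor("facebook/convnextv2-atto-1k-224")
extractor.train()    # checkpointed forward during training steps

extractor.backbone.gradient_checkpointing_disable()
extractor.eval()     # plain forward for inference
```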
@@ -35,21 +33,9 @@ class ConvNextV2FeatureExtractor(nn.Module):
             nn.GELU(),
         )
 
-    def _setup_gradient_checkpointing(self):
-        self.backbone.gradient_checkpointing_enable()
-
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Extract features from input image.
-
-        Args:
-            x: [B, 3, H, W] RGB image tensor
-
-        Returns:
-            features: [B, output_dim]
-        """
         outputs = self.backbone(x)
-
-        pooled = outputs.pooler_output  # [B, hidden_size]
+        pooled = outputs.pooler_output
         return self.projection(pooled)
 
 
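As a shape check: `pooler_output` on `ConvNextV2Model` is the pooled (and layer-normed) final feature map, so the extractor maps images directly to `[B, output_dim]` vectors. A quick sketch, assuming the middle layer of `projection` (not shown in this diff) is a `Linear(hidden_size, output_dim)`:

```python
import torch

# Dummy batch of two eye crops at the teacher's 112x112 eye resolution.
extractor = ConvNextV2FeatureExtractor("facebook/convnextv2-atto-1k-224",
                                       output_dim=256)
x = torch.randn(2, 3, 112, 112)

with torch.no_grad():
    feats = extractor(x)

print(feats.shape)  # expected: torch.Size([2, 256])
```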
@@ -68,25 +54,11 @@ class CrossAttentionFusion(nn.Module):
         )
 
     def forward(self, face_feat: torch.Tensor, eye_feats: torch.Tensor) -> torch.Tensor:
-        """
-
-        Args:
-            face_feat: [B, dim]
-            eye_feats: [B, 2, dim] - left and right eye features concatenated
-
-        Returns:
-            fused: [B, dim]
-        """
-        # Reshape for attention: [B, 1, dim] for face, [B, 2, dim] for eyes
-        face_seq = face_feat.unsqueeze(1)  # [B, 1, dim]
-        eye_seq = eye_feats  # [B, 2, dim]
-
-        # Cross-attention: face attends to eye features
-        attn_out, _ = self.cross_attn(face_seq, eye_seq, eye_seq)
+        face_seq = face_feat.unsqueeze(1)
+        attn_out, _ = self.cross_attn(face_seq, eye_feats, eye_feats)
         out = self.norm1(face_seq + attn_out)
         out = self.norm2(out + self.ffn(out))
-
-        return out.squeeze(1)  # [B, dim]
+        return out.squeeze(1)
 
 
 class PriviGazeTeacher(nn.Module):
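This hunk leaves `CrossAttentionFusion.__init__` untouched, so the attributes the forward pass relies on (`cross_attn`, `norm1`, `norm2`, `ffn`) are defined outside the diff. For the code above to type-check, the constructor plausibly looks like the following sketch; the FFN width is a guess, and only `num_heads=4` is confirmed by the instantiation later in the file:

```python
import torch.nn as nn

class CrossAttentionFusion(nn.Module):
    """Face feature (as query) attends over the two eye features (keys/values)."""

    def __init__(self, dim: int, num_heads: int = 4):
        super().__init__()
        # batch_first=True so inputs are [B, seq_len, dim], matching forward().
        self.cross_attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.ffn = nn.Sequential(
            nn.Linear(dim, dim * 4),
            nn.GELU(),
            nn.Linear(dim * 4, dim),
        )
```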
@@ -95,11 +67,13 @@ class PriviGazeTeacher(nn.Module):
     Inputs:
         - left_eye: [B, 3, 112, 112] RGB left eye crop
         - right_eye: [B, 3, 112, 112] RGB right eye crop
-        - face_blurred_gray: [B, 1, 224, 224] Blurred grayscale face
+        - face_blurred_gray: [B, 1, 224, 224] Blurred grayscale face
 
     Outputs:
-        - pitch_pred: [B] gaze pitch in degrees
-        - yaw_pred: [B] gaze yaw in degrees
+        - pitch_pred: [B] gaze pitch angle in degrees
+        - yaw_pred: [B] gaze yaw angle in degrees
+        - pitch_logits: [B, gaze_bins] for logit distillation
+        - yaw_logits: [B, gaze_bins] for logit distillation
         - features: [B, 256] fused feature representation for distillation
     """
 
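The continuous `pitch_pred`/`yaw_pred` come from L2CS-Net-style expectation over classification bins: each head scores `gaze_bins` bins spanning [-90°, 90°], and the angle is the probability-weighted mean of the bin centers, pred = sum_i softmax(logits)_i * center_i. The same decoding step in isolation:

```python
import torch
import torch.nn.functional as F

gaze_bins = 90
bin_centers = torch.linspace(-90.0, 90.0, gaze_bins)  # degrees, one per bin

logits = torch.randn(2, gaze_bins)            # stand-in for one head's output
probs = F.softmax(logits, dim=-1)             # [2, 90], rows sum to 1
pred_deg = (probs * bin_centers).sum(dim=-1)  # [2], continuous angle in degrees
```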
@@ -108,107 +82,69 @@ class PriviGazeTeacher(nn.Module):
         eye_backbone: str = "facebook/convnextv2-atto-1k-224",
         face_backbone: str = "facebook/convnextv2-nano-22k-384",
         feature_dim: int = 256,
-        gaze_bins: int = 90,
+        gaze_bins: int = 90,
     ):
         super().__init__()
 
-        # Eye feature extractors (shared weights for left and right)
         self.eye_extractor = ConvNextV2FeatureExtractor(eye_backbone, feature_dim)
-
-        # Face feature extractor (takes 1-channel input, adapt first conv)
         self.face_extractor = ConvNextV2FeatureExtractor(face_backbone, feature_dim)
 
-        # Eye fusion via self-attention
         self.eye_fusion = nn.Sequential(
             nn.Linear(feature_dim * 2, feature_dim),
             nn.GELU(),
             nn.LayerNorm(feature_dim),
         )
 
-        # Cross-modal fusion
         self.cross_fusion = CrossAttentionFusion(feature_dim, num_heads=4)
 
-        # Gaze regression heads (one per angle - L2CS-Net style)
         self.pitch_head = nn.Sequential(
             nn.Linear(feature_dim, feature_dim // 2),
             nn.GELU(),
             nn.Dropout(0.1),
-            nn.Linear(feature_dim // 2, gaze_bins),
+            nn.Linear(feature_dim // 2, gaze_bins),
         )
 
         self.yaw_head = nn.Sequential(
             nn.Linear(feature_dim, feature_dim // 2),
             nn.GELU(),
             nn.Dropout(0.1),
-            nn.Linear(feature_dim // 2, gaze_bins),
-        )
-
-        # Bin centers for expectation-based regression
-        self.register_buffer(
-            'bin_centers',
-            torch.linspace(-90.0, 90.0, gaze_bins)
+            nn.Linear(feature_dim // 2, gaze_bins),
         )
 
+        self.register_buffer('bin_centers', torch.linspace(-90.0, 90.0, gaze_bins))
         self.feature_dim = feature_dim
         self.gaze_bins = gaze_bins
 
     def _adapt_face_input(self, x: torch.Tensor) -> torch.Tensor:
-        """Adapt 1-channel grayscale face input to 3-channel for ConvNeXtV2.
-
-        The first conv layer expects 3 channels. We replicate the grayscale
-        channel 3 times. The model learns to treat this as a geometric-only signal
-        since high-frequency texture/color is removed by blurring.
-        """
         if x.shape[1] == 1:
             x = x.repeat(1, 3, 1, 1)
         return x
 
     def forward(self, left_eye, right_eye, face_blurred_gray):
-        """
-        Args:
-            left_eye: [B, 3, 112, 112]
-            right_eye: [B, 3, 112, 112]
-            face_blurred_gray: [B, 1, 224, 224]
-
-        Returns:
-            pitch_pred: [B] gaze pitch in degrees
-            yaw_pred: [B] gaze yaw in degrees
-            fused_features: [B, feature_dim]
-        """
-        # Extract features from each modality
-        left_feat = self.eye_extractor(left_eye)  # [B, dim]
-        right_feat = self.eye_extractor(right_eye)  # [B, dim]
+        left_feat = self.eye_extractor(left_eye)
+        right_feat = self.eye_extractor(right_eye)
 
         face_input = self._adapt_face_input(face_blurred_gray)
-        face_feat = self.face_extractor(face_input)
-
-        # Fuse eye features
-        eye_combined = torch.cat([left_feat, right_feat], dim=-1)  # [B, 2*dim]
-        eye_fused = self.eye_fusion(eye_combined)  # [B, dim]
-
-        # Stack eye features for cross-attention
-        eye_stacked = torch.stack([left_feat, right_feat], dim=1)  # [B, 2, dim]
+        face_feat = self.face_extractor(face_input)
 
-        …
-        …
+        eye_combined = torch.cat([left_feat, right_feat], dim=-1)
+        eye_fused = self.eye_fusion(eye_combined)
 
-        …
-        fused = …
+        eye_stacked = torch.stack([left_feat, right_feat], dim=1)
+        fused = self.cross_fusion(face_feat, eye_stacked)
+        fused = fused + eye_fused
 
-        …
-        pitch_logits = self.pitch_head(fused)  # [B, gaze_bins]
-        yaw_logits = self.yaw_head(fused)  # [B, gaze_bins]
+        pitch_logits = self.pitch_head(fused)
+        yaw_logits = self.yaw_head(fused)
 
-        # Softmax + expectation for fine-grained regression
         pitch_probs = F.softmax(pitch_logits, dim=-1)
         yaw_probs = F.softmax(yaw_logits, dim=-1)
 
-        pitch_pred = (pitch_probs * self.bin_centers).sum(dim=-1)
-        yaw_pred = (yaw_probs * self.bin_centers).sum(dim=-1)
+        pitch_pred = (pitch_probs * self.bin_centers).sum(dim=-1)
+        yaw_pred = (yaw_probs * self.bin_centers).sum(dim=-1)
 
-        return pitch_pred, yaw_pred, fused
+        return pitch_pred, yaw_pred, pitch_logits, yaw_logits, fused
 
     def get_penultimate_features(self, left_eye, right_eye, face_blurred_gray):
-        …
-        _, _, fused = self.forward(left_eye, right_eye, face_blurred_gray)
+        _, _, _, _, fused = self.forward(left_eye, right_eye, face_blurred_gray)
         return fused
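End to end, the refactored teacher can be smoke-tested with random tensors at the documented input shapes (batch size arbitrary; instantiating the defaults downloads the two ConvNeXt V2 checkpoints):

```python
import torch

teacher = PriviGazeTeacher()  # defaults: atto eye backbone, nano face backbone
teacher.eval()

left = torch.randn(2, 3, 112, 112)   # RGB left-eye crops
right = torch.randn(2, 3, 112, 112)  # RGB right-eye crops
face = torch.randn(2, 1, 224, 224)   # blurred grayscale faces

with torch.no_grad():
    pitch, yaw, pitch_logits, yaw_logits, feats = teacher(left, right, face)

print(pitch.shape, yaw.shape)           # torch.Size([2]) torch.Size([2])
print(pitch_logits.shape, feats.shape)  # torch.Size([2, 90]) torch.Size([2, 256])
```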
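The net effect of the commit is that forward() now also returns the raw pitch/yaw bin logits alongside the fused features, which is what a distillation loop needs. A sketch of how a student could consume these outputs, with a temperature-scaled KL term on the logits (Hinton-style) plus an MSE feature-matching term; the temperature and the loss pairing are illustrative, not from this repository:

```python
import torch.nn.functional as F

def distillation_losses(student_out, teacher_out, T: float = 4.0):
    """KL on temperature-softened bin logits plus MSE on fused features.

    Both arguments are (pitch_logits, yaw_logits, features) tuples; the
    teacher's would come from PriviGazeTeacher under torch.no_grad().
    """
    s_pitch, s_yaw, s_feat = student_out
    t_pitch, t_yaw, t_feat = teacher_out

    def soft_kl(s, t):
        # KL(teacher || student) on softened distributions, scaled by T^2
        # so gradient magnitudes stay comparable as T grows.
        return F.kl_div(
            F.log_softmax(s / T, dim=-1),
            F.softmax(t / T, dim=-1),
            reduction="batchmean",
        ) * (T * T)

    logit_loss = soft_kl(s_pitch, t_pitch) + soft_kl(s_yaw, t_yaw)
    feat_loss = F.mse_loss(s_feat, t_feat)
    return logit_loss, feat_loss
```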