BcantCode
/

privi-gaze-distill

Model card · Files and versions

xet

Community

BcantCode committed 4 days ago

Commit

fdc4b3d

verified ·

1 Parent(s): 327e860

Upload models/distillation_loss.py

Browse files

Files changed (1) hide show

models/distillation_loss.py +38 -181

models/distillation_loss.py CHANGED Viewed

@@ -1,14 +1,11 @@
 """
 PriviGaze Distillation Loss - Privileged Knowledge Distillation for Gaze Estimation
-Key components:
 1. Angular gaze loss (L1 on pitch/yaw in degrees)
 2. L2CS-Net style binned classification + regression loss
 3. Feature-level distillation (WCoRD-inspired contrastive + distribution matching)
 4. Logit-level distillation (KL on soft targets from teacher)
-The teacher has access to privileged information (RGB eye crops, high-res face)
-that the student does NOT have at inference time.
 """
 import torch
@@ -19,9 +16,6 @@ import torch.nn.functional as F
 class L2CSLoss(nn.Module):
     """L2CS-Net style combined classification + regression loss per angle.
-    From "L2CS-Net: Fine-Grained Gaze Estimation in Unconstrained Environments"
-    (Abdelrahman et al., 2022)
     Loss = CrossEntropy(binned_logits, binned_target) + beta * MSE(continuous_pred, continuous_target)
     """
@@ -29,29 +23,16 @@ class L2CSLoss(nn.Module):
         super().__init__()
         self.gaze_bins = gaze_bins
         self.beta = beta
-        self.register_buffer(
-            'bin_centers',
-            torch.linspace(-90.0, 90.0, gaze_bins)
-        )
         self.ce_loss = nn.CrossEntropyLoss()
-    def _angle_to_bins(self, angles: torch.Tensor) -> torch.Tensor:
-        """Convert continuous angle to bin index."""
         angles_clamped = angles.clamp(-90.0, 90.0)
         bin_width = 180.0 / (self.gaze_bins - 1)
         bins = ((angles_clamped + 90.0) / bin_width).long()
         return bins.clamp(0, self.gaze_bins - 1)
     def forward(self, logits, continuous_pred, angle_target):
-        """
-        Args:
-            logits: [B, gaze_bins] - classification logits
-            continuous_pred: [B] - continuous angle prediction
-            angle_target: [B] - ground truth angle in degrees
-        Returns:
-            loss: scalar
-        """
         bin_targets = self._angle_to_bins(angle_target)
         ce = self.ce_loss(logits, bin_targets)
         mse = F.mse_loss(continuous_pred, angle_target)
@@ -59,27 +40,13 @@ class L2CSLoss(nn.Module):
 class AngularLoss(nn.Module):
-    """Direct angular error loss in degrees.
-    Computes L1 loss on pitch and yaw predictions.
-    This is the standard metric for gaze estimation.
-    """
     def __init__(self, reduction: str = 'mean'):
         super().__init__()
         self.reduction = reduction
     def forward(self, pitch_pred, yaw_pred, pitch_target, yaw_target):
-        """
-        Args:
-            pitch_pred: [B]
-            yaw_pred: [B]
-            pitch_target: [B]
-            yaw_target: [B]
-        Returns:
-            loss: scalar (mean angular error in degrees)
-        """
         pitch_loss = F.l1_loss(pitch_pred, pitch_target, reduction=self.reduction)
         yaw_loss = F.l1_loss(yaw_pred, yaw_target, reduction=self.reduction)
         return pitch_loss + yaw_loss
@@ -88,208 +55,99 @@ class AngularLoss(nn.Module):
 class ContrastiveDistillationLoss(nn.Module):
     """WCoRD-inspired contrastive feature distillation.
-    Maximizes mutual information between teacher and student feature
-    representations using InfoNCE contrastive loss.
-    From "Wasserstein Contrastive Representation Distillation" (Chen et al., 2020)
     """
-    def __init__(self, feature_dim: int = 256, proj_dim: int = 128, temperature: float = 0.1):
         super().__init__()
-        # Project both teacher and student features to shared space
         self.teacher_proj = nn.Sequential(
-            nn.Linear(feature_dim, proj_dim),
-            nn.GELU(),
-            nn.Linear(proj_dim, proj_dim),
-        )
         self.student_proj = nn.Sequential(
-            nn.Linear(128, proj_dim),  # student has smaller feature dim
-            nn.GELU(),
-            nn.Linear(proj_dim, proj_dim),
-        )
         self.temperature = temperature
-    def forward(self, teacher_feat: torch.Tensor, student_feat: torch.Tensor) -> torch.Tensor:
-        """
-        Args:
-            teacher_feat: [B, feature_dim] - teacher's penultimate features
-            student_feat: [B, 128] - student's penultimate features
-        Returns:
-            contrastive_loss: scalar
-        """
-        # Project to shared space
-        t = F.normalize(self.teacher_proj(teacher_feat), dim=-1)  # [B, proj_dim]
-        s = F.normalize(self.student_proj(student_feat), dim=-1)   # [B, proj_dim]
-        # Compute similarity matrix
-        # Positive pairs: (t_i, s_i) for all i
-        # Negative pairs: (t_i, s_j) for i != j
-        logits = torch.matmul(t, s.T) / self.temperature  # [B, B]
-        # InfoNCE loss: each teacher feature should match its corresponding student
         labels = torch.arange(logits.shape[0], device=logits.device)
-        # Symmetric loss: teacher -> student and student -> teacher
         loss_t2s = F.cross_entropy(logits, labels)
         loss_s2t = F.cross_entropy(logits.T, labels)
         return (loss_t2s + loss_s2t) / 2.0
 class DistributionMatchingLoss(nn.Module):
-    """Distribution matching loss for feature-level knowledge transfer.
-    Uses Maximum Mean Discrepancy (MMD) to match feature distributions
-    between teacher and student. This is a simpler alternative to
-    Wasserstein/Sinkhorn while still effective.
-    """
     def __init__(self, kernel: str = 'rbf'):
         super().__init__()
         self.kernel = kernel
-    def _rbf_kernel(self, x: torch.Tensor, y: torch.Tensor, sigma: float = 1.0) -> torch.Tensor:
-        """RBF kernel between two sets of features."""
         xx = torch.matmul(x, x.T)
         yy = torch.matmul(y, y.T)
         xy = torch.matmul(x, y.T)
         rx = xx.diag().unsqueeze(0)
         ry = yy.diag().unsqueeze(0)
-        k_xx = torch.exp(- (rx + rx.T - 2 * xx) / (2 * sigma ** 2))
-        k_yy = torch.exp(- (ry + ry.T - 2 * yy) / (2 * sigma ** 2))
-        k_xy = torch.exp(- (rx + ry.T - 2 * xy) / (2 * sigma ** 2))
-        return k_xx.mean() + k_yy.mean() - 2 * k_xy.mean()
-    def forward(self, teacher_feat: torch.Tensor, student_feat: torch.Tensor) -> torch.Tensor:
-        """Compute MMD between teacher and student feature distributions."""
         t = F.normalize(teacher_feat, dim=-1)
         s = F.normalize(student_feat, dim=-1)
         return self._rbf_kernel(t, s)
 class LogitDistillationLoss(nn.Module):
-    """KL divergence distillation on output gaze predictions.
-    Standard knowledge distillation: student learns to mimic teacher's
-    soft probability distribution over gaze bins.
-    """
     def __init__(self, temperature: float = 3.0):
         super().__init__()
         self.temperature = temperature
     def forward(self, student_logits, teacher_logits):
-        """
-        Args:
-            student_logits: [B, gaze_bins]
-            teacher_logits: [B, gaze_bins] (detached)
-        Returns:
-            kl_loss: scalar
-        """
         student_soft = F.log_softmax(student_logits / self.temperature, dim=-1)
         teacher_soft = F.softmax(teacher_logits / self.temperature, dim=-1)
-        return F.kl_div(student_soft, teacher_soft, reduction='batchmean') * (self.temperature ** 2)
 class PriviGazeDistillationLoss(nn.Module):
     """Complete privileged distillation loss for gaze estimation.
-    Total loss = alpha_task * L_task
-                + alpha_angular * L_angular
-                + alpha_contrastive * L_contrastive
-                + alpha_mmd * L_mmd
-                + alpha_logit * L_logit
-    Task losses: L2CS-Net binned regression on student predictions
-    Angular losses: Direct L1 on pitch/yaw
-    Contrastive: Feature-level mutual information maximization
-    MMD: Distribution matching
-    Logit: Soft target distillation
     """
-    def __init__(
-        self,
-        gaze_bins: int = 90,
-        teacher_feature_dim: int = 256,
-        student_feature_dim: int = 128,
-        alpha_angular: float = 1.0,
-        alpha_contrastive: float = 0.5,
-        alpha_mmd: float = 0.1,
-        alpha_logit: float = 0.5,
-    ):
         super().__init__()
         self.angular_loss = AngularLoss()
         self.pitch_l2cs = L2CSLoss(gaze_bins)
         self.yaw_l2cs = L2CSLoss(gaze_bins)
-        self.contrastive_loss = ContrastiveDistillationLoss(
-            teacher_feature_dim, student_feature_dim
-        )
         self.mmd_loss = DistributionMatchingLoss()
         self.logit_loss = LogitDistillationLoss()
         self.alpha_angular = alpha_angular
         self.alpha_contrastive = alpha_contrastive
         self.alpha_mmd = alpha_mmd
         self.alpha_logit = alpha_logit
-    def forward(
-        self,
-        student_pitch: torch.Tensor,
-        student_yaw: torch.Tensor,
-        student_pitch_logits: torch.Tensor,
-        student_yaw_logits: torch.Tensor,
-        student_features: torch.Tensor,
-        teacher_pitch: torch.Tensor,
-        teacher_yaw: torch.Tensor,
-        teacher_pitch_logits: torch.Tensor,
-        teacher_yaw_logits: torch.Tensor,
-        teacher_features: torch.Tensor,
-        pitch_target: torch.Tensor,
-        yaw_target: torch.Tensor,
-    ):
-        """
-        Returns:
-            total_loss: scalar
-            loss_dict: dict of individual losses for logging
-        """
-        # 1. Task losses (student predictions vs ground truth)
-        task_pitch = self.pitch_l2cs(student_pitch_logits, student_pitch, pitch_target)
-        task_yaw = self.yaw_l2cs(student_yaw_logits, student_yaw, yaw_target)
         loss_task = task_pitch + task_yaw
-        # 2. Angular loss (direct L1 in degrees)
-        loss_angular = self.alpha_angular * self.angular_loss(
-            student_pitch, student_yaw, pitch_target, yaw_target
-        )
-        # 3. Contrastive feature distillation
-        loss_contrastive = self.alpha_contrastive * self.contrastive_loss(
-            teacher_features.detach(), student_features
-        )
-        # 4. Distribution matching (MMD)
-        loss_mmd = self.alpha_mmd * self.mmd_loss(
-            teacher_features.detach(), student_features
-        )
-        # 5. Logit distillation (teacher soft targets)
-        loss_logit_pitch = self.alpha_logit * self.logit_loss(
-            student_pitch_logits, teacher_pitch_logits.detach()
-        )
-        loss_logit_yaw = self.alpha_logit * self.logit_loss(
-            student_yaw_logits, teacher_yaw_logits.detach()
-        )
-        loss_logit = loss_logit_pitch + loss_logit_yaw
-        # Total
         total_loss = loss_task + loss_angular + loss_contrastive + loss_mmd + loss_logit
         loss_dict = {
@@ -300,5 +158,4 @@ class PriviGazeDistillationLoss(nn.Module):
             'loss_mmd': loss_mmd.item(),
             'loss_logit': loss_logit.item(),
         }
-        return total_loss, loss_dict

 """
 PriviGaze Distillation Loss - Privileged Knowledge Distillation for Gaze Estimation
+Components:
 1. Angular gaze loss (L1 on pitch/yaw in degrees)
 2. L2CS-Net style binned classification + regression loss
 3. Feature-level distillation (WCoRD-inspired contrastive + distribution matching)
 4. Logit-level distillation (KL on soft targets from teacher)
 """
 import torch
 class L2CSLoss(nn.Module):
     """L2CS-Net style combined classification + regression loss per angle.
     Loss = CrossEntropy(binned_logits, binned_target) + beta * MSE(continuous_pred, continuous_target)
     """
         super().__init__()
         self.gaze_bins = gaze_bins
         self.beta = beta
+        self.register_buffer('bin_centers', torch.linspace(-90.0, 90.0, gaze_bins))
         self.ce_loss = nn.CrossEntropyLoss()
+    def _angle_to_bins(self, angles):
         angles_clamped = angles.clamp(-90.0, 90.0)
         bin_width = 180.0 / (self.gaze_bins - 1)
         bins = ((angles_clamped + 90.0) / bin_width).long()
         return bins.clamp(0, self.gaze_bins - 1)
     def forward(self, logits, continuous_pred, angle_target):
         bin_targets = self._angle_to_bins(angle_target)
         ce = self.ce_loss(logits, bin_targets)
         mse = F.mse_loss(continuous_pred, angle_target)
 class AngularLoss(nn.Module):
+    """Direct angular error loss in degrees (L1 on pitch and yaw)."""
     def __init__(self, reduction: str = 'mean'):
         super().__init__()
         self.reduction = reduction
     def forward(self, pitch_pred, yaw_pred, pitch_target, yaw_target):
         pitch_loss = F.l1_loss(pitch_pred, pitch_target, reduction=self.reduction)
         yaw_loss = F.l1_loss(yaw_pred, yaw_target, reduction=self.reduction)
         return pitch_loss + yaw_loss
 class ContrastiveDistillationLoss(nn.Module):
     """WCoRD-inspired contrastive feature distillation.
+    InfoNCE loss maximizing mutual information between teacher and student features.
+    Projects both to a shared space before computing similarity.
     """
+    def __init__(self, teacher_dim: int = 256, student_dim: int = 128, proj_dim: int = 128, temperature: float = 0.1):
         super().__init__()
         self.teacher_proj = nn.Sequential(
+            nn.Linear(teacher_dim, proj_dim), nn.GELU(), nn.Linear(proj_dim, proj_dim))
         self.student_proj = nn.Sequential(
+            nn.Linear(student_dim, proj_dim), nn.GELU(), nn.Linear(proj_dim, proj_dim))
         self.temperature = temperature
+    def forward(self, teacher_feat, student_feat):
+        t = F.normalize(self.teacher_proj(teacher_feat), dim=-1)
+        s = F.normalize(self.student_proj(student_feat), dim=-1)
+        logits = torch.matmul(t, s.T) / self.temperature
         labels = torch.arange(logits.shape[0], device=logits.device)
         loss_t2s = F.cross_entropy(logits, labels)
         loss_s2t = F.cross_entropy(logits.T, labels)
         return (loss_t2s + loss_s2t) / 2.0
 class DistributionMatchingLoss(nn.Module):
+    """MMD-based distribution matching between teacher and student features."""
     def __init__(self, kernel: str = 'rbf'):
         super().__init__()
         self.kernel = kernel
+    def _rbf_kernel(self, x, y, sigma=1.0):
         xx = torch.matmul(x, x.T)
         yy = torch.matmul(y, y.T)
         xy = torch.matmul(x, y.T)
         rx = xx.diag().unsqueeze(0)
         ry = yy.diag().unsqueeze(0)
+        k_xx = torch.exp(-(rx + rx.T - 2*xx) / (2*sigma**2))
+        k_yy = torch.exp(-(ry + ry.T - 2*yy) / (2*sigma**2))
+        k_xy = torch.exp(-(rx + ry.T - 2*xy) / (2*sigma**2))
+        return k_xx.mean() + k_yy.mean() - 2*k_xy.mean()
+    def forward(self, teacher_feat, student_feat):
         t = F.normalize(teacher_feat, dim=-1)
         s = F.normalize(student_feat, dim=-1)
         return self._rbf_kernel(t, s)
 class LogitDistillationLoss(nn.Module):
+    """KL divergence distillation on soft gaze bin probabilities."""
     def __init__(self, temperature: float = 3.0):
         super().__init__()
         self.temperature = temperature
     def forward(self, student_logits, teacher_logits):
         student_soft = F.log_softmax(student_logits / self.temperature, dim=-1)
         teacher_soft = F.softmax(teacher_logits / self.temperature, dim=-1)
+        return F.kl_div(student_soft, teacher_soft, reduction='batchmean') * (self.temperature**2)
 class PriviGazeDistillationLoss(nn.Module):
     """Complete privileged distillation loss for gaze estimation.
+    L_total = L_task + α_angular·L_angular + α_contrastive·L_contrastive
+            + α_mmd·L_mmd + α_logit·L_logit
     """
+    def __init__(self, gaze_bins=90, teacher_feature_dim=256, student_feature_dim=128,
+                 alpha_angular=1.0, alpha_contrastive=0.5, alpha_mmd=0.1, alpha_logit=0.5):
         super().__init__()
         self.angular_loss = AngularLoss()
         self.pitch_l2cs = L2CSLoss(gaze_bins)
         self.yaw_l2cs = L2CSLoss(gaze_bins)
+        self.contrastive_loss = ContrastiveDistillationLoss(teacher_feature_dim, student_feature_dim)
         self.mmd_loss = DistributionMatchingLoss()
         self.logit_loss = LogitDistillationLoss()
         self.alpha_angular = alpha_angular
         self.alpha_contrastive = alpha_contrastive
         self.alpha_mmd = alpha_mmd
         self.alpha_logit = alpha_logit
+    def forward(self, s_pitch, s_yaw, sp_logits, sy_logits, s_features,
+                t_pitch, t_yaw, tp_logits, ty_logits, t_features,
+                pitch_target, yaw_target):
+        task_pitch = self.pitch_l2cs(sp_logits, s_pitch, pitch_target)
+        task_yaw = self.yaw_l2cs(sy_logits, s_yaw, yaw_target)
         loss_task = task_pitch + task_yaw
+        loss_angular = self.alpha_angular * self.angular_loss(s_pitch, s_yaw, pitch_target, yaw_target)
+        loss_contrastive = self.alpha_contrastive * self.contrastive_loss(t_features.detach(), s_features)
+        loss_mmd = self.alpha_mmd * self.mmd_loss(t_features.detach(), s_features)
+        loss_logit = (self.alpha_logit * self.logit_loss(sp_logits, tp_logits.detach()) +
+                      self.alpha_logit * self.logit_loss(sy_logits, ty_logits.detach()))
         total_loss = loss_task + loss_angular + loss_contrastive + loss_mmd + loss_logit
         loss_dict = {
             'loss_mmd': loss_mmd.item(),
             'loss_logit': loss_logit.item(),
         }
+        return total_loss, loss_dict