BcantCode commited on
Commit
94cb2c0
·
verified ·
1 Parent(s): 0607636

Upload models/distillation_loss.py

Browse files
Files changed (1) hide show
  1. models/distillation_loss.py +304 -0
models/distillation_loss.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PriviGaze Distillation Loss - Privileged Knowledge Distillation for Gaze Estimation
3
+
4
+ Key components:
5
+ 1. Angular gaze loss (L1 on pitch/yaw in degrees)
6
+ 2. L2CS-Net style binned classification + regression loss
7
+ 3. Feature-level distillation (WCoRD-inspired contrastive + distribution matching)
8
+ 4. Logit-level distillation (KL on soft targets from teacher)
9
+
10
+ The teacher has access to privileged information (RGB eye crops, high-res face)
11
+ that the student does NOT have at inference time.
12
+ """
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F
17
+
18
+
19
class L2CSLoss(nn.Module):
    """L2CS-Net style combined classification + regression loss per angle.

    From "L2CS-Net: Fine-Grained Gaze Estimation in Unconstrained Environments"
    (Abdelrahman et al., 2022)

    Loss = CrossEntropy(binned_logits, binned_target) + beta * MSE(continuous_pred, continuous_target)
    """

    def __init__(self, gaze_bins: int = 90, beta: float = 1.0):
        """
        Args:
            gaze_bins: number of discrete bins spanning [-90, 90] degrees.
            beta: weight of the MSE regression term relative to the CE term.
        """
        super().__init__()
        self.gaze_bins = gaze_bins
        self.beta = beta
        # Evenly spaced bin centers over [-90, 90]; not used by the loss itself
        # but registered so callers can decode expected angles from bin probs.
        self.register_buffer(
            'bin_centers',
            torch.linspace(-90.0, 90.0, gaze_bins)
        )
        self.ce_loss = nn.CrossEntropyLoss()

    def _angle_to_bins(self, angles: torch.Tensor) -> torch.Tensor:
        """Map continuous angles (degrees) to the index of the nearest bin center.

        Fix: round instead of truncating. Truncation biased every assignment
        toward the lower bin (e.g. -88.5 deg landed in bin 0 even though the
        bin-1 center at ~-87.98 deg is closer); rounding matches the
        `bin_centers` buffer defined in __init__.
        """
        angles_clamped = angles.clamp(-90.0, 90.0)
        bin_width = 180.0 / (self.gaze_bins - 1)
        bins = torch.round((angles_clamped + 90.0) / bin_width).long()
        return bins.clamp(0, self.gaze_bins - 1)

    def forward(self, logits, continuous_pred, angle_target):
        """
        Args:
            logits: [B, gaze_bins] - classification logits
            continuous_pred: [B] - continuous angle prediction
            angle_target: [B] - ground truth angle in degrees

        Returns:
            loss: scalar (CE on binned targets + beta-weighted MSE)
        """
        bin_targets = self._angle_to_bins(angle_target)
        ce = self.ce_loss(logits, bin_targets)
        mse = F.mse_loss(continuous_pred, angle_target)
        return ce + self.beta * mse
59
+
60
+
61
class AngularLoss(nn.Module):
    """Mean absolute angular error in degrees.

    Applies an L1 penalty to pitch and yaw separately and returns their sum.
    This mirrors the standard evaluation metric for gaze estimation.
    """

    def __init__(self, reduction: str = 'mean'):
        super().__init__()
        self.reduction = reduction

    def forward(self, pitch_pred, yaw_pred, pitch_target, yaw_target):
        """Sum of per-angle L1 losses.

        Args:
            pitch_pred: [B] predicted pitch in degrees.
            yaw_pred: [B] predicted yaw in degrees.
            pitch_target: [B] ground-truth pitch in degrees.
            yaw_target: [B] ground-truth yaw in degrees.

        Returns:
            loss: scalar (mean angular error in degrees when reduction='mean').
        """
        pairs = ((pitch_pred, pitch_target), (yaw_pred, yaw_target))
        errors = [
            F.l1_loss(pred, truth, reduction=self.reduction)
            for pred, truth in pairs
        ]
        return errors[0] + errors[1]
86
+
87
+
88
class ContrastiveDistillationLoss(nn.Module):
    """WCoRD-inspired contrastive feature distillation.

    Maximizes mutual information between teacher and student feature
    representations using a symmetric InfoNCE contrastive loss.

    From "Wasserstein Contrastive Representation Distillation" (Chen et al., 2020)
    """

    def __init__(
        self,
        feature_dim: int = 256,
        proj_dim: int = 128,
        temperature: float = 0.1,
        student_dim: int = 128,
    ):
        """
        Args:
            feature_dim: teacher feature dimensionality.
            proj_dim: dimensionality of the shared projection space.
            temperature: softmax temperature for the InfoNCE logits.
            student_dim: student feature dimensionality. Previously hard-coded
                to 128 inside the module; now a parameter (default preserves
                the old behavior) so backbones of any width work.
        """
        super().__init__()
        # Project both teacher and student features to a shared space.
        self.teacher_proj = nn.Sequential(
            nn.Linear(feature_dim, proj_dim),
            nn.GELU(),
            nn.Linear(proj_dim, proj_dim),
        )

        self.student_proj = nn.Sequential(
            nn.Linear(student_dim, proj_dim),
            nn.GELU(),
            nn.Linear(proj_dim, proj_dim),
        )

        self.temperature = temperature

    def forward(self, teacher_feat: torch.Tensor, student_feat: torch.Tensor) -> torch.Tensor:
        """
        Args:
            teacher_feat: [B, feature_dim] - teacher's penultimate features
            student_feat: [B, student_dim] - student's penultimate features

        Returns:
            contrastive_loss: scalar
        """
        # Project to shared space and L2-normalize so logits are cosine sims.
        t = F.normalize(self.teacher_proj(teacher_feat), dim=-1)  # [B, proj_dim]
        s = F.normalize(self.student_proj(student_feat), dim=-1)  # [B, proj_dim]

        # Similarity matrix: positives on the diagonal (t_i, s_i),
        # negatives off-diagonal (t_i, s_j), i != j.
        logits = torch.matmul(t, s.T) / self.temperature  # [B, B]

        # InfoNCE: each teacher feature should match its paired student feature.
        labels = torch.arange(logits.shape[0], device=logits.device)

        # Symmetric loss: teacher -> student and student -> teacher.
        loss_t2s = F.cross_entropy(logits, labels)
        loss_s2t = F.cross_entropy(logits.T, labels)

        return (loss_t2s + loss_s2t) / 2.0
140
+
141
+
142
class DistributionMatchingLoss(nn.Module):
    """Distribution matching loss for feature-level knowledge transfer.

    Uses a (biased) Maximum Mean Discrepancy (MMD) estimate with an RBF
    kernel to match feature distributions between teacher and student.
    This is a simpler alternative to Wasserstein/Sinkhorn while still
    effective.
    """

    def __init__(self, kernel: str = 'rbf'):
        super().__init__()
        # Only the RBF kernel is implemented; the argument is kept for
        # interface compatibility / future kernel choices.
        self.kernel = kernel

    def _rbf_kernel(self, x: torch.Tensor, y: torch.Tensor, sigma: float = 1.0) -> torch.Tensor:
        """Biased MMD^2 between x [Bx, D] and y [By, D] under an RBF kernel.

        Fix: the cross term used `rx + ry.T`, which pairs ||x_j||^2 and
        ||y_i||^2 with the inner product x_i . y_j (and silently broadcasts
        to the wrong shape when Bx != By). The squared distance
        ||x_i - y_j||^2 = ||x_i||^2 + ||y_j||^2 - 2 x_i . y_j requires
        `rx.T + ry` instead. (The bug was masked in forward() because
        L2-normalized inputs make all squared norms equal to 1.)
        """
        xx = torch.matmul(x, x.T)
        yy = torch.matmul(y, y.T)
        xy = torch.matmul(x, y.T)

        rx = xx.diag().unsqueeze(0)  # [1, Bx] squared norms of x rows
        ry = yy.diag().unsqueeze(0)  # [1, By] squared norms of y rows

        k_xx = torch.exp(- (rx + rx.T - 2 * xx) / (2 * sigma ** 2))
        k_yy = torch.exp(- (ry + ry.T - 2 * yy) / (2 * sigma ** 2))
        # ||x_i - y_j||^2 = ||x_i||^2 + ||y_j||^2 - 2 x_i . y_j
        k_xy = torch.exp(- (rx.T + ry - 2 * xy) / (2 * sigma ** 2))

        return k_xx.mean() + k_yy.mean() - 2 * k_xy.mean()

    def forward(self, teacher_feat: torch.Tensor, student_feat: torch.Tensor) -> torch.Tensor:
        """Compute MMD between L2-normalized teacher and student features.

        NOTE(review): the cross-kernel matmul requires teacher_feat and
        student_feat to share the same feature dimension — confirm callers
        project both to a common width before invoking this loss.
        """
        t = F.normalize(teacher_feat, dim=-1)
        s = F.normalize(student_feat, dim=-1)
        return self._rbf_kernel(t, s)
174
+
175
+
176
class LogitDistillationLoss(nn.Module):
    """Temperature-scaled KL distillation on gaze-bin logits.

    Classic soft-target knowledge distillation: the student's softened
    log-probabilities are pulled toward the teacher's softened distribution
    over gaze bins, with the usual T^2 rescaling so gradient magnitudes stay
    comparable across temperatures.
    """

    def __init__(self, temperature: float = 3.0):
        super().__init__()
        self.temperature = temperature

    def forward(self, student_logits, teacher_logits):
        """
        Args:
            student_logits: [B, gaze_bins]
            teacher_logits: [B, gaze_bins] (detached)

        Returns:
            kl_loss: scalar
        """
        temp = self.temperature
        log_q = F.log_softmax(student_logits / temp, dim=-1)
        p = F.softmax(teacher_logits / temp, dim=-1)
        kl = F.kl_div(log_q, p, reduction='batchmean')
        return kl * temp * temp
199
+
200
+
201
class PriviGazeDistillationLoss(nn.Module):
    """Complete privileged distillation loss for gaze estimation.

    Total loss = L_task                      (unweighted; there is no alpha_task knob)
               + alpha_angular * L_angular
               + alpha_contrastive * L_contrastive
               + alpha_mmd * L_mmd
               + alpha_logit * L_logit

    Task losses: L2CS-Net binned regression on student predictions
    Angular losses: Direct L1 on pitch/yaw
    Contrastive: Feature-level mutual information maximization
    MMD: Distribution matching
    Logit: Soft target distillation
    """

    def __init__(
        self,
        gaze_bins: int = 90,
        teacher_feature_dim: int = 256,
        student_feature_dim: int = 128,
        alpha_angular: float = 1.0,
        alpha_contrastive: float = 0.5,
        alpha_mmd: float = 0.1,
        alpha_logit: float = 0.5,
    ):
        super().__init__()

        self.angular_loss = AngularLoss()
        # Separate binned losses per angle so each keeps its own bin buffer.
        self.pitch_l2cs = L2CSLoss(gaze_bins)
        self.yaw_l2cs = L2CSLoss(gaze_bins)
        # NOTE(review): student_feature_dim is bound positionally to
        # ContrastiveDistillationLoss's *proj_dim* parameter, while that
        # module's student projection input width is fixed at 128. This only
        # lines up when student_feature_dim == 128 — confirm intent.
        self.contrastive_loss = ContrastiveDistillationLoss(
            teacher_feature_dim, student_feature_dim
        )
        # NOTE(review): DistributionMatchingLoss computes t @ s.T, which
        # requires teacher and student features to share the same width;
        # with teacher_feature_dim != student_feature_dim the MMD term will
        # fail at runtime — verify feature shapes at the call site.
        self.mmd_loss = DistributionMatchingLoss()
        self.logit_loss = LogitDistillationLoss()

        self.alpha_angular = alpha_angular
        self.alpha_contrastive = alpha_contrastive
        self.alpha_mmd = alpha_mmd
        self.alpha_logit = alpha_logit

    def forward(
        self,
        student_pitch: torch.Tensor,
        student_yaw: torch.Tensor,
        student_pitch_logits: torch.Tensor,
        student_yaw_logits: torch.Tensor,
        student_features: torch.Tensor,
        # teacher_pitch / teacher_yaw are currently unused below; kept so
        # callers can pass the teacher's full output tuple unchanged.
        teacher_pitch: torch.Tensor,
        teacher_yaw: torch.Tensor,
        teacher_pitch_logits: torch.Tensor,
        teacher_yaw_logits: torch.Tensor,
        teacher_features: torch.Tensor,
        pitch_target: torch.Tensor,
        yaw_target: torch.Tensor,
    ):
        """
        Args:
            student_pitch / student_yaw: [B] continuous student predictions (degrees).
            student_pitch_logits / student_yaw_logits: [B, gaze_bins] student bin logits.
            student_features: [B, student_feature_dim] student penultimate features.
            teacher_pitch / teacher_yaw: [B] teacher continuous predictions (unused here).
            teacher_pitch_logits / teacher_yaw_logits: [B, gaze_bins] teacher bin logits.
            teacher_features: [B, teacher_feature_dim] teacher penultimate features.
            pitch_target / yaw_target: [B] ground-truth angles in degrees.

        Returns:
            total_loss: scalar (each distillation term already alpha-weighted)
            loss_dict: dict of individual losses for logging
        """
        # 1. Task losses (student predictions vs ground truth)
        task_pitch = self.pitch_l2cs(student_pitch_logits, student_pitch, pitch_target)
        task_yaw = self.yaw_l2cs(student_yaw_logits, student_yaw, yaw_target)
        loss_task = task_pitch + task_yaw

        # 2. Angular loss (direct L1 in degrees)
        loss_angular = self.alpha_angular * self.angular_loss(
            student_pitch, student_yaw, pitch_target, yaw_target
        )

        # 3. Contrastive feature distillation (teacher detached: student-only gradients)
        loss_contrastive = self.alpha_contrastive * self.contrastive_loss(
            teacher_features.detach(), student_features
        )

        # 4. Distribution matching (MMD)
        loss_mmd = self.alpha_mmd * self.mmd_loss(
            teacher_features.detach(), student_features
        )

        # 5. Logit distillation (teacher soft targets), one KL term per angle
        loss_logit_pitch = self.alpha_logit * self.logit_loss(
            student_pitch_logits, teacher_pitch_logits.detach()
        )
        loss_logit_yaw = self.alpha_logit * self.logit_loss(
            student_yaw_logits, teacher_yaw_logits.detach()
        )
        loss_logit = loss_logit_pitch + loss_logit_yaw

        # Total (alpha weights already folded into each term above)
        total_loss = loss_task + loss_angular + loss_contrastive + loss_mmd + loss_logit

        # .item() detaches scalars for logging; values are post-alpha-weighting.
        loss_dict = {
            'loss_total': total_loss.item(),
            'loss_task': loss_task.item(),
            'loss_angular': loss_angular.item(),
            'loss_contrastive': loss_contrastive.item(),
            'loss_mmd': loss_mmd.item(),
            'loss_logit': loss_logit.item(),
        }

        return total_loss, loss_dict