# privi-gaze-distill / models / distillation_loss.py
"""
PriviGaze Distillation Loss - Privileged Knowledge Distillation for Gaze Estimation
Components:
1. Angular gaze loss (L1 on pitch/yaw in degrees)
2. L2CS-Net style binned classification + regression loss
3. Feature-level distillation (WCoRD-inspired contrastive + distribution matching)
4. Logit-level distillation (KL on soft targets from teacher)
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
class L2CSLoss(nn.Module):
    """L2CS-Net style combined classification + regression loss for one gaze angle.

    Loss = CrossEntropy(binned_logits, binned_target) + beta * MSE(continuous_pred, continuous_target)
    """
    def __init__(self, gaze_bins: int = 90, beta: float = 1.0):
        super().__init__()
        self.gaze_bins = gaze_bins
        self.beta = beta
        # Bin centers kept as a buffer so they move with .to(device) / state_dict.
        self.register_buffer('bin_centers', torch.linspace(-90.0, 90.0, gaze_bins))
        self.ce_loss = nn.CrossEntropyLoss()

    def _angle_to_bins(self, angles):
        # Map continuous degrees in [-90, 90] to integer bin indices [0, gaze_bins).
        width = 180.0 / (self.gaze_bins - 1)
        indices = ((angles.clamp(-90.0, 90.0) + 90.0) / width).long()
        return indices.clamp(0, self.gaze_bins - 1)

    def forward(self, logits, continuous_pred, angle_target):
        """Return CE on the binned target plus beta-weighted MSE on the raw angle."""
        cls_term = self.ce_loss(logits, self._angle_to_bins(angle_target))
        reg_term = F.mse_loss(continuous_pred, angle_target)
        return cls_term + self.beta * reg_term
class AngularLoss(nn.Module):
    """Direct angular error in degrees: L1 on pitch plus L1 on yaw."""
    def __init__(self, reduction: str = 'mean'):
        super().__init__()
        self.reduction = reduction

    def forward(self, pitch_pred, yaw_pred, pitch_target, yaw_target):
        """Return L1(pitch) + L1(yaw), each reduced with the configured mode."""
        pitch_err = F.l1_loss(pitch_pred, pitch_target, reduction=self.reduction)
        return pitch_err + F.l1_loss(yaw_pred, yaw_target, reduction=self.reduction)
class ContrastiveDistillationLoss(nn.Module):
    """WCoRD-inspired contrastive feature distillation.

    Symmetric InfoNCE over a batch: matching (teacher_i, student_i) pairs are
    positives, all other in-batch pairs are negatives. Both feature sets are
    projected into a shared space before computing cosine similarities.
    """
    def __init__(self, teacher_dim: int = 256, student_dim: int = 128, proj_dim: int = 128, temperature: float = 0.1):
        super().__init__()
        # NOTE: projection heads are built teacher-first, student-second, matching
        # the original construction order (keeps seeded initialization identical).
        self.teacher_proj = nn.Sequential(
            nn.Linear(teacher_dim, proj_dim), nn.GELU(), nn.Linear(proj_dim, proj_dim))
        self.student_proj = nn.Sequential(
            nn.Linear(student_dim, proj_dim), nn.GELU(), nn.Linear(proj_dim, proj_dim))
        self.temperature = temperature

    def forward(self, teacher_feat, student_feat):
        """Return the symmetric (teacher→student + student→teacher) InfoNCE loss."""
        proj_t = F.normalize(self.teacher_proj(teacher_feat), dim=-1)
        proj_s = F.normalize(self.student_proj(student_feat), dim=-1)
        sim = torch.matmul(proj_t, proj_s.T) / self.temperature
        # Diagonal entries are the positive pairs.
        targets = torch.arange(sim.shape[0], device=sim.device)
        forward_nce = F.cross_entropy(sim, targets)
        backward_nce = F.cross_entropy(sim.T, targets)
        return (forward_nce + backward_nce) / 2.0
class DistributionMatchingLoss(nn.Module):
    """MMD-based distribution matching between teacher and student features.

    Computes a biased MMD^2 estimate with an RBF kernel over L2-normalized
    feature batches. Teacher and student batches may have different sizes.
    """
    def __init__(self, kernel: str = 'rbf'):
        super().__init__()
        self.kernel = kernel  # only 'rbf' is implemented; kept for interface parity

    def _rbf_kernel(self, x, y, sigma=1.0):
        """Return MMD^2(x, y) = E[k(x,x')] + E[k(y,y')] - 2 E[k(x,y)] for an RBF kernel.

        x: (N, D), y: (M, D); sigma is the RBF bandwidth.
        """
        xx = torch.matmul(x, x.T)
        yy = torch.matmul(y, y.T)
        xy = torch.matmul(x, y.T)
        rx = xx.diag().unsqueeze(0)  # (1, N) squared norms of x rows
        ry = yy.diag().unsqueeze(0)  # (1, M) squared norms of y rows
        denom = 2 * sigma ** 2
        k_xx = torch.exp(-(rx + rx.T - 2 * xx) / denom)
        k_yy = torch.exp(-(ry + ry.T - 2 * yy) / denom)
        # BUG FIX: the cross distance is ||x_i||^2 + ||y_j||^2 - 2 x_i.y_j,
        # i.e. rx.T + ry with shape (N, M). The original used rx + ry.T, which
        # transposes the norm terms (wrong values for non-unit-norm inputs and a
        # shape error whenever N != M). Forward output is unchanged because it
        # normalizes inputs first (all norms == 1).
        k_xy = torch.exp(-(rx.T + ry - 2 * xy) / denom)
        return k_xx.mean() + k_yy.mean() - 2 * k_xy.mean()

    def forward(self, teacher_feat, student_feat):
        """Return MMD^2 between L2-normalized teacher and student feature batches."""
        t = F.normalize(teacher_feat, dim=-1)
        s = F.normalize(student_feat, dim=-1)
        return self._rbf_kernel(t, s)
class LogitDistillationLoss(nn.Module):
    """Temperature-scaled KL divergence between student and teacher bin logits."""
    def __init__(self, temperature: float = 3.0):
        super().__init__()
        self.temperature = temperature

    def forward(self, student_logits, teacher_logits):
        """Return KL(teacher || student) on temperature-softened distributions.

        Scaled by T^2 so gradient magnitudes stay comparable across temperatures
        (standard Hinton-style distillation).
        """
        tau = self.temperature
        log_p_student = F.log_softmax(student_logits / tau, dim=-1)
        p_teacher = F.softmax(teacher_logits / tau, dim=-1)
        kd = F.kl_div(log_p_student, p_teacher, reduction='batchmean')
        return kd * (tau * tau)
class PriviGazeDistillationLoss(nn.Module):
    """Complete privileged distillation loss for gaze estimation.

    L_total = L_task + alpha_angular * L_angular + alpha_contrastive * L_contrastive
              + alpha_mmd * L_mmd + alpha_logit * L_logit
    """
    def __init__(self, gaze_bins=90, teacher_feature_dim=256, student_feature_dim=128,
                 alpha_angular=1.0, alpha_contrastive=0.5, alpha_mmd=0.1, alpha_logit=0.5):
        super().__init__()
        # Criteria are built in the same order as before so any seeded parameter
        # initialization (the contrastive projection heads) stays identical.
        self.angular_loss = AngularLoss()
        self.pitch_l2cs = L2CSLoss(gaze_bins)
        self.yaw_l2cs = L2CSLoss(gaze_bins)
        self.contrastive_loss = ContrastiveDistillationLoss(teacher_feature_dim, student_feature_dim)
        self.mmd_loss = DistributionMatchingLoss()
        self.logit_loss = LogitDistillationLoss()
        self.alpha_angular = alpha_angular
        self.alpha_contrastive = alpha_contrastive
        self.alpha_mmd = alpha_mmd
        self.alpha_logit = alpha_logit

    def forward(self, s_pitch, s_yaw, sp_logits, sy_logits, s_features,
                t_pitch, t_yaw, tp_logits, ty_logits, t_features,
                pitch_target, yaw_target):
        """Combine supervised and distillation terms.

        Returns (total_loss, dict of per-term scalar floats for logging).
        Teacher outputs are detached everywhere: no gradient flows to the teacher.
        """
        # Supervised task loss: L2CS (CE + MSE) on each angle against ground truth.
        loss_task = (self.pitch_l2cs(sp_logits, s_pitch, pitch_target)
                     + self.yaw_l2cs(sy_logits, s_yaw, yaw_target))
        loss_angular = self.alpha_angular * self.angular_loss(
            s_pitch, s_yaw, pitch_target, yaw_target)
        # Feature-level distillation against the frozen teacher representation.
        teacher_feat = t_features.detach()
        loss_contrastive = self.alpha_contrastive * self.contrastive_loss(teacher_feat, s_features)
        loss_mmd = self.alpha_mmd * self.mmd_loss(teacher_feat, s_features)
        # Logit-level distillation on both angle heads.
        loss_logit = (self.alpha_logit * self.logit_loss(sp_logits, tp_logits.detach())
                      + self.alpha_logit * self.logit_loss(sy_logits, ty_logits.detach()))
        total_loss = loss_task + loss_angular + loss_contrastive + loss_mmd + loss_logit
        loss_dict = {
            'loss_total': total_loss.item(),
            'loss_task': loss_task.item(),
            'loss_angular': loss_angular.item(),
            'loss_contrastive': loss_contrastive.item(),
            'loss_mmd': loss_mmd.item(),
            'loss_logit': loss_logit.item(),
        }
        return total_loss, loss_dict