BcantCode committed on
Commit 0607636 · verified · 1 Parent(s): e97351b

Upload models/student.py

Files changed (1)
  1. models/student.py +313 -0
models/student.py ADDED
"""
PriviGaze Student Model - Ultra-Compact Gaze Estimation CNN

Design Philosophy:
- ~50-100K parameters total (target: fit on microcontrollers with TinyML)
- Inception blocks with factorized convolutions (exploits eye biology)
- Takes only a light-corrected grayscale face as input (no eye crops)
- Designed for on-device inference with <1 ms latency

Architecture inspired by:
- "One Eye is All You Need" (Athavale et al., 2022) - Inception for gaze
- DFT Gaze from GazeGen (281K params, distilled from a 10x larger teacher)
- Eye biology: horizontal/vertical edge detectors mimic ocular muscle structure

Key design choices for disability support:
- Grayscale only: robust to varied lighting/occlusion
- Large receptive field: handles droopy eyes, head roll
- Factorized convolutions: 1x3 + 3x1 instead of 3x3 for efficiency
"""

import torch
import torch.nn as nn
import torch.nn.functional as F


def conv_bn_relu(in_ch, out_ch, kernel_size, stride=1, padding=0, groups=1):
    """Standard Conv-BN-ReLU block."""
    return nn.Sequential(
        nn.Conv2d(in_ch, out_ch, kernel_size, stride, padding, groups=groups, bias=False),
        nn.BatchNorm2d(out_ch),
        nn.ReLU(inplace=True),
    )

def depthwise_separable_conv(in_ch, out_ch, kernel_size, stride=1, padding=0):
    """Depthwise separable convolution for extreme parameter efficiency."""
    return nn.Sequential(
        # Depthwise
        nn.Conv2d(in_ch, in_ch, kernel_size, stride, padding, groups=in_ch, bias=False),
        nn.BatchNorm2d(in_ch),
        nn.ReLU(inplace=True),
        # Pointwise
        nn.Conv2d(in_ch, out_ch, 1, bias=False),
        nn.BatchNorm2d(out_ch),
        nn.ReLU(inplace=True),
    )
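
# Rough cost check for depthwise_separable_conv (illustrative arithmetic, not
# part of the model): a standard 3x3 conv mapping 64 -> 64 channels needs
# 64 * 64 * 9 = 36,864 weights, while the depthwise (64 * 9 = 576) plus
# pointwise (64 * 64 = 4,096) pair needs 4,672, roughly 7.9x fewer, ignoring
# BatchNorm parameters. A quick empirical check:
#
#   block = depthwise_separable_conv(64, 64, 3, padding=1)
#   conv_weights = sum(p.numel() for p in block.parameters() if p.dim() == 4)
#   assert conv_weights == 64 * 9 + 64 * 64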


class FactorizedConvBlock(nn.Module):
    """Factorized 3x3 into 1x3 + 3x1 for efficiency.

    This mimics the ocular muscle structure: horizontal rectus (1x3)
    and vertical rectus (3x1) for detecting gaze direction.
    """

    def __init__(self, in_ch, out_ch, stride=1):
        super().__init__()
        mid_ch = out_ch // 2

        self.horizontal = nn.Sequential(
            nn.Conv2d(in_ch, mid_ch, (1, 3), stride, (0, 1), bias=False),
            nn.BatchNorm2d(mid_ch),
            nn.ReLU(inplace=True),
        )

        self.vertical = nn.Sequential(
            nn.Conv2d(in_ch, out_ch - mid_ch, (3, 1), stride, (1, 0), bias=False),
            nn.BatchNorm2d(out_ch - mid_ch),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        h = self.horizontal(x)
        v = self.vertical(x)
        return torch.cat([h, v], dim=1)
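
# Cost sketch for the factorization above (illustrative, not part of the
# model): with in_ch = out_ch = C, a full 3x3 conv needs 9 * C^2 weights,
# while the parallel 1x3 branch (C -> C/2, 3 * C * C/2 weights) plus the
# 3x1 branch (C -> C/2, 3 * C * C/2 weights) totals 3 * C^2, one third of
# the full conv for the same nominal extent along each axis.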


class InceptionBlock(nn.Module):
    """Lightweight inception block with factorized convolutions.

    Branches:
    1. 1x1 conv (pointwise channel mixing; cheap illumination cues)
    2. Factorized 1x3 + 3x1 (edge detection in H/V directions)
    3. MaxPool + 1x1 (spatial context)
    4. Two stacked 3x3 depthwise separable convs (standard feature extraction)

    Branch 4 uses depthwise separable convolutions; the other branches are
    already cheap (1x1, factorized, or pooling), keeping the whole block
    parameter-efficient.
    """

    def __init__(self, in_ch, out_ch):
        super().__init__()
        # Each branch outputs branch_ch = max(out_ch // 4, 8) channels
        branch_ch = max(out_ch // 4, 8)

        # Branch 1: 1x1 pointwise
        self.branch1 = nn.Sequential(
            nn.Conv2d(in_ch, branch_ch, 1, bias=False),
            nn.BatchNorm2d(branch_ch),
            nn.ReLU(inplace=True),
        )

        # Branch 2: Factorized 1x3 + 3x1
        self.branch2 = FactorizedConvBlock(in_ch, branch_ch)

        # Branch 3: MaxPool + 1x1
        self.branch3 = nn.Sequential(
            nn.MaxPool2d(3, stride=1, padding=1),
            nn.Conv2d(in_ch, branch_ch, 1, bias=False),
            nn.BatchNorm2d(branch_ch),
            nn.ReLU(inplace=True),
        )

        # Branch 4: Two stacked 3x3 depthwise separable
        self.branch4 = nn.Sequential(
            depthwise_separable_conv(in_ch, branch_ch, 3, padding=1),
            depthwise_separable_conv(branch_ch, branch_ch, 3, padding=1),
        )

        # Final fusion
        total_ch = branch_ch * 4
        self.fusion = nn.Sequential(
            nn.Conv2d(total_ch, out_ch, 1, bias=False),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        b1 = self.branch1(x)
        b2 = self.branch2(x)
        b3 = self.branch3(x)
        b4 = self.branch4(x)
        out = torch.cat([b1, b2, b3, b4], dim=1)
        return self.fusion(out)
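
# Minimal shape check for the block above (stage-1 sizes from the student
# below; illustrative, not part of the model):
#
#   block = InceptionBlock(32, 64)
#   y = block(torch.randn(1, 32, 56, 56))
#   assert y.shape == (1, 64, 56, 56)  # every branch preserves H x W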


class LightCorrection(nn.Module):
    """Learnable light correction for grayscale face input.

    Applies a learnable gamma correction followed by an affine transform
    (a single channel for grayscale) to normalize lighting variations.
    This is critical for disability support, where users may be in varied
    lighting conditions.
    """

    def __init__(self):
        super().__init__()
        # Learnable gamma correction parameter plus affine gain/offset
        self.gamma = nn.Parameter(torch.ones(1))
        self.alpha = nn.Parameter(torch.ones(1))
        self.beta = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        # x: [B, 1, H, W]
        # Gamma correction; the clamp keeps pow well-behaved at zero
        x = torch.pow(x.clamp(min=1e-6), self.gamma)
        # Affine transform
        x = self.alpha * x + self.beta
        return x
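
# Behavior sketch (with the default init gamma = alpha = 1, beta = 0, the
# module is the identity on inputs already above 1e-6):
#
#   lc = LightCorrection()
#   x = torch.rand(2, 1, 224, 224)
#   assert torch.allclose(lc(x), x.clamp(min=1e-6))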


class PriviGazeStudent(nn.Module):
    """Ultra-compact student model for on-device gaze estimation.

    Input:
        - face_gray: [B, 1, 224, 224] light-corrected grayscale face

    Output:
        - pitch_pred: [B] gaze pitch in degrees
        - yaw_pred: [B] gaze yaw in degrees
        - features: [B, 128] feature representation (for distillation matching)

    Target: ~80K parameters, <1 ms inference on mobile
    """

    def __init__(
        self,
        input_channels: int = 1,  # Grayscale
        feature_dim: int = 128,
        gaze_bins: int = 90,
    ):
        super().__init__()

        # Light correction
        self.light_correction = LightCorrection()

        # Stem: initial feature extraction
        self.stem = nn.Sequential(
            nn.Conv2d(input_channels, 32, 3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            # 32 x 112 x 112
            depthwise_separable_conv(32, 32, 3, stride=2, padding=1),
            # 32 x 56 x 56
        )

        # Stage 1: Inception blocks at high resolution
        self.stage1 = nn.Sequential(
            InceptionBlock(32, 64),
            # 64 x 56 x 56
            depthwise_separable_conv(64, 64, 3, stride=2, padding=1),
            # 64 x 28 x 28
        )

        # Stage 2: Deeper features
        self.stage2 = nn.Sequential(
            InceptionBlock(64, 96),
            # 96 x 28 x 28
            depthwise_separable_conv(96, 96, 3, stride=2, padding=1),
            # 96 x 14 x 14
        )

        # Stage 3: Abstract features
        self.stage3 = nn.Sequential(
            InceptionBlock(96, 128),
            # 128 x 14 x 14
            depthwise_separable_conv(128, 128, 3, stride=2, padding=1),
            # 128 x 7 x 7
        )

        # Stage 4: Global context
        self.stage4 = nn.Sequential(
            InceptionBlock(128, 160),
            # 160 x 7 x 7
            nn.AdaptiveAvgPool2d(1),
            # 160 x 1 x 1
        )

        # Feature projection
        self.feature_projection = nn.Sequential(
            nn.Flatten(),
            nn.Linear(160, feature_dim),
            nn.GELU(),
            nn.LayerNorm(feature_dim),
        )

        # Gaze regression heads (L2CS-Net style: per-angle binned regression)
        self.pitch_head = nn.Sequential(
            nn.Linear(feature_dim, feature_dim // 2),
            nn.GELU(),
            nn.Linear(feature_dim // 2, gaze_bins),
        )

        self.yaw_head = nn.Sequential(
            nn.Linear(feature_dim, feature_dim // 2),
            nn.GELU(),
            nn.Linear(feature_dim // 2, gaze_bins),
        )

        # Bin centers for expectation-based regression
        self.register_buffer(
            'bin_centers',
            torch.linspace(-90.0, 90.0, gaze_bins),
        )
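
        # Note on resolution (derived from the values above): 90 bins spanning
        # [-90, 90] degrees place adjacent centers 180 / 89, about 2.02 degrees,
        # apart; the softmax expectation in forward() interpolates between
        # centers, so predictions are continuous rather than bin-quantized.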

        self.feature_dim = feature_dim
        self.gaze_bins = gaze_bins

    def forward(self, face_gray):
        """
        Args:
            face_gray: [B, 1, 224, 224] grayscale face image

        Returns:
            pitch_pred: [B]
            yaw_pred: [B]
            features: [B, feature_dim]
        """
        # Light correction
        x = self.light_correction(face_gray)

        # Stem
        x = self.stem(x)

        # Stages
        x = self.stage1(x)
        x = self.stage2(x)
        x = self.stage3(x)
        x = self.stage4(x)

        # Feature projection
        features = self.feature_projection(x)

        # Gaze prediction: per-angle logits over bins
        pitch_logits = self.pitch_head(features)
        yaw_logits = self.yaw_head(features)

        pitch_probs = F.softmax(pitch_logits, dim=-1)
        yaw_probs = F.softmax(yaw_logits, dim=-1)

        # Expectation over bin centers yields a continuous angle in degrees
        pitch_pred = (pitch_probs * self.bin_centers).sum(dim=-1)
        yaw_pred = (yaw_probs * self.bin_centers).sum(dim=-1)

        return pitch_pred, yaw_pred, features

    def get_penultimate_features(self, face_gray):
        """Return features before regression heads for distillation."""
        _, _, features = self.forward(face_gray)
        return features
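
# Distillation usage sketch (the teacher and its features are hypothetical;
# only get_penultimate_features comes from this file):
#
#   student = PriviGazeStudent(feature_dim=128)
#   s_feat = student.get_penultimate_features(face_gray)  # [B, 128]
#   t_feat = teacher_features                             # [B, 128], from teacher
#   distill_loss = F.mse_loss(s_feat, t_feat)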


def count_parameters(model):
    """Count trainable parameters."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


if __name__ == "__main__":
    model = PriviGazeStudent()
    params = count_parameters(model)
    print(f"Student model parameters: {params:,}")

    # Test forward pass
    dummy = torch.randn(1, 1, 224, 224)
    pitch, yaw, features = model(dummy)
    print(f"Pitch: {pitch.shape}, Yaw: {yaw.shape}, Features: {features.shape}")