bdck
/

point-sam-inference

ml-intern

Model card Files Files and versions

xet

Community

bdck committed 1 day ago

Commit

ac6e542

verified ·

1 Parent(s): 7f9bc92

Upload point_sam/model/pc_encoder.py

Browse files

Files changed (1) hide show

point_sam/model/pc_encoder.py +198 -0

point_sam/model/pc_encoder.py ADDED Viewed

	@@ -0,0 +1,198 @@

+# https://github.com/baaivision/Uni3D/blob/main/models/point_encoder.py
+from typing import Union
+import timm
+import torch
+import torch.nn as nn
+from timm.models.eva import Eva
+from timm.models.vision_transformer import VisionTransformer
+from .common import KNNGrouper, NNGrouper, PatchEncoder
+class PatchEmbed(nn.Module):
+    """Group a point cloud into patches and embed every patch.
+
+    Grouping is delegated to a `KNNGrouper` built from `num_patches`,
+    `patch_size`, `radius` and `centralize_features`; the grouped features
+    are then passed through a `PatchEncoder(in_channels, out_channels,
+    [128, 512])`.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        num_patches,
+        patch_size,
+        radius: float = None,
+        centralize_features=False,
+    ):
+        super().__init__()
+        # Exposed as attributes; `PointCloudEncoder` reads `out_channels`.
+        self.in_channels, self.out_channels = in_channels, out_channels
+        grouper_options = dict(
+            radius=radius,
+            centralize_features=centralize_features,
+        )
+        self.grouper = KNNGrouper(num_patches, patch_size, **grouper_options)
+        # presumably the hidden widths of the encoder -- confirm in `.common`
+        encoder_dims = [128, 512]
+        self.patch_encoder = PatchEncoder(in_channels, out_channels, encoder_dims)
+
+    def forward(self, coords: torch.Tensor, features: torch.Tensor):
+        """Return the grouper output with an extra "embeddings" entry.
+
+        The mapping returned by the grouper is updated in place and returned.
+        """
+        grouped = self.grouper(coords, features)
+        # grouped["features"]: [B, L, K, C_in]
+        grouped["embeddings"] = self.patch_encoder(grouped["features"])
+        return grouped
+class PatchDropout(nn.Module):
+    """Randomly drop patches during training.
+
+    References:
+    - https://arxiv.org/abs/2212.00794
+    - `timm.layers.patch_dropout`. It uses `argsort` rather than `topk`, which might be inefficient.
+
+    Args:
+        prob: Fraction of non-prefix patches to drop; must satisfy 0 <= prob < 1.
+        num_prefix_tokens: Number of leading tokens (e.g. CLS) that are never dropped.
+    """
+
+    def __init__(self, prob, num_prefix_tokens: int = 1):
+        super().__init__()
+        assert 0.0 <= prob < 1.0, prob
+        self.prob = prob
+        # exclude CLS token (or other prefix tokens)
+        self.num_prefix_tokens = num_prefix_tokens
+
+    def forward(self, x: torch.Tensor):
+        """Keep a random subset of patches along dim 1.
+
+        Args:
+            x: Tensor of shape [B, L, ...] with any number of trailing dims.
+
+        Returns:
+            `x` itself in eval mode or when `prob == 0`; otherwise a tensor
+            with the prefix tokens followed by `max(1, int(L' * (1 - prob)))`
+            randomly chosen patches, where L' is the non-prefix length.
+        """
+        if not self.training or self.prob == 0.0:
+            return x
+        if self.num_prefix_tokens:
+            prefix_tokens = x[:, : self.num_prefix_tokens]
+            x = x[:, self.num_prefix_tokens :]
+        else:
+            prefix_tokens = None
+        B, L = x.shape[:2]
+        # Clamp to L so an input holding only prefix tokens (L == 0) does not
+        # ask `topk` for more entries than exist.
+        num_keep = min(L, max(1, int(L * (1.0 - self.prob))))
+        rand = torch.randn(B, L, device=x.device)
+        keep_indices = rand.topk(num_keep, dim=1).indices
+        # Append size-1 axes (not -1: `reshape` can infer only one dimension)
+        # so the index broadcasts over every trailing dim of `x`.
+        index_shape = (B, num_keep) + (1,) * (x.dim() - 2)
+        gather_index = keep_indices.reshape(index_shape)
+        gather_index = gather_index.expand((-1, -1) + tuple(x.shape[2:]))
+        x = x.gather(1, gather_index)
+        if prefix_tokens is not None:
+            x = torch.cat((prefix_tokens, x), dim=1)
+        return x
+class PointCloudEncoder(nn.Module):
+    """Encode a point cloud into per-patch embeddings.
+
+    Pipeline: patch embedding -> linear projection to the transformer width
+    -> add an MLP positional embedding of the patch centers -> transformer
+    blocks and norms -> linear projection to `embed_dim`.
+
+    Args:
+        patch_embed: Module called as `patch_embed(coords, features)`; must
+            expose `out_channels` and return a mapping (or a list of
+            mappings) with "embeddings" and "centers" entries.
+        transformer: timm transformer supplying `embed_dim`, `pos_drop`,
+            `blocks`, `norm` and `fc_norm`.
+        embed_dim: Width of the returned embeddings.
+        patch_drop_rate: Must be 0; patch dropout is asserted to be
+            incompatible with the decoder.
+    """
+
+    def __init__(
+        self,
+        patch_embed: PatchEmbed,
+        transformer: Union[VisionTransformer, Eva],
+        embed_dim: int,
+        patch_drop_rate=0.0,
+    ):
+        super().__init__()
+        self.transformer_dim = transformer.embed_dim
+        self.embed_dim = embed_dim
+
+        # Patch embedding, then a projection to the transformer input width.
+        self.patch_embed = patch_embed
+        self.patch_proj = nn.Linear(patch_embed.out_channels, self.transformer_dim)
+
+        # Positional embedding computed from the 3-D patch centers.
+        self.pos_embed = nn.Sequential(
+            nn.Linear(3, 128),
+            nn.GELU(),
+            nn.Linear(128, self.transformer_dim),
+        )
+
+        assert patch_drop_rate == 0, "PatchDropout is not compatible with decoder."
+        # Only reachable with a positive rate when assertions are disabled.
+        self.patch_dropout = (
+            PatchDropout(patch_drop_rate, num_prefix_tokens=0)
+            if patch_drop_rate > 0
+            else nn.Identity()
+        )
+
+        # Transformer encoder and the projection of its output to `embed_dim`.
+        self.transformer = transformer
+        self.out_proj = nn.Linear(self.transformer_dim, self.embed_dim)
+
+    def forward(self, coords, features):
+        """Return `(embeddings, patches)`.
+
+        `patches` is the unmodified return value of `self.patch_embed`; when
+        it is a list, its last element provides the tokens and centers.
+        """
+        patches = self.patch_embed(coords, features)
+        selected = patches[-1] if isinstance(patches, list) else patches
+        raw_tokens = selected["embeddings"]  # [B, L, D]
+        centers = selected["centers"]  # [B, L, 3]
+
+        # Tokens = projected patch embedding + positional embedding.
+        tokens = self.patch_proj(raw_tokens)
+        tokens = tokens + self.pos_embed(centers)
+
+        # Patch dropout, then the transformer's own feature dropout.
+        tokens = self.patch_dropout(tokens)
+        tokens = self.transformer.pos_drop(tokens)
+        for layer in self.transformer.blocks:
+            tokens = layer(tokens)
+
+        # In fact, only norm or fc_norm is not identity in those transformers.
+        tokens = self.transformer.fc_norm(self.transformer.norm(tokens))
+        return self.out_proj(tokens), patches
+class Block(nn.Module):
+    """Residual MLP block computing `x + mlp(norm(x))`.
+
+    The MLP is Linear -> GELU -> LayerNorm -> Linear.
+    """
+
+    def __init__(self, in_channels, hidden_dim, out_channels):
+        super().__init__()
+        # Follow timm.layers.mlp
+        layers = [
+            nn.Linear(in_channels, hidden_dim),
+            nn.GELU(),
+            nn.LayerNorm(hidden_dim),
+            nn.Linear(hidden_dim, out_channels),
+        ]
+        self.mlp = nn.Sequential(*layers)
+        # NOTE(review): sized by `out_channels` but applied to the block
+        # input, and the residual add needs matching widths, so this only
+        # works when in_channels == out_channels.
+        self.norm = nn.LayerNorm(out_channels)
+
+    def forward(self, x):
+        # PreLN. Follow timm.models.vision_transformer
+        normed = self.norm(x)
+        return x + self.mlp(normed)
+class PatchEmbedNN(nn.Module):
+    """Embed patches formed by assigning every point to a group via `NNGrouper`.
+
+    Per-point features are projected and refined by three residual `Block`s,
+    max-pooled into their group, refined by three more `Block`s, normalized
+    and projected to `out_channels`.
+
+    Args:
+        in_channels: Width of the per-point features from the grouper.
+        hidden_dim: Width used inside the blocks; falls back to
+            `out_channels` when falsy (None or 0).
+        out_channels: Width of the returned patch embeddings.
+        num_patches: Passed to `NNGrouper`.
+    """
+
+    def __init__(self, in_channels, hidden_dim, out_channels, num_patches) -> None:
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        hidden_dim = hidden_dim or out_channels
+        self.grouper = NNGrouper(num_patches)
+        self.in_proj = nn.Linear(in_channels, hidden_dim)
+        self.blocks1 = nn.Sequential(
+            *[Block(hidden_dim, hidden_dim, hidden_dim) for _ in range(3)]
+        )
+        self.blocks2 = nn.Sequential(
+            *[Block(hidden_dim, hidden_dim, hidden_dim) for _ in range(3)]
+        )
+        self.norm = nn.LayerNorm(hidden_dim)
+        self.out_proj = nn.Linear(hidden_dim, out_channels)
+
+    def forward(self, coords: torch.Tensor, features: torch.Tensor):
+        """Return the grouper output with an extra "embeddings" entry.
+
+        The mapping returned by the grouper is updated in place and returned.
+        """
+        patches = self.grouper(coords, features)
+        patch_features = patches["features"]  # [B, N, D]
+        nn_idx = patches["nn_idx"]  # [B, N]
+        x = self.in_proj(patch_features)
+        x = self.blocks1(x)  # [B, N, D]
+        # Max-pool point features into their group along dim 1. With
+        # include_self=False the initial zeros do not take part in the max;
+        # groups that receive no point should keep the zeros -- confirm
+        # against the scatter_reduce_ documentation.
+        y = x.new_zeros(x.shape[0], self.grouper.num_groups, x.shape[-1])
+        y.scatter_reduce_(
+            1, nn_idx.unsqueeze(-1).expand_as(x), x, "amax", include_self=False
+        )
+        x = self.blocks2(y)
+        x = self.norm(x)
+        x = self.out_proj(x)
+        patches["embeddings"] = x
+        return patches