Spaces:

TencentARC
/

Pixal3D-D

Runtime error

App Files Files Community

Yang2001 committed 22 days ago

Commit

c272f3c

verified ·

1 Parent(s): ab8266a

Upload Pixal3D-D Space

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

README.md +6 -7
app.py +311 -0
packages.txt +1 -0
pixal3d/__init__.py +44 -0
pixal3d/__pycache__/__init__.cpython-310.pyc +0 -0
pixal3d/models/__init__.py +1 -0
pixal3d/models/__pycache__/__init__.cpython-310.pyc +0 -0
pixal3d/models/autoencoders/__pycache__/base.cpython-310.pyc +0 -0
pixal3d/models/autoencoders/__pycache__/decoder.cpython-310.pyc +0 -0
pixal3d/models/autoencoders/__pycache__/dense_vae.cpython-310.pyc +0 -0
pixal3d/models/autoencoders/__pycache__/distributions.cpython-310.pyc +0 -0
pixal3d/models/autoencoders/__pycache__/encoder.cpython-310.pyc +0 -0
pixal3d/models/autoencoders/__pycache__/ss_vae.cpython-310.pyc +0 -0
pixal3d/models/autoencoders/base.py +118 -0
pixal3d/models/autoencoders/decoder.py +353 -0
pixal3d/models/autoencoders/dense_vae.py +401 -0
pixal3d/models/autoencoders/distributions.py +51 -0
pixal3d/models/autoencoders/encoder.py +133 -0
pixal3d/models/autoencoders/ss_vae.py +129 -0
pixal3d/models/conditional_encoders/__init__.py +2 -0
pixal3d/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc +0 -0
pixal3d/models/conditional_encoders/__pycache__/dinov2_project_grid.cpython-310.pyc +0 -0
pixal3d/models/conditional_encoders/dinov2_project_grid.py +750 -0
pixal3d/models/transformers/__init__.py +2 -0
pixal3d/models/transformers/__pycache__/__init__.cpython-310.pyc +0 -0
pixal3d/models/transformers/__pycache__/dense_dit.cpython-310.pyc +0 -0
pixal3d/models/transformers/__pycache__/sparse_dit.cpython-310.pyc +0 -0
pixal3d/models/transformers/dense_dit.py +298 -0
pixal3d/models/transformers/sparse_dit.py +469 -0
pixal3d/modules/__pycache__/norm.cpython-310.pyc +0 -0
pixal3d/modules/__pycache__/spatial.cpython-310.pyc +0 -0
pixal3d/modules/__pycache__/utils.cpython-310.pyc +0 -0
pixal3d/modules/attention/__init__.py +35 -0
pixal3d/modules/attention/__pycache__/__init__.cpython-310.pyc +0 -0
pixal3d/modules/attention/__pycache__/full_attn.cpython-310.pyc +0 -0
pixal3d/modules/attention/__pycache__/modules.cpython-310.pyc +0 -0
pixal3d/modules/attention/full_attn.py +140 -0
pixal3d/modules/attention/modules.py +164 -0
pixal3d/modules/norm.py +25 -0
pixal3d/modules/sparse/__init__.py +105 -0
pixal3d/modules/sparse/__pycache__/__init__.cpython-310.pyc +0 -0
pixal3d/modules/sparse/__pycache__/basic.cpython-310.pyc +0 -0
pixal3d/modules/sparse/__pycache__/linear.cpython-310.pyc +0 -0
pixal3d/modules/sparse/__pycache__/nonlinearity.cpython-310.pyc +0 -0
pixal3d/modules/sparse/__pycache__/norm.cpython-310.pyc +0 -0
pixal3d/modules/sparse/__pycache__/spatial.cpython-310.pyc +0 -0
pixal3d/modules/sparse/attention/__init__.py +5 -0
pixal3d/modules/sparse/attention/__pycache__/__init__.cpython-310.pyc +0 -0
pixal3d/modules/sparse/attention/__pycache__/full_attn.cpython-310.pyc +0 -0
pixal3d/modules/sparse/attention/__pycache__/modules.cpython-310.pyc +0 -0

README.md CHANGED Viewed

@@ -1,13 +1,12 @@
 ---
-title: Pixal3D D
-emoji: 👁
-colorFrom: indigo
-colorTo: blue
 sdk: gradio
-sdk_version: 6.14.0
 app_file: app.py
 pinned: false
 license: apache-2.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Pixal3D-D
+emoji: 🎨
+colorFrom: blue
+colorTo: purple
 sdk: gradio
+sdk_version: 5.29.0
 app_file: app.py
 pinned: false
 license: apache-2.0
+extra_gated_eu_disallowed: true
 ---

app.py ADDED Viewed

	@@ -0,0 +1,311 @@

+"""
+Pixal3D Gradio App
+Upload an image and generate a 3D mesh. Supports both automatic (MoGe) and fixed camera parameters.
+"""
+import os
+os.environ["no_proxy"] = os.environ.get("no_proxy", "") + ",localhost,127.0.0.1"
+import torch
+import tempfile
+import numpy as np
+from PIL import Image
+from torchvision import transforms
+import gradio as gr
+from pixal3dpipeline2stage import Pixal3DPipeline2Stage
+from pixal3dpipeline import Pixal3DPipeline
+import trimesh
+from trimesh.visual.material import PBRMaterial
+from trimesh.transformations import rotation_matrix
+# Static files directory for model viewer
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+SAVE_DIR = os.path.join(CURRENT_DIR, "gradio_outputs")
+# Global pipeline reference
+pipeline = None
+rmbg = None
+def load_pipeline(ckpt_dir="./ckpt", repo_id="Pixal3D/Pixal3D"):
+    """Load all weights at startup."""
+    global pipeline, rmbg
+    print("Loading Pixal3D 2-Stage pipeline (with MoGe + dense_check)...")
+    pipeline = Pixal3DPipeline2Stage.from_pretrained(
+        ckpt_dir=ckpt_dir,
+        repo_id=repo_id,
+        use_moge=True,
+        use_dense_check=True,
+    )
+    print("Pipeline loaded!")
+    print("Loading BiRefNet for background removal...")
+    from transformers import AutoModelForImageSegmentation
+    birefnet_model = AutoModelForImageSegmentation.from_pretrained(
+        'ZhengPeng7/BiRefNet',
+        trust_remote_code=True,
+    ).to("cuda:0")
+    birefnet_model.eval()
+    rmbg = birefnet_model
+    print("BiRefNet loaded!")
+def remove_background(image_np):
+    """Use BiRefNet to remove background and add alpha channel.
+    Input: numpy array (H, W, 3) RGB
+    Output: numpy array (H, W, 4) RGBA
+    """
+    pil_img = Image.fromarray(image_np[:, :, :3]).convert('RGB')
+    image_size = (1024, 1024)
+    transform_image = transforms.Compose([
+        transforms.Resize(image_size),
+        transforms.ToTensor(),
+        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+    input_tensor = transform_image(pil_img).unsqueeze(0).to("cuda:0")
+    with torch.no_grad():
+        preds = rmbg(input_tensor)[-1].sigmoid().cpu()
+    pred = preds[0].squeeze()
+    pred_pil = transforms.ToPILImage()(pred)
+    mask = pred_pil.resize(pil_img.size)
+    mask = np.array(mask)
+    rgba = np.concatenate([np.array(pil_img), mask[..., None]], axis=-1)
+    return rgba
+def preprocess_image(image, use_rmbg):
+    """Step 1: process image (background removal or use original), return immediately.
+    use_rmbg=True: run BiRefNet to remove background and generate RGBA
+    use_rmbg=False: directly use the original image (RGB or RGBA), skip background removal
+    """
+    if image is None:
+        return None
+    if use_rmbg:
+        # Run background removal
+        if rmbg is None:
+            gr.Warning("Background removal model not loaded.")
+            return None
+        processed = remove_background(image)
+    else:
+        # Directly use original image, no background removal
+        processed = image
+    os.makedirs("./gradio_outputs", exist_ok=True)
+    Image.fromarray(processed).save("./gradio_outputs/processed.png")
+    return processed
+def infer_mesh(
+    processed,
+    use_fixed_camera,
+    camera_angle_x,
+    mesh_scale,
+    dense_steps,
+    dense_guidance_scale,
+    dense_seed,
+    sparse_512_steps,
+    sparse_512_guidance_scale,
+    sparse_1024_steps,
+    sparse_1024_guidance_scale,
+    sparse_seed,
+    dense_threshold,
+    mc_threshold,
+):
+    """Step 2: run 3D inference on the already-processed image."""
+    if processed is None or pipeline is None:
+        return None, None
+    tmp_input = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
+    Image.fromarray(processed).save(tmp_input.name)
+    input_path = tmp_input.name
+    try:
+        if use_fixed_camera:
+            mesh = Pixal3DPipeline.infer(
+                pipeline,
+                image=input_path,
+                camera_angle_x=camera_angle_x,
+                mesh_scale=mesh_scale,
+                dense_steps=int(dense_steps),
+                dense_guidance_scale=dense_guidance_scale,
+                dense_seed=int(dense_seed),
+                sparse_512_steps=int(sparse_512_steps),
+                sparse_512_guidance_scale=sparse_512_guidance_scale,
+                sparse_1024_steps=int(sparse_1024_steps),
+                sparse_1024_guidance_scale=sparse_1024_guidance_scale,
+                sparse_seed=int(sparse_seed),
+                dense_threshold=dense_threshold,
+                mc_threshold=mc_threshold,
+            )
+        else:
+            mesh = pipeline.infer(
+                image=input_path,
+                mesh_scale=mesh_scale,
+                optimize_mesh_scale=True,
+                target_padding=3,
+                max_optim_iterations=2,
+                dense_steps=int(dense_steps),
+                dense_guidance_scale=dense_guidance_scale,
+                dense_seed=int(dense_seed),
+                sparse_512_steps=int(sparse_512_steps),
+                sparse_512_guidance_scale=sparse_512_guidance_scale,
+                sparse_1024_steps=int(sparse_1024_steps),
+                sparse_1024_guidance_scale=sparse_1024_guidance_scale,
+                sparse_seed=int(sparse_seed),
+                dense_threshold=dense_threshold,
+                mc_threshold=mc_threshold,
+            )
+        ply_file = tempfile.NamedTemporaryFile(suffix=".ply", delete=False)
+        glb_file = tempfile.NamedTemporaryFile(suffix=".glb", delete=False)
+        ply_path = ply_file.name
+        glb_path = glb_file.name
+        ply_file.close()
+        glb_file.close()
+        mesh.export(ply_path)
+        # Export GLB with PBR material (same as hunyuan_app)
+        material = PBRMaterial(baseColorFactor=[102, 102, 102, 255])
+        clean_mesh = trimesh.Trimesh(mesh.vertices, mesh.faces)
+        clean_mesh.visual = trimesh.visual.TextureVisuals(material=material)
+        # Rotate mesh to desired view angle (only X rotation needed)
+        rot_x = rotation_matrix(np.radians(-90), [1, 0, 0])
+        clean_mesh.apply_transform(rot_x)
+        clean_mesh.export(glb_path)
+        return glb_path, ply_path
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        return None, None
+    finally:
+        os.unlink(input_path)
+def build_ui():
+    """Build the Gradio Blocks interface and return it with queuing enabled.
+
+    Layout: a left column with the input image, the processed-image
+    preview, mode checkboxes, camera / mesh-scale numbers, an accordion of
+    advanced sampling parameters and the run button; a right column with
+    the 3D viewer and a .ply download. Clicking the button runs
+    ``preprocess_image`` and then ``infer_mesh``.
+
+    Returns:
+        The ``gr.Blocks`` instance, after ``queue(api_open=False)``.
+    """
+    # Custom CSS to hide the download button in Model3D
+    custom_css = """
+    #model3d-viewer button[aria-label="下载"],
+    #model3d-viewer button[aria-label="Download"],
+    #model3d-viewer button[title="下载"],
+    #model3d-viewer button[title="Download"] {
+        display: none !important;
+    }
+    """
+    with gr.Blocks(title="Pixal3D", theme=gr.themes.Soft(), css=custom_css) as demo:
+        gr.Markdown("# Pixal3D: Pixel-Aligned 3D Generation from Images")
+        with gr.Row():
+            # Left column: input (scale=1)
+            with gr.Column(scale=1):
+                image_input = gr.Image(label="Input Image", type="numpy", image_mode=None)
+                processed_image = gr.Image(
+                    label="Processed Image",
+                    image_mode="RGBA",
+                    type="numpy",
+                    interactive=False,
+                )
+                use_rmbg = gr.Checkbox(
+                    label="Remove Background",
+                    value=True,
+                    info="Checked: auto remove background via BiRefNet. Unchecked: use original image directly.",
+                )
+                use_fixed_camera = gr.Checkbox(
+                    label="Use Fixed Camera Parameters",
+                    value=False,
+                    info="If checked, use manually set FOV/distance/mesh_scale instead of MoGe auto-estimation.",
+                )
+                # Hidden until the fixed-camera checkbox is ticked (see on_toggle_fixed).
+                with gr.Group(visible=False) as fixed_camera_group:
+                    gr.Markdown("### Camera Parameters (fixed mode)")
+                    camera_angle_x = gr.Number(value=0.2, label="camera_angle_x (rad)", step=0.01)
+                with gr.Group():
+                    gr.Markdown("### Mesh Scale")
+                    mesh_scale = gr.Number(value=0.5, label="mesh_scale", step=0.01,
+                                           info="Initial mesh scale. Fixed mode default: 0.9, Auto mode default: 0.5")
+                with gr.Accordion("Advanced Inference Parameters", open=False):
+                    dense_steps = gr.Number(value=50, label="Dense Steps", step=1, precision=0)
+                    dense_guidance_scale = gr.Number(value=7.0, label="Dense Guidance Scale", step=0.1)
+                    dense_seed = gr.Number(value=0, label="Dense Seed", step=1, precision=0)
+                    sparse_512_steps = gr.Number(value=30, label="Sparse 512 Steps", step=1, precision=0)
+                    sparse_512_guidance_scale = gr.Number(value=7.0, label="Sparse 512 Guidance Scale", step=0.1)
+                    sparse_1024_steps = gr.Number(value=15, label="Sparse 1024 Steps", step=1, precision=0)
+                    sparse_1024_guidance_scale = gr.Number(value=7.0, label="Sparse 1024 Guidance Scale", step=0.1)
+                    sparse_seed = gr.Number(value=0, label="Sparse Seed", step=1, precision=0)
+                    dense_threshold = gr.Number(value=0.1, label="Dense Threshold", step=0.01)
+                    mc_threshold = gr.Number(value=0.2, label="MC Threshold", step=0.01)
+                run_btn = gr.Button("Generate 3D Mesh", variant="primary", size="lg")
+            # Right column: output (scale=2)
+            with gr.Column(scale=2):
+                model_viewer = gr.Model3D(label="3D Mesh Preview", interactive=False, clear_color=[1.0, 1.0, 1.0, 1.0], elem_id="model3d-viewer")
+                output_file = gr.File(label="Download .ply")
+        # Toggle fixed camera group visibility and mesh_scale default
+        def on_toggle_fixed(use_fixed):
+            # 0.9 / 0.5 match the defaults advertised in the mesh_scale info text.
+            new_scale = 0.9 if use_fixed else 0.5
+            return gr.update(visible=use_fixed), gr.update(value=new_scale)
+        use_fixed_camera.change(
+            fn=on_toggle_fixed,
+            inputs=[use_fixed_camera],
+            outputs=[fixed_camera_group, mesh_scale],
+        )
+        # Step 1: preprocess image → show processed image immediately
+        # Step 2: run 3D inference → show mesh and download
+        # The input order below must match infer_mesh's positional parameters.
+        run_btn.click(
+            fn=preprocess_image,
+            inputs=[image_input, use_rmbg],
+            outputs=[processed_image],
+        ).then(
+            fn=infer_mesh,
+            inputs=[
+                processed_image,
+                use_fixed_camera,
+                camera_angle_x,
+                mesh_scale,
+                dense_steps,
+                dense_guidance_scale,
+                dense_seed,
+                sparse_512_steps,
+                sparse_512_guidance_scale,
+                sparse_1024_steps,
+                sparse_1024_guidance_scale,
+                sparse_seed,
+                dense_threshold,
+                mc_threshold,
+            ],
+            outputs=[model_viewer, output_file],
+        )
+    demo.queue(api_open=False)
+    return demo
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--repo_id", type=str, default="TencentARC/Pixal3D-D")
+    args = parser.parse_args()
+    load_pipeline(repo_id=args.repo_id)
+    demo = build_ui()
+    demo.launch(
+        server_name="127.0.0.1",
+        share=True,
+    )

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ libsparsehash-dev

pixal3d/__init__.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import importlib
+__modules__ = {}
+def register(name):
+    def decorator(cls):
+        # Allow re-registration for checkpoint loading compatibility
+        # When torch.load triggers module re-import, the same class may be registered again
+        __modules__[name] = cls
+        return cls
+    return decorator
+def find(name):
+    if name in __modules__:
+        return __modules__[name]
+    else:
+        try:
+            module_string = ".".join(name.split(".")[:-1])
+            cls_name = name.split(".")[-1]
+            module = importlib.import_module(module_string, package=None)
+            return getattr(module, cls_name)
+        except Exception as e:
+            raise ValueError(f"Module {name} not found!")
+###  syntactic sugar for logging utilities  ###
+import logging
+logger = logging.getLogger("pixal3d")
+def debug(*args, **kwargs):
+    """Forward all arguments to ``logger.debug`` on the "pixal3d" logger."""
+    logger.debug(*args, **kwargs)
+def info(*args, **kwargs):
+    """Forward all arguments to ``logger.info`` on the "pixal3d" logger."""
+    logger.info(*args, **kwargs)
+def warn(*args, **kwargs):
+    """Forward all arguments to ``logger.warning`` on the "pixal3d" logger."""
+    logger.warning(*args, **kwargs)

pixal3d/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (1.25 kB). View file

pixal3d/models/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from . import conditional_encoders, transformers

pixal3d/models/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (241 Bytes). View file

pixal3d/models/autoencoders/__pycache__/base.cpython-310.pyc ADDED Viewed

Binary file (4.39 kB). View file

pixal3d/models/autoencoders/__pycache__/decoder.cpython-310.pyc ADDED Viewed

Binary file (8.77 kB). View file

pixal3d/models/autoencoders/__pycache__/dense_vae.cpython-310.pyc ADDED Viewed

Binary file (11.7 kB). View file

pixal3d/models/autoencoders/__pycache__/distributions.cpython-310.pyc ADDED Viewed

Binary file (2.09 kB). View file

pixal3d/models/autoencoders/__pycache__/encoder.cpython-310.pyc ADDED Viewed

Binary file (3.77 kB). View file

pixal3d/models/autoencoders/__pycache__/ss_vae.cpython-310.pyc ADDED Viewed

Binary file (3.86 kB). View file

pixal3d/models/autoencoders/base.py ADDED Viewed

	@@ -0,0 +1,118 @@

+from typing import *
+import torch
+import torch.nn as nn
+from ...modules.utils import convert_module_to_f16, convert_module_to_f32
+from ...modules import sparse as sp
+from ...modules.transformer import AbsolutePositionEmbedder
+from ...modules.sparse.transformer import SparseTransformerBlock
+def block_attn_config(self):
+    """
+    Return the attention configuration of the model.
+    """
+    for i in range(self.num_blocks):
+        if self.attn_mode == "shift_window":
+            yield "serialized", self.window_size, 0, (16 * (i % 2),) * 3, sp.SerializeMode.Z_ORDER
+        elif self.attn_mode == "shift_sequence":
+            yield "serialized", self.window_size, self.window_size // 2 * (i % 2), (0, 0, 0), sp.SerializeMode.Z_ORDER
+        elif self.attn_mode == "shift_order":
+            yield "serialized", self.window_size, 0, (0, 0, 0), sp.SerializeModes[i % 4]
+        elif self.attn_mode == "full":
+            yield "full", None, None, None, None
+        elif self.attn_mode == "swin":
+            yield "windowed", self.window_size, None, self.window_size // 2 * (i % 2), None
+class SparseTransformerBase(nn.Module):
+    """
+    Sparse Transformer without output layers.
+    Serve as the base class for encoder and decoder.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        model_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4.0,
+        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
+        window_size: Optional[int] = None,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        qk_rms_norm: bool = False,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.num_blocks = num_blocks
+        self.window_size = window_size
+        self.num_heads = num_heads or model_channels // num_head_channels
+        self.mlp_ratio = mlp_ratio
+        self.attn_mode = attn_mode
+        self.pe_mode = pe_mode
+        self.use_fp16 = use_fp16
+        self.use_checkpoint = use_checkpoint
+        self.qk_rms_norm = qk_rms_norm
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        if pe_mode == "ape":
+            self.pos_embedder = AbsolutePositionEmbedder(model_channels)
+        self.input_layer = sp.SparseLinear(in_channels, model_channels)
+        self.blocks = nn.ModuleList([
+            SparseTransformerBlock(
+                model_channels,
+                num_heads=self.num_heads,
+                mlp_ratio=self.mlp_ratio,
+                attn_mode=attn_mode,
+                window_size=window_size,
+                shift_sequence=shift_sequence,
+                shift_window=shift_window,
+                serialize_mode=serialize_mode,
+                use_checkpoint=self.use_checkpoint,
+                use_rope=(pe_mode == "rope"),
+                qk_rms_norm=self.qk_rms_norm,
+            )
+            for attn_mode, window_size, shift_sequence, shift_window, serialize_mode in block_attn_config(self)
+        ])
+    @property
+    def device(self) -> torch.device:
+        """
+        Return the device of the model.
+        """
+        return next(self.parameters()).device
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the torso of the model to float16.
+        """
+        # self.blocks.apply(convert_module_to_f16)
+        self.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Convert the torso of the model to float32.
+        """
+        self.blocks.apply(convert_module_to_f32)
+    def initialize_weights(self) -> None:
+        # Initialize transformer layers:
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+    def forward(self, x: sp.SparseTensor, factor: float = None) -> sp.SparseTensor:
+        h = self.input_layer(x)
+        if self.pe_mode == "ape":
+            h = h + self.pos_embedder(x.coords[:, 1:], factor)
+        h = h.type(self.dtype)
+        for block in self.blocks:
+            h = block(h)
+        return h

pixal3d/models/autoencoders/decoder.py ADDED Viewed

	@@ -0,0 +1,353 @@

+from typing import *
+import random
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from ...modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
+from ...modules import sparse as sp
+from .base import SparseTransformerBase
+class SparseSubdivideBlock3d(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        out_channels: Optional[int] = None,
+        use_checkpoint: bool = False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_checkpoint = use_checkpoint
+        self.act_layers = nn.Sequential(
+            sp.SparseConv3d(channels, self.out_channels, 3, padding=1),
+            sp.SparseSiLU()
+        )
+        self.sub = sp.SparseSubdivide()
+        self.out_layers = nn.Sequential(
+            sp.SparseConv3d(self.out_channels, self.out_channels, 3, padding=1),
+            sp.SparseSiLU(),
+        )
+    def _forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
+        h = self.act_layers(x)
+        h = self.sub(h)
+        h = self.out_layers(h)
+        return h
+    def forward(self, x: torch.Tensor):
+        if self.use_checkpoint:
+            return torch.utils.checkpoint.checkpoint(self._forward, x, use_reentrant=False)
+        else:
+            return self._forward(x)
+class SparseSDFDecoder(SparseTransformerBase):
+    def __init__(
+        self,
+        resolution: int,
+        model_channels: int,
+        latent_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4,
+        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "swin",
+        window_size: int = 8,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        qk_rms_norm: bool = False,
+        representation_config: dict = None,
+        out_channels: int = 1,
+        chunk_size: int = 1,
+    ):
+        """
+        Build the transformer torso plus three subdivide stages and the
+        output head.
+
+        Args:
+            resolution: Grid resolution used by the chunk-splitting helpers.
+            model_channels: Transformer width; the subdivide stages reduce
+                it to 1/4, 1/8 and 1/16.
+            latent_channels: Passed to the base class as ``in_channels``.
+            num_blocks, num_heads, num_head_channels, mlp_ratio, attn_mode,
+            window_size, pe_mode, use_fp16, use_checkpoint, qk_rms_norm:
+                Forwarded to ``SparseTransformerBase``.
+            representation_config: Stored as ``self.rep_config``; not used
+                in this constructor.
+            out_channels: Width of the final linear output.
+            chunk_size: Stored on the instance; ``forward_single_chunk``
+                passes it to ``split_single_chunk``.
+        """
+        super().__init__(
+            in_channels=latent_channels,
+            model_channels=model_channels,
+            num_blocks=num_blocks,
+            num_heads=num_heads,
+            num_head_channels=num_head_channels,
+            mlp_ratio=mlp_ratio,
+            attn_mode=attn_mode,
+            window_size=window_size,
+            pe_mode=pe_mode,
+            use_fp16=use_fp16,
+            use_checkpoint=use_checkpoint,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.resolution = resolution
+        self.rep_config = representation_config
+        self.out_channels = out_channels
+        self.chunk_size = chunk_size
+        # Three subdivide stages; the chunk helpers assume the resulting
+        # total upsample ratio is 8 (hard-coded there).
+        self.upsample = nn.ModuleList([
+            SparseSubdivideBlock3d(
+                channels=model_channels,
+                out_channels=model_channels // 4,
+                use_checkpoint=use_checkpoint,
+            ),
+            SparseSubdivideBlock3d(
+                channels=model_channels // 4,
+                out_channels=model_channels // 8,
+                use_checkpoint=use_checkpoint,
+            ),
+            SparseSubdivideBlock3d(
+                channels=model_channels // 8,
+                out_channels=model_channels // 16,
+                use_checkpoint=use_checkpoint,
+            )
+        ])
+        self.out_layer = sp.SparseLinear(model_channels // 16, self.out_channels)
+        self.out_active = sp.SparseTanh()
+        # Weights are initialized in float32 first, then optionally cast.
+        self.initialize_weights()
+        if use_fp16:
+            self.convert_to_fp16()
+    def initialize_weights(self) -> None:
+        super().initialize_weights()
+        # Zero-out output layers:
+        nn.init.constant_(self.out_layer.weight, 0)
+        nn.init.constant_(self.out_layer.bias, 0)
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the model to float16.
+
+        Calls the base-class conversion and then applies
+        ``convert_module_to_f16`` to the ``upsample`` stages.
+        """
+        super().convert_to_fp16()
+        self.upsample.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Convert the model to float32.
+
+        Calls the base-class conversion and then applies
+        ``convert_module_to_f32`` to the ``upsample`` stages.
+        """
+        super().convert_to_fp32()
+        self.upsample.apply(convert_module_to_f32)
+    @torch.no_grad()
+    def split_for_meshing(self, x: sp.SparseTensor, chunk_size=4, padding=4):
+        sub_resolution = self.resolution // chunk_size
+        upsample_ratio = 8 # hard-coded here
+        assert sub_resolution % padding == 0
+        out = []
+        for i in range(chunk_size):
+            for j in range(chunk_size):
+                for k in range(chunk_size):
+                    # Calculate padded boundaries
+                    start_x = max(0, i * sub_resolution - padding)
+                    end_x = min((i + 1) * sub_resolution + padding, self.resolution)
+                    start_y = max(0, j * sub_resolution - padding)
+                    end_y = min((j + 1) * sub_resolution + padding, self.resolution)
+                    start_z = max(0, k * sub_resolution - padding)
+                    end_z = min((k + 1) * sub_resolution + padding, self.resolution)
+                    # Store original (unpadded) boundaries for later cropping
+                    orig_start_x = i * sub_resolution
+                    orig_end_x = (i + 1) * sub_resolution
+                    orig_start_y = j * sub_resolution
+                    orig_end_y = (j + 1) * sub_resolution
+                    orig_start_z = k * sub_resolution
+                    orig_end_z = (k + 1) * sub_resolution
+                    mask = torch.logical_and(
+                        torch.logical_and(
+                            torch.logical_and(x.coords[:, 1] >= start_x, x.coords[:, 1] < end_x),
+                            torch.logical_and(x.coords[:, 2] >= start_y, x.coords[:, 2] < end_y)
+                        ),
+                        torch.logical_and(x.coords[:, 3] >= start_z, x.coords[:, 3] < end_z)
+                    )
+                    if mask.sum() > 0:
+                        # Get the coordinates and shift them to local space
+                        coords = x.coords[mask].clone()
+                        # Shift to local coordinates
+                        coords[:, 1:] = coords[:, 1:] - torch.tensor([start_x, start_y, start_z],
+                                                                    device=coords.device).view(1, 3)
+                        chunk_tensor = sp.SparseTensor(x.feats[mask], coords)
+                        # Store the boundaries and offsets as metadata for later reconstruction
+                        chunk_tensor.bounds = {
+                            'original': (orig_start_x * upsample_ratio, orig_end_x * upsample_ratio + (upsample_ratio - 1), orig_start_y * upsample_ratio, orig_end_y * upsample_ratio + (upsample_ratio - 1), orig_start_z * upsample_ratio, orig_end_z * upsample_ratio + (upsample_ratio - 1)),
+                            'offsets': (start_x * upsample_ratio, start_y * upsample_ratio, start_z * upsample_ratio)  # Store offsets for reconstruction
+                        }
+                        out.append(chunk_tensor)
+                    del mask
+                    torch.cuda.empty_cache()
+        return out
+    @torch.no_grad()
+    def split_single_chunk(self, x: sp.SparseTensor, chunk_size=4, padding=4):
+        sub_resolution = self.resolution // chunk_size
+        upsample_ratio = 8 # hard-coded here
+        assert sub_resolution % padding == 0
+        mask_sum = -1
+        while mask_sum < 1:
+            orig_start_x = random.randint(0, self.resolution - sub_resolution)
+            orig_end_x = orig_start_x + sub_resolution
+            orig_start_y = random.randint(0, self.resolution - sub_resolution)
+            orig_end_y = orig_start_y + sub_resolution
+            orig_start_z = random.randint(0, self.resolution - sub_resolution)
+            orig_end_z = orig_start_z + sub_resolution
+            start_x = max(0, orig_start_x - padding)
+            end_x = min(orig_end_x + padding, self.resolution)
+            start_y = max(0, orig_start_y - padding)
+            end_y = min(orig_end_y + padding, self.resolution)
+            start_z = max(0, orig_start_z - padding)
+            end_z = min(orig_end_z + padding, self.resolution)
+            mask_ori = torch.logical_and(
+                torch.logical_and(
+                    torch.logical_and(x.coords[:, 1] >= orig_start_x, x.coords[:, 1] < orig_end_x),
+                    torch.logical_and(x.coords[:, 2] >= orig_start_y, x.coords[:, 2] < orig_end_y)
+                ),
+                torch.logical_and(x.coords[:, 3] >= orig_start_z, x.coords[:, 3] < orig_end_z)
+            )
+            mask_sum = mask_ori.sum()
+        # Store the boundaries and offsets as metadata for later reconstruction
+        bounds = {
+            'original': (orig_start_x * upsample_ratio, orig_end_x * upsample_ratio + (upsample_ratio - 1), orig_start_y * upsample_ratio, orig_end_y * upsample_ratio + (upsample_ratio - 1), orig_start_z * upsample_ratio, orig_end_z * upsample_ratio + (upsample_ratio - 1)),
+            'start': (start_x, end_x, start_y, end_y, start_z, end_z),
+            'offsets': (start_x * upsample_ratio, start_y * upsample_ratio, start_z * upsample_ratio)  # Store offsets for reconstruction
+        }
+        return bounds
+    def forward_single_chunk(self, x: sp.SparseTensor, padding=4):
+        bounds = self.split_single_chunk(x, self.chunk_size, padding=padding)
+        start_x, end_x, start_y, end_y, start_z, end_z = bounds['start']
+        mask = torch.logical_and(
+            torch.logical_and(
+                torch.logical_and(x.coords[:, 1] >= start_x, x.coords[:, 1] < end_x),
+                torch.logical_and(x.coords[:, 2] >= start_y, x.coords[:, 2] < end_y)
+            ),
+            torch.logical_and(x.coords[:, 3] >= start_z, x.coords[:, 3] < end_z)
+        )
+        # Shift to local coordinates
+        coords = x.coords.clone()
+        coords[:, 1:] = coords[:, 1:] - torch.tensor([start_x, start_y, start_z],
+                                                    device=coords.device).view(1, 3)
+        chunk = sp.SparseTensor(x.feats[mask], coords[mask])
+        chunk_result = self.upsamples(chunk)
+        coords = chunk_result.coords.clone()
+        # Restore global coordinates
+        offsets = torch.tensor(bounds['offsets'],
+                                device=coords.device).view(1, 3)
+        coords[:, 1:] = coords[:, 1:] + offsets
+        # Filter points within original bounds
+        original = bounds['original']
+        within_bounds = torch.logical_and(
+            torch.logical_and(
+                torch.logical_and(
+                    coords[:, 1] >= original[0],
+                    coords[:, 1] < original[1]
+                ),
+                torch.logical_and(
+                    coords[:, 2] >= original[2],
+                    coords[:, 2] < original[3]
+                )
+            ),
+            torch.logical_and(
+                coords[:, 3] >= original[4],
+                coords[:, 3] < original[5]
+            )
+        )
+        final_coords = coords[within_bounds]
+        final_feats = chunk_result.feats[within_bounds]
+        return sp.SparseTensor(final_feats, final_coords)
+    def upsamples(self, x, return_feat: bool = False):
+        dtype = x.dtype
+        for block in self.upsample:
+            x = block(x)
+        x = x.type(dtype)
+        output = self.out_active(self.out_layer(x))
+        if return_feat:
+            return output, x
+        else:
+            return output
+    def forward(self, x: sp.SparseTensor, factor: float = None, return_feat: bool = False):
+        h = super().forward(x, factor)
+        if self.chunk_size <= 1:
+            for block in self.upsample:
+                h = block(h)
+            h = h.type(x.dtype)
+            if return_feat:
+                return self.out_active(self.out_layer(h)), h
+            h = self.out_layer(h)
+            h = self.out_active(h)
+            return h
+        else:
+            if self.training:
+                return self.forward_single_chunk(h)
+            else:
+                batch_size = x.shape[0]
+                chunks = self.split_for_meshing(h, chunk_size=self.chunk_size)
+                all_coords, all_feats = [], []
+                for chunk_idx, chunk in enumerate(chunks):
+                    chunk_result = self.upsamples(chunk)
+                    for b in range(batch_size):
+                        mask = torch.nonzero(chunk_result.coords[:, 0] == b).squeeze(-1)
+                        if mask.numel() > 0:
+                            coords = chunk_result.coords[mask].clone()
+                            # Restore global coordinates
+                            offsets = torch.tensor(chunk.bounds['offsets'],
+                                                    device=coords.device).view(1, 3)
+                            coords[:, 1:] = coords[:, 1:] + offsets
+                            # Filter points within original bounds
+                            bounds = chunk.bounds['original']
+                            within_bounds = torch.logical_and(
+                                torch.logical_and(
+                                    torch.logical_and(
+                                        coords[:, 1] >= bounds[0],
+                                        coords[:, 1] < bounds[1]
+                                    ),
+                                    torch.logical_and(
+                                        coords[:, 2] >= bounds[2],
+                                        coords[:, 2] < bounds[3]
+                                    )
+                                ),
+                                torch.logical_and(
+                                    coords[:, 3] >= bounds[4],
+                                    coords[:, 3] < bounds[5]
+                                )
+                            )
+                            if within_bounds.any():
+                                all_coords.append(coords[within_bounds])
+                                all_feats.append(chunk_result.feats[mask][within_bounds])
+                    if not self.training:
+                        torch.cuda.empty_cache()
+                final_coords = torch.cat(all_coords)
+                final_feats = torch.cat(all_feats)
+                return sp.SparseTensor(final_feats, final_coords)

pixal3d/models/autoencoders/dense_vae.py ADDED Viewed

	@@ -0,0 +1,401 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import trimesh
+from skimage import measure
+from ...modules.norm import GroupNorm32, ChannelLayerNorm32
+from ...modules.spatial import pixel_shuffle_3d
+from ...modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
+from .distributions import DiagonalGaussianDistribution
+def norm_layer(norm_type: str, *args, **kwargs) -> nn.Module:
+    """
+    Return a normalization layer.
+    """
+    if norm_type == "group":
+        return GroupNorm32(32, *args, **kwargs)
+    elif norm_type == "layer":
+        return ChannelLayerNorm32(*args, **kwargs)
+    else:
+        raise ValueError(f"Invalid norm type {norm_type}")
+class ResBlock3d(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        out_channels: Optional[int] = None,
+        norm_type: Literal["group", "layer"] = "layer",
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.norm1 = norm_layer(norm_type, channels)
+        self.norm2 = norm_layer(norm_type, self.out_channels)
+        self.conv1 = nn.Conv3d(channels, self.out_channels, 3, padding=1)
+        self.conv2 = zero_module(nn.Conv3d(self.out_channels, self.out_channels, 3, padding=1))
+        self.skip_connection = nn.Conv3d(channels, self.out_channels, 1) if channels != self.out_channels else nn.Identity()
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        h = self.norm1(x)
+        h = F.silu(h)
+        h = self.conv1(h)
+        h = self.norm2(h)
+        h = F.silu(h)
+        h = self.conv2(h)
+        h = h + self.skip_connection(x)
+        return h
+class DownsampleBlock3d(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        mode: Literal["conv", "avgpool"] = "conv",
+    ):
+        assert mode in ["conv", "avgpool"], f"Invalid mode {mode}"
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        if mode == "conv":
+            self.conv = nn.Conv3d(in_channels, out_channels, 2, stride=2)
+        elif mode == "avgpool":
+            assert in_channels == out_channels, "Pooling mode requires in_channels to be equal to out_channels"
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if hasattr(self, "conv"):
+            return self.conv(x)
+        else:
+            return F.avg_pool3d(x, 2)
+class UpsampleBlock3d(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        mode: Literal["conv", "nearest"] = "conv",
+    ):
+        assert mode in ["conv", "nearest"], f"Invalid mode {mode}"
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        if mode == "conv":
+            self.conv = nn.Conv3d(in_channels, out_channels*8, 3, padding=1)
+        elif mode == "nearest":
+            assert in_channels == out_channels, "Nearest mode requires in_channels to be equal to out_channels"
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if hasattr(self, "conv"):
+            x = self.conv(x)
+            return pixel_shuffle_3d(x, 2)
+        else:
+            return F.interpolate(x, scale_factor=2, mode="nearest")
+class SparseStructureEncoder(nn.Module):
+    """
+    Encoder for Sparse Structure (\mathcal{E}_S in the paper Sec. 3.3).
+    Args:
+        in_channels (int): Channels of the input.
+        latent_channels (int): Channels of the latent representation.
+        num_res_blocks (int): Number of residual blocks at each resolution.
+        channels (List[int]): Channels of the encoder blocks.
+        num_res_blocks_middle (int): Number of residual blocks in the middle.
+        norm_type (Literal["group", "layer"]): Type of normalization layer.
+        use_fp16 (bool): Whether to use FP16.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        latent_channels: int,
+        num_res_blocks: int,
+        channels: List[int],
+        num_res_blocks_middle: int = 2,
+        norm_type: Literal["group", "layer"] = "layer",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.latent_channels = latent_channels
+        self.num_res_blocks = num_res_blocks
+        self.channels = channels
+        self.num_res_blocks_middle = num_res_blocks_middle
+        self.norm_type = norm_type
+        self.use_fp16 = use_fp16
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        self.use_checkpoint = use_checkpoint
+        self.input_layer = nn.Conv3d(in_channels, channels[0], 3, padding=1)
+        self.blocks = nn.ModuleList([])
+        for i, ch in enumerate(channels):
+            self.blocks.extend([
+                ResBlock3d(ch, ch)
+                for _ in range(num_res_blocks)
+            ])
+            if i < len(channels) - 1:
+                self.blocks.append(
+                    DownsampleBlock3d(ch, channels[i+1])
+                )
+        self.middle_block = nn.Sequential(*[
+            ResBlock3d(channels[-1], channels[-1])
+            for _ in range(num_res_blocks_middle)
+        ])
+        self.out_layer = nn.Sequential(
+            norm_layer(norm_type, channels[-1]),
+            nn.SiLU(),
+            nn.Conv3d(channels[-1], latent_channels*2, 3, padding=1)
+        )
+        if use_fp16:
+            self.convert_to_fp16()
+    @property
+    def device(self) -> torch.device:
+        """
+        Return the device of the model.
+        """
+        return next(self.parameters()).device
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the torso of the model to float16.
+        """
+        self.use_fp16 = True
+        self.dtype = torch.float16
+        self.blocks.apply(convert_module_to_f16)
+        self.middle_block.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Convert the torso of the model to float32.
+        """
+        self.use_fp16 = False
+        self.dtype = torch.float32
+        self.blocks.apply(convert_module_to_f32)
+        self.middle_block.apply(convert_module_to_f32)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        h = self.input_layer(x)
+        for block in self.blocks:
+            h = block(h)
+        h = self.middle_block(h)
+        h = self.out_layer(h)
+        return h
+class SparseStructureDecoder(nn.Module):
+    """
+    Decoder for Sparse Structure (\mathcal{D}_S in the paper Sec. 3.3).
+    Args:
+        out_channels (int): Channels of the output.
+        latent_channels (int): Channels of the latent representation.
+        num_res_blocks (int): Number of residual blocks at each resolution.
+        channels (List[int]): Channels of the decoder blocks.
+        num_res_blocks_middle (int): Number of residual blocks in the middle.
+        norm_type (Literal["group", "layer"]): Type of normalization layer.
+        use_fp16 (bool): Whether to use FP16.
+    """
+    def __init__(
+        self,
+        out_channels: int,
+        latent_channels: int,
+        num_res_blocks: int,
+        channels: List[int],
+        num_res_blocks_middle: int = 2,
+        norm_type: Literal["group", "layer"] = "layer",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+    ):
+        super().__init__()
+        self.out_channels = out_channels
+        self.latent_channels = latent_channels
+        self.num_res_blocks = num_res_blocks
+        self.channels = channels
+        self.num_res_blocks_middle = num_res_blocks_middle
+        self.norm_type = norm_type
+        self.use_fp16 = use_fp16
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        self.use_checkpoint = use_checkpoint
+        self.input_layer = nn.Conv3d(latent_channels, channels[0], 3, padding=1)
+        self.middle_block = nn.Sequential(*[
+            ResBlock3d(channels[0], channels[0])
+            for _ in range(num_res_blocks_middle)
+        ])
+        self.blocks = nn.ModuleList([])
+        for i, ch in enumerate(channels):
+            self.blocks.extend([
+                ResBlock3d(ch, ch)
+                for _ in range(num_res_blocks)
+            ])
+            if i < len(channels) - 1:
+                self.blocks.append(
+                    UpsampleBlock3d(ch, channels[i+1])
+                )
+        self.out_layer = nn.Sequential(
+            norm_layer(norm_type, channels[-1]),
+            nn.SiLU(),
+            nn.Conv3d(channels[-1], out_channels, 3, padding=1)
+        )
+        if use_fp16:
+            self.convert_to_fp16()
+    @property
+    def device(self) -> torch.device:
+        """
+        Return the device of the model.
+        """
+        return next(self.parameters()).device
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the torso of the model to float16.
+        """
+        self.use_fp16 = True
+        self.dtype = torch.float16
+        # self.blocks.apply(convert_module_to_f16)
+        # self.middle_block.apply(convert_module_to_f16)
+        self.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Convert the torso of the model to float32.
+        """
+        self.use_fp16 = False
+        self.dtype = torch.float32
+        self.blocks.apply(convert_module_to_f32)
+        self.middle_block.apply(convert_module_to_f32)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        h = self.input_layer(x)
+        h = self.middle_block(h)
+        for block in self.blocks:
+            h = block(h)
+        h = self.out_layer(h)
+        return h
+class DenseShapeVAE(nn.Module):
+    def __init__(self,
+                 embed_dim: int = 0,
+                 model_channels_encoder: list = [32, 128, 512],
+                 model_channels_decoder: list = [512, 128, 32],
+                 num_res_blocks_encoder: int = 2,
+                 num_res_blocks_middle_encoder: int = 2,
+                 num_res_blocks_decoder: int = 2,
+                 num_res_blocks_middle_decoder: int=2,
+                 in_channels: int = 1,
+                 out_channels: int = 1,
+                 use_fp16: bool = False,
+                 use_checkpoint: bool = False,
+                 latents_scale: float = 1.0,
+                 latents_shift: float = 0.0):
+        super().__init__()
+        self.use_checkpoint = use_checkpoint
+        self.latents_scale = latents_scale
+        self.latents_shift = latents_shift
+        self.encoder = SparseStructureEncoder(
+            in_channels=in_channels,
+            latent_channels=embed_dim,
+            num_res_blocks=num_res_blocks_encoder,
+            channels=model_channels_encoder,
+            num_res_blocks_middle=num_res_blocks_middle_encoder,
+            use_fp16=use_fp16,
+            use_checkpoint=use_checkpoint,
+        )
+        self.decoder = SparseStructureDecoder(
+            num_res_blocks=num_res_blocks_decoder,
+            num_res_blocks_middle=num_res_blocks_middle_decoder,
+            channels=model_channels_decoder,
+            latent_channels=embed_dim,
+            out_channels=out_channels,
+            use_fp16=use_fp16,
+            use_checkpoint=use_checkpoint,
+        )
+        self.embed_dim = embed_dim
+    def encode(self, batch, sample_posterior: bool = True):
+        x = batch['dense_index'] * 2.0 - 1.0
+        h = self.encoder(x)
+        posterior = DiagonalGaussianDistribution(h, feat_dim=1)
+        if sample_posterior:
+            z = posterior.sample()
+        else:
+            z = posterior.mode()
+        return z, posterior
+    def forward(self, batch):
+        z, posterior = self.encode(batch)
+        reconst_x = self.decoder(z)
+        outputs = {'reconst_x': reconst_x, 'posterior': posterior}
+        return outputs
+    def decode_mesh(self,
+                    latents,
+                    voxel_resolution: int = 64,
+                    mc_threshold: float = 0.5,
+                    return_index: bool = False):
+        x = self.decoder(latents)
+        if return_index:
+            outputs = []
+            for i in range(len(x)):
+                occ = x[i].sigmoid()
+                occ = (occ >= mc_threshold).float().squeeze(0)
+                index = occ.unsqueeze(0).nonzero()
+                outputs.append(index)
+        else:
+            outputs = self.dense2mesh(x, voxel_resolution=voxel_resolution, mc_threshold=mc_threshold)
+        return outputs
+    def dense2mesh(self,
+                    x: torch.FloatTensor,
+                    voxel_resolution: int = 64,
+                    mc_threshold: float = 0.5):
+        meshes = []
+        for i in range(len(x)):
+            occ = x[i].sigmoid()
+            occ = (occ >= 0.1).float().squeeze(0).cpu().detach().numpy()
+            vertices, faces, _, _ = measure.marching_cubes(
+                occ,
+                mc_threshold,
+                method="lewiner",
+            )
+            vertices = vertices / voxel_resolution * 2 - 1
+            meshes.append(trimesh.Trimesh(vertices, faces))
+        return meshes

pixal3d/models/autoencoders/distributions.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import torch
+import numpy as np
+from typing import Union, List
+class DiagonalGaussianDistribution(object):
+    def __init__(self, parameters: Union[torch.Tensor, List[torch.Tensor]], deterministic=False, feat_dim=1):
+        self.feat_dim = feat_dim
+        self.parameters = parameters
+        if isinstance(parameters, list):
+            self.mean = parameters[0]
+            self.logvar = parameters[1]
+        else:
+            self.mean, self.logvar = torch.chunk(parameters, 2, dim=feat_dim)
+        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+        self.deterministic = deterministic
+        self.std = torch.exp(0.5 * self.logvar)
+        self.var = torch.exp(self.logvar)
+        if self.deterministic:
+            self.var = self.std = torch.zeros_like(self.mean)
+    def sample(self):
+        x = self.mean + self.std * torch.randn_like(self.mean)
+        return x
+    def kl(self, other=None, dims=(1, 2, 3)):
+        if self.deterministic:
+            return torch.Tensor([0.])
+        else:
+            if other is None:
+                return 0.5 * torch.mean(torch.pow(self.mean, 2)
+                                        + self.var - 1.0 - self.logvar,
+                                        dim=dims)
+            else:
+                return 0.5 * torch.mean(
+                    torch.pow(self.mean - other.mean, 2) / other.var
+                    + self.var / other.var - 1.0 - self.logvar + other.logvar,
+                    dim=dims)
+    def nll(self, sample, dims=(1, 2, 3)):
+        if self.deterministic:
+            return torch.Tensor([0.])
+        logtwopi = np.log(2.0 * np.pi)
+        return 0.5 * torch.sum(
+            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
+            dim=dims)
+    def mode(self):
+        return self.mean

pixal3d/models/autoencoders/encoder.py ADDED Viewed

	@@ -0,0 +1,133 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from ...modules import sparse as sp
+from .base import SparseTransformerBase
+class SparseDownBlock3d(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        out_channels: Optional[int] = None,
+        num_groups: int = 32,
+        use_checkpoint: bool = False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.act_layers = nn.Sequential(
+            sp.SparseGroupNorm32(num_groups, channels),
+            sp.SparseSiLU()
+        )
+        self.down = sp.SparseDownsample(2)
+        self.out_layers = nn.Sequential(
+            sp.SparseConv3d(channels, self.out_channels, 3, padding=1),
+            sp.SparseGroupNorm32(num_groups, self.out_channels),
+            sp.SparseSiLU(),
+            sp.SparseConv3d(self.out_channels, self.out_channels, 3, padding=1),
+        )
+        if self.out_channels == channels:
+            self.skip_connection = nn.Identity()
+        else:
+            self.skip_connection = sp.SparseConv3d(channels, self.out_channels, 1)
+        self.use_checkpoint = use_checkpoint
+    def _forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
+        h = self.act_layers(x)
+        h = self.down(h)
+        x = self.down(x)
+        h = self.out_layers(h)
+        h = h + self.skip_connection(x)
+        return h
+    def forward(self, x: torch.Tensor):
+        if self.use_checkpoint:
+            return torch.utils.checkpoint.checkpoint(self._forward, x, use_reentrant=False)
+        else:
+            return self._forward(x)
+class SparseSDFEncoder(SparseTransformerBase):
+    def __init__(
+        self,
+        resolution: int,
+        in_channels: int,
+        model_channels: int,
+        latent_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4,
+        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "swin",
+        window_size: int = 8,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        qk_rms_norm: bool = False,
+    ):
+        super().__init__(
+            in_channels=in_channels,
+            model_channels=model_channels,
+            num_blocks=num_blocks,
+            num_heads=num_heads,
+            num_head_channels=num_head_channels,
+            mlp_ratio=mlp_ratio,
+            attn_mode=attn_mode,
+            window_size=window_size,
+            pe_mode=pe_mode,
+            use_fp16=use_fp16,
+            use_checkpoint=use_checkpoint,
+            qk_rms_norm=qk_rms_norm,
+        )
+        self.input_layer1 = sp.SparseLinear(1, model_channels // 16)
+        self.downsample = nn.ModuleList([
+            SparseDownBlock3d(
+                channels=model_channels//16,
+                out_channels=model_channels // 8,
+                use_checkpoint=use_checkpoint,
+            ),
+            SparseDownBlock3d(
+                channels=model_channels // 8,
+                out_channels=model_channels // 4,
+                use_checkpoint=use_checkpoint,
+            ),
+            SparseDownBlock3d(
+                channels=model_channels // 4,
+                out_channels=model_channels,
+                use_checkpoint=use_checkpoint,
+            )
+        ])
+        self.resolution = resolution
+        self.out_layer = sp.SparseLinear(model_channels, 2 * latent_channels)
+        self.initialize_weights()
+        if use_fp16:
+            self.convert_to_fp16()
+    def initialize_weights(self) -> None:
+        super().initialize_weights()
+        # Zero-out output layers:
+        nn.init.constant_(self.out_layer.weight, 0)
+        nn.init.constant_(self.out_layer.bias, 0)
+    def forward(self, x: sp.SparseTensor, factor: float = None):
+        x = self.input_layer1(x)
+        for block in self.downsample:
+            x = block(x)
+        h = super().forward(x, factor)
+        h = h.type(x.dtype)
+        h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))
+        h = self.out_layer(h)
+        return h

pixal3d/models/autoencoders/ss_vae.py ADDED Viewed

	@@ -0,0 +1,129 @@

+# -*- coding: utf-8 -*-
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import trimesh
+from skimage import measure
+from ...modules import sparse as sp
+from .encoder import SparseSDFEncoder
+from .decoder import SparseSDFDecoder
+from .distributions import DiagonalGaussianDistribution
+class SparseSDFVAE(nn.Module):
+    def __init__(self, *,
+                 embed_dim: int = 0,
+                 resolution: int = 64,
+                 model_channels_encoder: int = 512,
+                 num_blocks_encoder: int = 4,
+                 num_heads_encoder: int = 8,
+                 num_head_channels_encoder: int = 64,
+                 model_channels_decoder: int = 512,
+                 num_blocks_decoder: int = 4,
+                 num_heads_decoder: int = 8,
+                 num_head_channels_decoder: int = 64,
+                 out_channels: int = 1,
+                 use_fp16: bool = False,
+                 use_checkpoint: bool = False,
+                 chunk_size: int = 1,
+                 latents_scale: float = 1.0,
+                 latents_shift: float = 0.0):
+        super().__init__()
+        self.use_checkpoint = use_checkpoint
+        self.resolution = resolution
+        self.latents_scale = latents_scale
+        self.latents_shift = latents_shift
+        self.encoder = SparseSDFEncoder(
+            resolution=resolution,
+            in_channels=model_channels_encoder,
+            model_channels=model_channels_encoder,
+            latent_channels=embed_dim,
+            num_blocks=num_blocks_encoder,
+            num_heads=num_heads_encoder,
+            num_head_channels=num_head_channels_encoder,
+            use_fp16=use_fp16,
+            use_checkpoint=use_checkpoint,
+        )
+        self.decoder = SparseSDFDecoder(
+            resolution=resolution,
+            model_channels=model_channels_decoder,
+            latent_channels=embed_dim,
+            num_blocks=num_blocks_decoder,
+            num_heads=num_heads_decoder,
+            num_head_channels=num_head_channels_decoder,
+            out_channels=out_channels,
+            use_fp16=use_fp16,
+            use_checkpoint=use_checkpoint,
+            chunk_size=chunk_size,
+        )
+        self.embed_dim = embed_dim
+    def forward(self, batch):
+        z, posterior = self.encode(batch)
+        reconst_x = self.decoder(z)
+        outputs = {'reconst_x': reconst_x, 'posterior': posterior}
+        return outputs
+    def encode(self, batch, sample_posterior: bool = True):
+        feat, xyz, batch_idx = batch['sparse_sdf'], batch['sparse_index'], batch['batch_idx']
+        if feat.ndim == 1:
+            feat = feat.unsqueeze(-1)
+        coords = torch.cat([batch_idx.unsqueeze(-1), xyz], dim=-1).int()
+        x = sp.SparseTensor(feat, coords)
+        h = self.encoder(x, batch.get('factor', None))
+        posterior = DiagonalGaussianDistribution(h.feats, feat_dim=1)
+        if sample_posterior:
+            z = posterior.sample()
+        else:
+            z = posterior.mode()
+        z = h.replace(z)
+        return z, posterior
+    def decode_mesh(self,
+                    latents,
+                    voxel_resolution: int = 512,
+                    mc_threshold: float = 0.2,
+                    return_feat: bool = False,
+                    factor: float = 1.0):
+        voxel_resolution = int(voxel_resolution / factor)
+        reconst_x = self.decoder(latents, factor=factor, return_feat=return_feat)
+        if return_feat:
+            return reconst_x
+        outputs = self.sparse2mesh(reconst_x, voxel_resolution=voxel_resolution, mc_threshold=mc_threshold)
+        return outputs
+    def sparse2mesh(self,
+                    reconst_x: torch.FloatTensor,
+                    voxel_resolution: int = 512,
+                    mc_threshold: float = 0.0):
+        sparse_sdf, sparse_index = reconst_x.feats.float(), reconst_x.coords
+        batch_size = int(sparse_index[..., 0].max().cpu().numpy() + 1)
+        meshes = []
+        for i in range(batch_size):
+            idx = sparse_index[..., 0] == i
+            sparse_sdf_i, sparse_index_i = sparse_sdf[idx].squeeze(-1).cpu(),  sparse_index[idx][..., 1:].detach().cpu()
+            sdf = torch.ones((voxel_resolution, voxel_resolution, voxel_resolution))
+            sdf[sparse_index_i[..., 0], sparse_index_i[..., 1], sparse_index_i[..., 2]] = sparse_sdf_i
+            vertices, faces, _, _ = measure.marching_cubes(
+                sdf.numpy(),
+                mc_threshold,
+                method="lewiner",
+            )
+            vertices = vertices / voxel_resolution * 2 - 1
+            meshes.append(trimesh.Trimesh(vertices, faces))
+        return meshes

pixal3d/models/conditional_encoders/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from . import dinov2_project_grid
2	+

pixal3d/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (238 Bytes). View file

pixal3d/models/conditional_encoders/__pycache__/dinov2_project_grid.cpython-310.pyc ADDED Viewed

Binary file (16 kB). View file

pixal3d/models/conditional_encoders/dinov2_project_grid.py ADDED Viewed

	@@ -0,0 +1,750 @@

+"""
+DINOv2 Project Grid Encoders
+Includes single-view and multi-view DINOv2 encoders with 3D grid projection support
+"""
+import random
+from dataclasses import dataclass
+from typing import List, Dict, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision import transforms
+from diffusers.models.modeling_utils import ModelMixin
+import pixal3d
+from pixal3d.utils.base import BaseModule
+# Set linear algebra backend to avoid cusolver errors
+# NOTE(review): the comment above says the goal is to avoid cusolver errors, yet the
+# call below selects "cusolver" as the preferred backend — confirm which is intended.
+try:
+    torch.backends.cuda.preferred_linalg_library("cusolver")
+except Exception:
+    # Best effort: any failure while setting the backend is deliberately ignored.
+    pass
+# =============================================================================
+# Base DINOv2 Encoder
+# =============================================================================
+@pixal3d.register("dinov2-encoder")
+class DinoEncoder(BaseModule, ModelMixin):
+    """
+    DINOv2 image encoder returning layer-normalised ``x_prenorm`` tokens.
+    The backbone is fetched with ``torch.hub.load`` in ``configure`` and kept in
+    eval mode; images are resized, centre-cropped and normalised before encoding.
+    """
+    @dataclass
+    class Config(BaseModule.Config):
+        # Repository and entry point handed to torch.hub.load.
+        model: str = "facebookresearch/dinov2"
+        version: str = "dinov2_vitl14_reg"
+        # Target size used by both the Resize and the CenterCrop transforms.
+        size: int = 518
+        # Probability of zeroing the embeddings when forward() runs with is_training=True.
+        empty_embeds_ratio: float = 0.1
+    cfg: Config
+    def configure(self) -> None:
+        """Load the DINOv2 backbone and build the image preprocessing pipeline."""
+        super().configure()
+        self.empty_embeds_ratio = self.cfg.empty_embeds_ratio
+        # Load DINOv2 model
+        dino_model = torch.hub.load(
+            self.cfg.model, self.cfg.version, pretrained=True
+        )
+        self.encoder = dino_model.eval()
+        # Image preprocessing
+        self.transform = transforms.Compose([
+            transforms.Resize(
+                self.cfg.size,
+                transforms.InterpolationMode.BILINEAR,
+                antialias=True
+            ),
+            transforms.CenterCrop(self.cfg.size),
+            transforms.Normalize(
+                mean=[0.485, 0.456, 0.406],
+                std=[0.229, 0.224, 0.225],
+            ),
+        ])
+    def forward(self, image, image_mask=None, is_training=False):
+        """
+        Encode a batch of images into DINOv2 tokens.
+        Args:
+            image: image batch passed through ``self.transform``
+                (presumably [B, 3, H, W] - TODO confirm against callers).
+            image_mask: optional mask; when given it is max-pooled with a
+                14x14 window and thresholded at 0 (presumably [B, 1, H, W],
+                since dim 1 is squeezed - TODO confirm).
+            is_training: when True, the tokens are zeroed with probability
+                ``empty_embeds_ratio``.
+        Returns:
+            ``z`` alone, or ``(z, image_mask_patch)`` when ``image_mask`` is given.
+        """
+        # The backbone is always called with is_training=True, independent of the
+        # ``is_training`` argument, and its 'x_prenorm' entry is used.
+        z = self.encoder(self.transform(image), is_training=True)['x_prenorm']
+        # Affine-free layer norm over the channel (last) dimension.
+        z = F.layer_norm(z, z.shape[-1:])
+        if is_training and random.random() < self.empty_embeds_ratio:
+            # zero out embeddings
+            z = z * 0
+        if image_mask is not None:
+            # A pooled cell is True when any mask value inside its 14x14 window is > 0.
+            image_mask_patch = F.max_pool2d(
+                image_mask, kernel_size=14, stride=14
+            ).squeeze(1) > 0
+            return z, image_mask_patch
+        return z
+# =============================================================================
+# 3D Projection Utility Functions
+# =============================================================================
+def project_points_to_image_batch(
+    points_3d: torch.Tensor,
+    transform_matrix: torch.Tensor,
+    camera_angle_x: torch.Tensor,
+    resolution: int = 518
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Project 3D points to 2D image coordinates with batch support
+    Args:
+        points_3d: [N, 3] or [B, N, 3], 3D point coordinates (in [-1, 1] range)
+        transform_matrix: [B, 4, 4], batch of camera transformation matrices
+        camera_angle_x: [B], batch of camera horizontal FOV angles (radians)
+        resolution: Rendering image resolution
+    Returns:
+        points_2d: [B, N, 2], image coordinates [x, y]
+        depth: [B, N], depth values
+        valid_mask: [B, N], mask indicating if points are within view
+    """
+    device = points_3d.device
+    B = transform_matrix.shape[0]
+    # Ensure inputs are torch.Tensor
+    if not isinstance(transform_matrix, torch.Tensor):
+        transform_matrix = torch.tensor(
+            transform_matrix, dtype=torch.float32, device=device
+        )
+    if not isinstance(points_3d, torch.Tensor):
+        points_3d = torch.tensor(
+            points_3d, dtype=torch.float32, device=device
+        )
+    if not isinstance(camera_angle_x, torch.Tensor):
+        camera_angle_x = torch.tensor(
+            camera_angle_x, dtype=torch.float32, device=device
+        )
+    # Expand points_3d to batch dimension
+    if points_3d.dim() == 2:
+        points_3d_batch = points_3d.unsqueeze(0).expand(B, -1, -1)
+    else:
+        points_3d_batch = points_3d
+    N = points_3d_batch.shape[1]
+    # Add homogeneous coordinates
+    ones = torch.ones(B, N, 1, device=device)
+    points_homogeneous = torch.cat([points_3d_batch, ones], dim=-1)
+    # World to camera transformation
+    world_to_camera = torch.linalg.inv(transform_matrix)
+    points_camera = torch.bmm(
+        points_homogeneous,
+        world_to_camera.transpose(-2, -1)
+    )[..., :3]
+    # Extract camera coordinates
+    x_cam = points_camera[..., 0]
+    y_cam = points_camera[..., 1]
+    z_cam = points_camera[..., 2]
+    # Depth values
+    depth = -z_cam
+    # Compute camera intrinsics
+    sensor_width = 32.0
+    focal_length = 16.0 / torch.tan(camera_angle_x / 2.0)
+    focal_length_pixels = focal_length * resolution / sensor_width
+    focal_length_pixels = focal_length_pixels.unsqueeze(1)
+    # Perspective projection
+    x_ndc = focal_length_pixels * x_cam / (-z_cam)
+    y_ndc = focal_length_pixels * y_cam / (-z_cam)
+    # Convert to image coordinates
+    x_pixel = x_ndc + resolution / 2.0
+    y_pixel = -y_ndc + resolution / 2.0
+    # Validity mask
+    valid_mask = (
+        (x_pixel >= 0) & (x_pixel < resolution) &
+        (y_pixel >= 0) & (y_pixel < resolution) &
+        (depth > 0)
+    )
+    points_2d = torch.stack([x_pixel, y_pixel], dim=-1)
+    return points_2d, depth, valid_mask
+def project_points_to_image(
+    points_3d: torch.Tensor,
+    transform_matrix: torch.Tensor,
+    camera_angle_x: float,
+    resolution: int = 512
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Project 3D points to 2D image coordinates (single-view version)
+    Args:
+        points_3d: [N, 3], 3D point coordinates
+        transform_matrix: [4, 4], camera transformation matrix
+        camera_angle_x: Camera horizontal FOV angle (radians)
+        resolution: Rendering image resolution
+    Returns:
+        points_2d: [N, 2], image coordinates [x, y]
+        depth: [N], depth values
+        valid_mask: [N], mask indicating if points are within view
+    """
+    device = points_3d.device
+    if not isinstance(transform_matrix, torch.Tensor):
+        transform_matrix = torch.tensor(
+            transform_matrix, dtype=torch.float32, device=device
+        )
+    if not isinstance(points_3d, torch.Tensor):
+        points_3d = torch.tensor(
+            points_3d, dtype=torch.float32, device=device
+        )
+    N = points_3d.shape[0]
+    points_homogeneous = torch.cat([
+        points_3d,
+        torch.ones(N, 1, device=device)
+    ], dim=1)
+    # World to camera transformation
+    camera_to_world = transform_matrix
+    world_to_camera = torch.linalg.inv(camera_to_world)
+    points_camera = torch.matmul(
+        points_homogeneous,
+        world_to_camera.T
+    )[:, :3]
+    x_cam = points_camera[:, 0]
+    y_cam = points_camera[:, 1]
+    z_cam = points_camera[:, 2]
+    depth = -z_cam
+    # Camera intrinsics
+    sensor_width = 32.0
+    focal_length = 16.0 / torch.tan(torch.tensor(camera_angle_x / 2.0))
+    focal_length_pixels = focal_length * resolution / sensor_width
+    # Perspective projection
+    x_ndc = focal_length_pixels * x_cam / (-z_cam)
+    y_ndc = focal_length_pixels * y_cam / (-z_cam)
+    # Image coordinates
+    x_pixel = x_ndc + resolution / 2.0
+    y_pixel = -y_ndc + resolution / 2.0
+    valid_mask = (
+        (x_pixel >= 0) & (x_pixel < resolution) &
+        (y_pixel >= 0) & (y_pixel < resolution) &
+        (depth > 0)
+    )
+    points_2d = torch.stack([x_pixel, y_pixel], dim=1)
+    return points_2d, depth, valid_mask
+def sample_features(
+    fmap: torch.Tensor,
+    queries_ndc: torch.Tensor
+) -> torch.Tensor:
+    """
+    Sample features using grid_sample
+    Args:
+        fmap: [B, C, H, W], feature map
+        queries_ndc: [B, K, 2], NDC coordinates
+    Returns:
+        feat: [B, C, K], sampled features
+    """
+    B, C, H, W = fmap.shape
+    Bq, K, _ = queries_ndc.shape
+    assert Bq == B, "batch 不一致"
+    grid = queries_ndc.view(B, K, 1, 2)
+    feat = F.grid_sample(
+        fmap, grid, mode='bilinear',
+        align_corners=False, padding_mode='border'
+    )
+    return feat.squeeze(-1)
+# =============================================================================
+# Projection Grid Module
+# =============================================================================
+class ProjGrid(nn.Module):
+    """
+    Samples a 2D feature map at the image projections of a fixed 3D grid.
+    A ``grid_resolution``^3 lattice spanning [-1, 1] per axis is built once,
+    rotated, flattened to [K, 3] and stored as a buffer. ``forward`` rescales it,
+    projects it with ``project_points_to_image_batch`` and reads the feature map
+    at the projected locations with ``sample_features``.
+    """
+    def __init__(self, grid_resolution: int = 16):
+        """
+        Args:
+            grid_resolution: number of lattice points along each grid axis.
+        """
+        super().__init__()
+        self.grid_resolution = grid_resolution
+        # Image resolution (pixels) assumed by the projection in forward().
+        self.image_resolution = 518
+        # Create 3D grid points
+        one_dim = torch.linspace(-1, 1, grid_resolution)
+        x, y, z = torch.meshgrid(one_dim, one_dim, one_dim, indexing='ij')
+        grid_points = torch.stack((x, y, z), dim=-1)
+        # Rotation matrix (align with Blender)
+        rotation_matrix = torch.tensor([
+            [1.0, 0.0, 0.0],
+            [0.0, 0.0, -1.0],
+            [0.0, 1.0, 0.0]
+        ])
+        grid_points = torch.matmul(grid_points, rotation_matrix.T)
+        grid_points = grid_points.reshape(-1, 3)
+        self.register_buffer('grid_points', grid_points)
+        # Front view transformation matrix
+        # (entry [1, 3] is overwritten with -distance in forward()).
+        front_view_transform_matrix = torch.tensor([
+            [1.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, -1.0, -2.0],
+            [0.0, 1.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 1.0]
+        ])
+        self.register_buffer(
+            "front_view_transform_matrix",
+            front_view_transform_matrix
+        )
+    def forward(
+        self,
+        features_map: torch.Tensor,
+        camera_angle_x: torch.Tensor,
+        distance: torch.Tensor,
+        mesh_scale: torch.Tensor,
+        transform_matrix: torch.Tensor = None,
+        BHWC: bool = True
+    ) -> torch.Tensor:
+        """
+        Project feature map to 3D grid
+        Args:
+            features_map: [B, H, W, C] or [B, C, H, W]
+            camera_angle_x: [B]
+            distance: [B]; only used when ``transform_matrix`` is None
+            mesh_scale: [B]; the grid points are divided by ``mesh_scale * 2``
+            transform_matrix: [B, 4, 4] or None (None selects the front view
+                matrix with entry [1, 3] set to ``-distance``)
+            BHWC: Whether input is in BHWC format
+        Returns:
+            x: [B, K, C], projected features
+        """
+        # Only B is used below; H, W and C are unpacked but not read again.
+        if BHWC:
+            B, H, W, C = features_map.shape
+        else:
+            B, C, H, W = features_map.shape
+        # Prepare grid points
+        grid_points = self.grid_points.expand(B, -1, -1)
+        grid_points = grid_points / mesh_scale.unsqueeze(-1).unsqueeze(-1) / 2
+        # Use default transformation matrix
+        if transform_matrix is None:
+            transform_matrix = self.front_view_transform_matrix
+            # clone() so the in-place write below does not touch the registered buffer.
+            transform_matrix = transform_matrix.expand(B, -1, -1).clone()
+            transform_matrix[:, 1, 3] = -distance
+        # Project to image
+        # (depth and valid_mask are returned but not used afterwards).
+        image_points, depth, valid_mask = project_points_to_image_batch(
+            grid_points, transform_matrix, camera_angle_x, self.image_resolution
+        )
+        # Normalize to [-1, 1]
+        image_points_norm = (image_points + 0.5) / self.image_resolution * 2 - 1
+        # Adjust dimensions and sample
+        if BHWC:
+            features_map = features_map.permute(0, 3, 1, 2)
+        x = sample_features(features_map, image_points_norm)
+        # [B, C, K] -> [B, K, C]
+        x = x.permute(0, 2, 1)
+        return x
+# =============================================================================
+# DINOv2 Encoder with Projection
+# =============================================================================
+@pixal3d.register("dinov2-encoder-proj")
+class DinoEncoderProj(BaseModule, ModelMixin):
+    """
+    DINOv2 Encoder with 3D Grid Projection
+    Encodes a single view with DINOv2, projects the patch tokens onto a 3D grid
+    through ``ProjGrid`` and returns them together with the CLS/register tokens.
+    """
+    @dataclass
+    class Config(BaseModule.Config):
+        # Repository and entry point handed to torch.hub.load.
+        model: str = "facebookresearch/dinov2"
+        version: str = "dinov2_vitl14_reg"
+        # Input image size; divided by the backbone patch size to get the patch grid side.
+        size: int = 518
+        # Probability of zeroing both outputs when forward() runs with is_training=True.
+        empty_embeds_ratio: float = 0.1
+        # Number of grid points per axis of the projection grid.
+        grid_resolution: int = 16
+        # When True, an upsampler is loaded and its projected output is added to the result.
+        use_upsample: bool = False
+        # NOTE(review): not read anywhere in this class as shown here.
+        use_geo_feats: bool = False
+    cfg: Config
+    def configure(self) -> None:
+        """Load the backbone (and optional upsampler) and build the projection grid."""
+        super().configure()
+        self.grid_resolution = self.cfg.grid_resolution
+        self.empty_embeds_ratio = self.cfg.empty_embeds_ratio
+        self.use_upsample = self.cfg.use_upsample
+        # Load DINOv2
+        dino_model = torch.hub.load(
+            self.cfg.model, self.cfg.version, pretrained=True
+        )
+        self.encoder = dino_model.eval()
+        # Optional: load upsampler
+        if self.use_upsample:
+            upsampler = torch.hub.load("valeoai/NAF", "naf", pretrained=True)
+            self.upsampler = upsampler.eval()
+        # Image preprocessing (normalization only)
+        self.transform = transforms.Compose([
+            transforms.Normalize(
+                mean=[0.485, 0.456, 0.406],
+                std=[0.229, 0.224, 0.225],
+            ),
+        ])
+        self.patch_size = self.encoder.patch_size
+        self.patch_number = self.cfg.size // self.patch_size
+        self.proj_grid = ProjGrid(grid_resolution=self.cfg.grid_resolution)
+    def forward(
+        self,
+        image: torch.Tensor,
+        image_mask: torch.Tensor = None,
+        camera_angle_x: torch.Tensor = None,
+        distance: torch.Tensor = None,
+        mesh_scale: torch.Tensor = None,
+        transform_matrix: torch.Tensor = None,
+        is_training: bool = False
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Forward pass
+        Args:
+            image: [B, C, H, W]
+            image_mask: accepted but not used in this method
+            camera_angle_x: [B]
+            distance: [B]
+            mesh_scale: [B]
+            transform_matrix: accepted but not forwarded to the projection
+                (NOTE(review): confirm this is intentional)
+            is_training: Training mode flag
+        Returns:
+            z_global: [B, num_global, C]
+            z: [B, grid_resolution^3, C]
+        """
+        image = self.transform(image)
+        # Encoding and projection run without gradient tracking.
+        with torch.no_grad():
+            z = self.encoder(image, is_training=True)['x_prenorm']
+            z = F.layer_norm(z, z.shape[-1:])
+            # Split tokens
+            z_clstoken = z[:, 0:1]
+            z_regtokens = z[:, 1:self.encoder.num_register_tokens + 1]
+            z_patchtokens = z[:, 1 + self.encoder.num_register_tokens:]
+            # Reshape the patch tokens to [B, patch_number, patch_number, C].
+            z_patchtokens = z_patchtokens.reshape(
+                z_patchtokens.shape[0],
+                self.patch_number,
+                self.patch_number,
+                -1
+            )
+            # Project to grid
+            z = self.proj_grid(
+                z_patchtokens, camera_angle_x, distance, mesh_scale
+            )
+            # Optional: upsample and fuse
+            if self.use_upsample:
+                z_patchtokens_permuted = z_patchtokens.permute(0, 3, 1, 2)
+                z_upsampled = self.upsampler(
+                    image, z_patchtokens_permuted, output_size=(518, 518)
+                )
+                z_upsampled = self.proj_grid(
+                    z_upsampled, camera_angle_x, distance, mesh_scale, BHWC=False
+                )
+                z = z + z_upsampled
+        # Global tokens
+        z_global = torch.cat([z_clstoken, z_regtokens], dim=1)
+        z_global = z_global.expand(z.shape[0], -1, -1)
+        # Classifier-free guidance: random drop
+        if is_training and random.random() < self.empty_embeds_ratio:
+            z_global = z_global * 0
+            z = z * 0
+        return z_global, z
+# =============================================================================
+# Multi-View Projection Encoder Helper Functions
+# =============================================================================
+def compute_calc_mat(
+    true_view_mat: torch.Tensor,
+    ext_true_view_mat: torch.Tensor,
+    fix_mat: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Compute calc_mat using matrix relative transformation
+    Args:
+        true_view_mat: [B, 1, 4, 4], ground truth camera matrix
+        ext_true_view_mat: [B, N, 4, 4], extended ground truth camera matrices
+        fix_mat: [B, 1, 4, 4], fixed matrix
+    Returns:
+        calc_mat: [B, N, 4, 4]
+        relative_transform: [B, N, 4, 4]
+    """
+    B, N = ext_true_view_mat.shape[:2]
+    # Expand to [B, N, 4, 4]
+    true_view_mat_exp = true_view_mat.expand(B, N, 4, 4)
+    fix_mat_exp = fix_mat.expand(B, N, 4, 4)
+    # Flatten to [B*N, 4, 4]
+    true_view_mat_flat = true_view_mat_exp.reshape(B * N, 4, 4)
+    ext_true_view_mat_flat = ext_true_view_mat.reshape(B * N, 4, 4)
+    fix_mat_flat = fix_mat_exp.reshape(B * N, 4, 4)
+    # Compute relative transformation (disable autocast for fp32 precision)
+    with torch.amp.autocast('cuda', enabled=False):
+        true_view_mat_flat = true_view_mat_flat.float()
+        ext_true_view_mat_flat = ext_true_view_mat_flat.float()
+        fix_mat_flat = fix_mat_flat.float()
+        relative_transform_flat = torch.bmm(
+            torch.linalg.inv(true_view_mat_flat),
+            ext_true_view_mat_flat
+        )
+        calc_mat_flat = torch.bmm(fix_mat_flat, relative_transform_flat)
+    calc_mat = calc_mat_flat.view(B, N, 4, 4)
+    relative_transform = relative_transform_flat.view(B, N, 4, 4)
+    return calc_mat, relative_transform
+# =============================================================================
+# Multi-View DINOv2 Projection Encoder
+# =============================================================================
+@pixal3d.register("dinov2-encoder-proj-multi-view")
+class DinoEncoderProjMultiView(BaseModule, ModelMixin):
+    """
+    Multi-View DINOv2 Projection Encoder
+    Encodes every view with DINOv2, projects each view's patch tokens onto the
+    shared 3D grid using a camera matrix expressed relative to the first view,
+    and averages the projected features (and the global tokens) over the views.
+    """
+    @dataclass
+    class Config(BaseModule.Config):
+        # Repository and entry point handed to torch.hub.load.
+        model: str = "facebookresearch/dinov2"
+        version: str = "dinov2_vitl14_reg"
+        # Input image size; divided by the backbone patch size to get the patch grid side.
+        size: int = 518
+        # Probability of zeroing both outputs when forward() runs with is_training=True.
+        empty_embeds_ratio: float = 0.1
+        # Number of grid points per axis of the projection grid.
+        grid_resolution: int = 16
+        # When True, an upsampler is loaded and its projected output is added per view.
+        use_upsample: bool = False
+    cfg: Config
+    def configure(self) -> None:
+        """Load the backbone (and optional upsampler) and build the projection grid."""
+        super().configure()
+        self.grid_resolution = self.cfg.grid_resolution
+        self.empty_embeds_ratio = self.cfg.empty_embeds_ratio
+        self.use_upsample = self.cfg.use_upsample
+        # Load DINOv2
+        dino_model = torch.hub.load(
+            self.cfg.model, self.cfg.version, pretrained=True
+        )
+        self.encoder = dino_model.eval()
+        # Optional: upsampler
+        if self.use_upsample:
+            upsampler = torch.hub.load("valeoai/NAF", "naf", pretrained=True)
+            self.upsampler = upsampler.eval()
+        # Image preprocessing
+        self.transform = transforms.Compose([
+            transforms.Normalize(
+                mean=[0.485, 0.456, 0.406],
+                std=[0.229, 0.224, 0.225],
+            ),
+        ])
+        self.patch_size = self.encoder.patch_size
+        self.patch_number = self.cfg.size // self.patch_size
+        self.proj_grid = ProjGrid(grid_resolution=self.cfg.grid_resolution)
+        # Fixed transformation matrix
+        # (entry [1, 3] is replaced by -distance of the first view in get_relative_transform).
+        self.register_buffer("fix_transform_matrix", torch.tensor([
+            [1.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, -1.0, -2.0],
+            [0.0, 1.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 1.0]
+        ]))
+    def forward(
+        self,
+        image: torch.Tensor,
+        image_mask: torch.Tensor = None,
+        camera_angle_x: torch.Tensor = None,
+        distance: torch.Tensor = None,
+        mesh_scale: torch.Tensor = None,
+        transform_matrix: torch.Tensor = None,
+        is_training: bool = False
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Forward pass
+        Args:
+            image: [B, num_views, C, H, W]
+            image_mask: accepted but not used in this method
+            camera_angle_x: [B, num_views]
+            distance: [B, num_views]
+            mesh_scale: [B]
+            transform_matrix: [B, num_views, 4, 4]
+            is_training: when True, both outputs are zeroed with probability
+                ``empty_embeds_ratio``
+        Returns:
+            z_global: [B, num_global, C]
+            z: [B, grid_resolution^3, C]
+        """
+        B, num_views, C, H, W = image.shape
+        # Merge batch and view axes: row b * num_views + v holds view v of sample b.
+        image = image.reshape(B * num_views, C, H, W)
+        image = self.transform(image)
+        with torch.no_grad():
+            z = self.encoder(image, is_training=True)['x_prenorm']
+            z = F.layer_norm(z, z.shape[-1:])
+            z_clstoken = z[:, 0:1]
+            z_regtokens = z[:, 1:self.encoder.num_register_tokens + 1]
+            z_patchtokens = z[:, 1 + self.encoder.num_register_tokens:]
+            # Reshape the patch tokens to [B * num_views, patch_number, patch_number, C].
+            z_patchtokens = z_patchtokens.reshape(
+                z_patchtokens.shape[0],
+                self.patch_number,
+                self.patch_number,
+                -1
+            )
+        # Compute relative transformation
+        # (relative_transform is returned but not used below).
+        calc_mat, relative_transform = self.get_relative_transform(
+            transform_matrix, distance
+        )
+        calc_mat = calc_mat.reshape(B * num_views, 4, 4)
+        # Prepare parameters
+        init_mesh_scale = mesh_scale[:, None].expand(B, num_views).reshape(B * num_views)
+        camera_angle_x_flat = camera_angle_x.reshape(B * num_views)
+        distance_flat = distance.reshape(B * num_views)
+        # Accumulate per-view (avoid OOM)
+        z_accumulated = None
+        z_patchtokens_permuted = z_patchtokens.permute(0, 3, 1, 2) if self.use_upsample else None
+        with torch.no_grad():
+            for view_idx in range(num_views):
+                # Rows of the flattened batch that belong to this view, one per sample.
+                indices = torch.arange(
+                    view_idx, B * num_views, num_views, device=z_patchtokens.device
+                )
+                # Project current view
+                z_view = self.proj_grid(
+                    z_patchtokens[indices],
+                    camera_angle_x_flat[indices],
+                    distance_flat[indices],
+                    init_mesh_scale[indices],
+                    calc_mat[indices]
+                )
+                # Optional: upsample
+                if self.use_upsample:
+                    chunk_upsampled = self.upsampler(
+                        image[indices],
+                        z_patchtokens_permuted[indices],
+                        output_size=(518, 518)
+                    )
+                    chunk_proj = self.proj_grid(
+                        chunk_upsampled,
+                        camera_angle_x_flat[indices],
+                        distance_flat[indices],
+                        init_mesh_scale[indices],
+                        calc_mat[indices],
+                        BHWC=False
+                    )
+                    z_view = z_view + chunk_proj
+                    # Drop the references to the per-view intermediates before the next view.
+                    del chunk_upsampled, chunk_proj
+                # Accumulate
+                if z_accumulated is None:
+                    z_accumulated = z_view.clone()
+                else:
+                    z_accumulated = z_accumulated + z_view
+                del z_view
+        if z_patchtokens_permuted is not None:
+            del z_patchtokens_permuted
+        # Average
+        z = z_accumulated / num_views
+        # Average global tokens
+        z_global = torch.cat([z_clstoken, z_regtokens], dim=1)
+        z_global = z_global.reshape(B, num_views, z_global.shape[-2], z_global.shape[-1])
+        z_global = z_global.mean(dim=1)
+        # Classifier-free guidance
+        if is_training and random.random() < self.empty_embeds_ratio:
+            z_global = z_global * 0
+            z = z * 0
+        return z_global, z
+    def get_relative_transform(
+        self,
+        transform_matrix: torch.Tensor,
+        distance: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Compute relative transformation matrix
+        The first view (index 0) is the reference: every view matrix is expressed
+        relative to it and then left-multiplied by the fixed matrix whose [1, 3]
+        entry is set to ``-distance[:, 0]``.
+        Args:
+            transform_matrix: [B, num_views, 4, 4]
+            distance: [B, num_views]
+        Returns:
+            calc_mat: [B, num_views, 4, 4]
+            relative_transform: [B, num_views, 4, 4]
+        """
+        B, num_views, _, _ = transform_matrix.shape
+        init_transform_matrix = transform_matrix[:, 0:1]
+        # clone() so the in-place write below does not touch the registered buffer.
+        fix_transform_matrix = self.fix_transform_matrix.unsqueeze(0).expand(B, -1, -1).clone()
+        init_distance = distance[:, 0]
+        fix_transform_matrix[:, 1, 3] = -init_distance
+        fix_transform_matrix = fix_transform_matrix.unsqueeze(1)
+        calc_mat, relative_transform = compute_calc_mat(
+            init_transform_matrix, transform_matrix, fix_transform_matrix
+        )
+        return calc_mat, relative_transform

pixal3d/models/transformers/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from . import sparse_dit
2	+ from . import dense_dit

pixal3d/models/transformers/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (253 Bytes). View file

pixal3d/models/transformers/__pycache__/dense_dit.cpython-310.pyc ADDED Viewed

Binary file (9.49 kB). View file

pixal3d/models/transformers/__pycache__/sparse_dit.cpython-310.pyc ADDED Viewed

Binary file (13 kB). View file

pixal3d/models/transformers/dense_dit.py ADDED Viewed

	@@ -0,0 +1,298 @@

+from typing import *
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from ...modules.utils import convert_module_to_f16, convert_module_to_f32
+from ...modules.transformer import AbsolutePositionEmbedder, ModulatedTransformerCrossBlock
+from ...modules.spatial import patchify, unpatchify
+from ...utils.base import BaseModule
+import pixal3d
+from huggingface_hub import hf_hub_download
+import os
+class TimestepEmbedder(nn.Module):
+    """
+    Embeds scalar timesteps into vector representations.
+    """
+    def __init__(self, hidden_size, frequency_embedding_size=256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10000):
+        """
+        Create sinusoidal timestep embeddings.
+        Args:
+            t: a 1-D Tensor of N indices, one per batch element.
+                These may be fractional.
+            dim: the dimension of the output.
+            max_period: controls the minimum frequency of the embeddings.
+        Returns:
+            an (N, D) Tensor of positional embeddings.
+        """
+        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+        half = dim // 2
+        freqs = torch.exp(
+            -np.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+        ).to(device=t.device)
+        args = t[:, None].float() * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+        return embedding
+    def forward(self, t):
+        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+        t_freq = t_freq.to(self.mlp[0].weight.dtype)
+        t_emb = self.mlp(t_freq)
+        return t_emb
+class DenseDiT(nn.Module):
+    def __init__(
+        self,
+        resolution: int,
+        in_channels: int,
+        model_channels: int,
+        cond_channels: int,
+        out_channels: int,
+        num_blocks: int,
+        num_heads: Optional[int] = None,
+        num_head_channels: Optional[int] = 64,
+        mlp_ratio: float = 4,
+        patch_size: int = 2,
+        pe_mode: Literal["ape", "rope"] = "ape",
+        use_fp16: bool = False,
+        use_checkpoint: bool = False,
+        share_mod: bool = False,
+        qk_rms_norm: bool = False,
+        qk_rms_norm_cross: bool = False,
+        latent_shape: list = [8, 16, 16, 16],
+        image_attn_mode:str = "cross",
+        load_ckpt:bool = True,
+    ):
+        super().__init__()
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.cond_channels = cond_channels
+        self.out_channels = out_channels
+        self.num_blocks = num_blocks
+        self.num_heads = num_heads or model_channels // num_head_channels
+        self.mlp_ratio = mlp_ratio
+        self.patch_size = patch_size
+        self.pe_mode = pe_mode
+        self.use_fp16 = use_fp16
+        self.use_checkpoint = use_checkpoint
+        self.share_mod = share_mod
+        self.qk_rms_norm = qk_rms_norm
+        self.qk_rms_norm_cross = qk_rms_norm_cross
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        self.latent_shape = latent_shape
+        self.image_attn_mode = image_attn_mode
+        self.t_embedder = TimestepEmbedder(model_channels)
+        if share_mod:
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(model_channels, 6 * model_channels, bias=True)
+            )
+        if pe_mode == "ape":
+            pos_embedder = AbsolutePositionEmbedder(model_channels, 3)
+            coords = torch.meshgrid(*[torch.arange(res, device=self.device) for res in [resolution // patch_size] * 3], indexing='ij')
+            coords = torch.stack(coords, dim=-1).reshape(-1, 3)
+            pos_emb = pos_embedder(coords)
+            self.register_buffer("pos_emb", pos_emb)
+        self.input_layer = nn.Linear(in_channels * patch_size**3, model_channels)
+        self.blocks = nn.ModuleList([
+            ModulatedTransformerCrossBlock(
+                model_channels,
+                cond_channels,
+                num_heads=self.num_heads,
+                mlp_ratio=self.mlp_ratio,
+                attn_mode='full',
+                use_checkpoint=self.use_checkpoint,
+                use_rope=(pe_mode == "rope"),
+                share_mod=share_mod,
+                qk_rms_norm=self.qk_rms_norm,
+                qk_rms_norm_cross=self.qk_rms_norm_cross,
+                image_attn_mode = self.image_attn_mode,
+            )
+            for _ in range(num_blocks)
+        ])
+        self.out_layer = nn.Linear(model_channels, out_channels * patch_size**3)
+        self.initialize_weights()
+        if use_fp16:
+            self.convert_to_fp16()
+    @property
+    def device(self) -> torch.device:
+        """
+        Return the device of the model.
+        """
+        return next(self.parameters()).device
+    def convert_to_fp16(self) -> None:
+        """
+        Convert the torso of the model to float16.
+        """
+        # self.blocks.apply(convert_module_to_f16)
+        self.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """
+        Convert the torso of the model to float32.
+        """
+        self.blocks.apply(convert_module_to_f32)
+    def initialize_weights(self) -> None:
+        # Initialize transformer layers:
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+        # Initialize timestep embedding MLP:
+        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+        # Zero-out adaLN modulation layers in DiT blocks:
+        if self.share_mod:
+            nn.init.constant_(self.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(self.adaLN_modulation[-1].bias, 0)
+        else:
+            for block in self.blocks:
+                nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
+                nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
+        # Zero-out output layers:
+        nn.init.constant_(self.out_layer.weight, 0)
+        nn.init.constant_(self.out_layer.bias, 0)
+    def forward(self, x: torch.Tensor, t: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
+        assert [*x.shape] == [x.shape[0], self.in_channels, *[self.resolution] * 3], \
+                f"Input shape mismatch, got {x.shape}, expected {[x.shape[0], self.in_channels, *[self.resolution] * 3]}"
+        h = patchify(x, self.patch_size)
+        h = h.view(*h.shape[:2], -1).permute(0, 2, 1).contiguous()
+        h = self.input_layer(h)
+        h = h + self.pos_emb[None]
+        t_emb = self.t_embedder(t)
+        if self.share_mod:
+            t_emb = self.adaLN_modulation(t_emb)
+        t_emb = t_emb.type(self.dtype)
+        h = h.type(self.dtype)
+        if self.image_attn_mode=='proj':
+            global_cond,proj_cond = cond
+            global_cond = global_cond.type(self.dtype)
+            proj_cond = proj_cond.type(self.dtype)
+            cond = (global_cond, proj_cond)
+        else:
+            cond = cond.type(self.dtype)
+        for block in self.blocks:
+            h = block(h, t_emb, cond)
+        h = h.type(x.dtype)
+        h = F.layer_norm(h, h.shape[-1:])
+        h = self.out_layer(h)
+        h = h.permute(0, 2, 1).view(h.shape[0], h.shape[2], *[self.resolution // self.patch_size] * 3)
+        h = unpatchify(h, self.patch_size).contiguous()
+        return h
+# ===== Align to sparse_dit style: ModelOutput + Denoiser wrapper (Lightning-friendly) =====
+@dataclass
+class DenseDiTModelOutput:
+    """Structured return value of ``DenseDiTDenoiser.forward``."""
+    # Tensor produced by the wrapped DenseDiT model.
+    sample: torch.Tensor
+@pixal3d.register("dense-dit-denoiser")
+class DenseDiTDenoiser(BaseModule):
+    """
+    Config-driven wrapper that builds a ``DenseDiT`` and returns its output as a
+    ``DenseDiTModelOutput``.
+    """
+    @dataclass
+    class Config(BaseModule.Config):
+        # Mirror DenseDiT init signature with reasonable defaults
+        resolution: int = 64
+        in_channels: int = 16
+        model_channels: int = 1024
+        cond_channels: int = 1024
+        out_channels: int = 16
+        num_blocks: int = 24
+        num_heads: Optional[int] = None
+        num_head_channels: Optional[int] = 64
+        mlp_ratio: float = 4.0
+        patch_size: int = 2
+        pe_mode: str = "ape"  # "ape" | "rope"
+        use_fp16: bool = False
+        use_checkpoint: bool = False
+        share_mod: bool = False
+        qk_rms_norm: bool = False
+        qk_rms_norm_cross: bool = False
+        # Annotated as list but defaulted to a tuple; configure() converts it with list().
+        latent_shape: list = (8, 16, 16, 16)
+        image_attn_mode: str = "cross"
+        load_ckpt:bool = True
+    cfg: Config
+    def configure(self) -> None:
+        """Build the wrapped ``DenseDiT`` from ``self.cfg``."""
+        # Instantiate the underlying DenseDiT model
+        self.dit_model = DenseDiT(
+            resolution=self.cfg.resolution,
+            in_channels=self.cfg.in_channels,
+            model_channels=self.cfg.model_channels,
+            cond_channels=self.cfg.cond_channels,
+            out_channels=self.cfg.out_channels,
+            num_blocks=self.cfg.num_blocks,
+            num_heads=self.cfg.num_heads,
+            num_head_channels=self.cfg.num_head_channels,
+            mlp_ratio=self.cfg.mlp_ratio,
+            patch_size=self.cfg.patch_size,
+            pe_mode=self.cfg.pe_mode,
+            use_fp16=self.cfg.use_fp16,
+            use_checkpoint=self.cfg.use_checkpoint,
+            share_mod=self.cfg.share_mod,
+            qk_rms_norm=self.cfg.qk_rms_norm,
+            qk_rms_norm_cross=self.cfg.qk_rms_norm_cross,
+            latent_shape=list(self.cfg.latent_shape) if isinstance(self.cfg.latent_shape, (list, tuple)) else self.cfg.latent_shape,
+            image_attn_mode=self.cfg.image_attn_mode,
+            load_ckpt=self.cfg.load_ckpt,
+        )
+        # For a consistent external API (some systems may read out_channels)
+        self.out_channels = self.cfg.out_channels
+    def forward(
+        self,
+        x: torch.Tensor,
+        t: torch.Tensor,
+        cond: torch.Tensor,
+        **kwargs,
+    ) -> DenseDiTModelOutput:
+        """Forward wrapper returning a structured output like diffusers models.
+        Args:
+            x: [B, C, D, H, W] dense latent tensor.
+            t: [B] or [1] timestep tensor.
+            cond: conditioning tensor matching the transformer blocks' expected dims.
+            **kwargs: accepted and ignored; they are not forwarded to the model.
+        Returns:
+            ``DenseDiTModelOutput`` whose ``sample`` is the DenseDiT output.
+        """
+        out = self.dit_model(x, t, cond)
+        return DenseDiTModelOutput(sample=out)

pixal3d/models/transformers/sparse_dit.py ADDED Viewed

	@@ -0,0 +1,469 @@

+# Some parts of this file are adapted from the SparseDiT implementation
+import os
+from typing import Any, Dict, Optional, Union, Tuple, Literal
+from dataclasses import dataclass
+import numpy as np
+import torch
+from torch import nn
+import torch.nn.functional as F
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.loaders import PeftAdapterMixin
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import logging
+import pixal3d
+from pixal3d.utils.base import BaseModule
+from huggingface_hub import hf_hub_download
+# Import sparse operations
+from ...modules import sparse as sp
+from ...modules.utils import convert_module_to_f16, convert_module_to_f32
+from ...modules.transformer import AbsolutePositionEmbedder
+from ...modules.sparse.transformer.modulated import ModulatedSparseTransformerCrossBlock
+# Hard-coded flag: the sparse imports above are unconditional, so reaching this line
+# means they succeeded. An import failure raises at import time instead of clearing the flag.
+SPARSE_AVAILABLE = True
+# Disabled optional-import fallback, kept for reference:
+# except ImportError:
+    # print("Warning: sparse modules not found. Please ensure it's in your Python path.")
+    # sp = None
+    # convert_module_to_f16 = None
+    # convert_module_to_f32 = None
+    # AbsolutePositionEmbedder = None
+    # ModulatedSparseTransformerCrossBlock = None
+    # SPARSE_AVAILABLE = False
+# Module-level logger obtained from diffusers' logging helper.
+logger = logging.get_logger(__name__)
+@dataclass
+class SparseDiTModelOutput:
+    """Structured return value of ``SparseDiTModel.forward`` when ``return_dict`` is true."""
+    # Output of the final projection layer.
+    sample: Any  # Can be torch.FloatTensor or sp.SparseTensor
+class TimestepEmbedder(nn.Module):
+    """
+    Embeds scalar timesteps into vector representations.
+    """
+    def __init__(self, hidden_size, frequency_embedding_size=256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10000):
+        """
+        Create sinusoidal timestep embeddings.
+        """
+        half = dim // 2
+        freqs = torch.exp(
+            -np.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+        ).to(device=t.device)
+        args = t[:, None].float() * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+        return embedding
+    def forward(self, t):
+        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+        t_freq = t_freq.to(self.mlp[0].weight.dtype)
+        t_emb = self.mlp(t_freq)
+        return t_emb
+class SparseDiTModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
+    """
+    Sparse Diffusion Transformer model for 3D shape generation.
+    This model processes sparse 3D data using sparse attention mechanisms.
+    """
+    _supports_gradient_checkpointing = True
+    @register_to_config
+    def __init__(
+        self,
+        resolution: int = 64,
+        in_channels: int = 16,
+        model_channels: int = 1024,
+        cond_channels: int = 1024,
+        out_channels: int = 16,
+        num_blocks: int = 24,
+        num_heads: int = 32,
+        num_head_channels: int = 64,
+        num_kv_heads: int = 2,
+        compression_block_size: int = 4,
+        selection_block_size: int = 8,
+        topk: int = 32,
+        compression_version: str = 'v2',
+        mlp_ratio: float = 4.0,
+        pe_mode: str = "ape",
+        use_fp16: bool = True,
+        use_checkpoint: bool = True,
+        share_mod: bool = False,
+        qk_rms_norm: bool = True,
+        qk_rms_norm_cross: bool = False,
+        sparse_conditions: bool = True,
+        factor: float = 1.0,
+        window_size: int = 8,
+        use_shift: bool = True,
+        image_attn_mode:str='cross',
+        load_ckpt:bool=True,
+        version:Optional[str]='V10',
+    ):
+        super().__init__()
+        if not SPARSE_AVAILABLE:
+            raise ImportError("sparse modules not found.")
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.cond_channels = cond_channels
+        self.out_channels = out_channels
+        self.num_blocks = num_blocks
+        self.num_heads = num_heads  or model_channels // num_head_channels
+        self.mlp_ratio = mlp_ratio
+        self.pe_mode = pe_mode
+        self.use_fp16 = use_fp16
+        self.use_checkpoint = use_checkpoint
+        self.share_mod = share_mod
+        self.qk_rms_norm = qk_rms_norm
+        self.qk_rms_norm_cross = qk_rms_norm_cross
+        self._dtype = torch.float16 if use_fp16 else torch.float32
+        self.sparse_conditions = sparse_conditions
+        self.factor = factor
+        self.compression_block_size = compression_block_size
+        self.selection_block_size = selection_block_size
+        self.image_attn_mode = image_attn_mode
+        # Timestep embedding
+        self.t_embedder = TimestepEmbedder(model_channels)
+        # Shared modulation if enabled
+        if share_mod:
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(model_channels, 6 * model_channels, bias=True)
+            )
+        # Condition processing for sparse conditions
+        if sparse_conditions:
+            self.cond_proj = sp.SparseLinear(cond_channels, cond_channels)
+            self.pos_embedder_cond = AbsolutePositionEmbedder(model_channels, in_channels=3)
+        # Position embedding
+        if pe_mode == "ape":
+            self.pos_embedder = AbsolutePositionEmbedder(model_channels)
+        # Input projection
+        self.input_layer = sp.SparseLinear(in_channels, model_channels)
+        # Transformer blocks
+        self.blocks = nn.ModuleList([
+            ModulatedSparseTransformerCrossBlock(
+                model_channels,
+                cond_channels,
+                num_heads=self.num_heads,
+                num_kv_heads=num_kv_heads,
+                compression_block_size=compression_block_size,
+                selection_block_size=selection_block_size,
+                topk=topk,
+                mlp_ratio=self.mlp_ratio,
+                attn_mode='full',
+                compression_version=compression_version,
+                use_checkpoint=self.use_checkpoint,
+                use_rope=(pe_mode == "rope"),
+                share_mod=self.share_mod,
+                qk_rms_norm=self.qk_rms_norm,
+                qk_rms_norm_cross=self.qk_rms_norm_cross,
+                resolution=resolution,
+                window_size=window_size,
+                shift_window=window_size // 2 * (i % 2) if use_shift else window_size // 2,
+                image_attn_mode = image_attn_mode,
+            )
+            for i in range(num_blocks)
+        ])
+        # Output projection
+        self.out_layer = sp.SparseLinear(model_channels, out_channels)
+        # Initialize weights
+        self.initialize_weights()
+        self.gradient_checkpointing = False
+        if use_fp16:
+            print("Converting model to float16 ============================")
+            self.convert_to_fp16()
+        # else:
+            # self.convert_to_fp32()
+    @property
+    def device(self) -> torch.device:
+        """Return the device of the model."""
+        return next(self.parameters()).device
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+    def convert_to_fp16(self) -> None:
+        """Convert the model to float16."""
+        self.apply(convert_module_to_f16)
+    def convert_to_fp32(self) -> None:
+        """Convert the model to float32."""
+        self.apply(convert_module_to_f32)
+    def initialize_weights(self) -> None:
+        """Initialize model weights."""
+        # Initialize transformer layers
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+        # Initialize timestep embedding MLP
+        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+        # Zero-out adaLN modulation layers
+        if self.share_mod:
+            nn.init.constant_(self.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(self.adaLN_modulation[-1].bias, 0)
+        else:
+            for block in self.blocks:
+                # if hasattr(block, 'adaLN_modulation'):
+                    nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
+                    nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
+        # Zero-out output layers
+        nn.init.constant_(self.out_layer.weight, 0)
+        nn.init.constant_(self.out_layer.bias, 0)
+    def forward(
+        self,
+        hidden_states: Any,  # sp.SparseTensor
+        timestep: torch.Tensor,
+        encoder_hidden_states: Optional[Any] = None,  # torch.Tensor or sp.SparseTensor
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ) -> Union[SparseDiTModelOutput, Tuple]:
+        """
+        Forward pass of the SparseDiT model.
+        Args:
+            hidden_states: Input sparse tensor
+            timestep: Timestep tensor
+            encoder_hidden_states: Condition tensor (visual/text conditions)
+            attention_kwargs: Additional attention arguments
+            return_dict: Whether to return a dictionary
+        """
+        # breakpoint()
+        # Process input
+        assert attention_kwargs is None, "attention_kwargs not supported in SparseDiT"
+        # breakpoint()
+        h = self.input_layer(hidden_states).type(self._dtype)
+        # Process timestep
+        t_emb = self.t_embedder(timestep)
+        if self.share_mod:
+            t_emb = self.adaLN_modulation(t_emb)
+        t_emb = t_emb.type(self._dtype)
+        # Process conditions
+        cond = encoder_hidden_states
+        if self.image_attn_mode=='proj':
+            global_cond,sparse_cond = cond
+            if sparse_cond is not None:
+                sparse_cond = sparse_cond.type(self._dtype)
+                global_cond = global_cond.type(self._dtype)
+                # breakpoint()
+                if self.sparse_conditions and isinstance(sparse_cond, sp.SparseTensor):
+                    # breakpoint()
+                    sparse_cond = self.cond_proj(sparse_cond)
+                    sparse_cond = sparse_cond + self.pos_embedder_cond(sparse_cond.coords[:, 1:]).type(self._dtype)
+                cond = (global_cond,sparse_cond)
+        else:
+            if self.sparse_conditions:
+                cond = self.cond_proj(cond)
+                cond = cond + self.pos_embedder_cond(cond.coords[:, 1:]).type(self.dtype)
+        # Add positional embeddings
+        if self.pe_mode == "ape":
+            h = h + self.pos_embedder(h.coords[:, 1:], factor=self.factor).type(self._dtype)
+        # Process through transformer blocks
+        for block in self.blocks:
+            if self.training and self.gradient_checkpointing:
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+                    return custom_forward
+                h = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    h, t_emb, cond
+                )
+            else:
+                h = block(h, t_emb, cond)
+        # Final layer norm and output projection
+        h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))
+        h = self.out_layer(h.type(hidden_states.dtype))
+        if not return_dict:
+            return (h,)
+        return SparseDiTModelOutput(sample=h)
+@pixal3d.register("sparse-dit-denoiser")
+class SparseDiTDenoiser(BaseModule):
+    """
+    Sparse DiT Denoiser wrapper for pixal3d framework.
+    ``configure`` builds a ``SparseDiTModel`` from ``Config``, optionally creates
+    condition projectors and optionally loads a checkpoint; ``forward`` delegates
+    to the wrapped model.
+    """
+    @dataclass
+    class Config(BaseModule.Config):
+        # Model architecture
+        resolution: int = 64
+        in_channels: int = 16
+        model_channels: int = 1024
+        cond_channels: int = 1024
+        out_channels: int = 16
+        num_blocks: int = 24
+        num_heads: int = 32
+        num_kv_heads: int = 2
+        compression_block_size: int = 4
+        selection_block_size: int = 8
+        topk: int = 32
+        compression_version: str = 'v2'
+        mlp_ratio: float = 4.0
+        pe_mode: str = "ape"
+        use_fp16: bool = True
+        use_checkpoint: bool = True
+        # NOTE(review): the next two fields are not forwarded by configure(); the model
+        # therefore always uses its own defaults for them - confirm whether that is intended.
+        qk_rms_norm: bool = True
+        qk_rms_norm_cross: bool = False
+        sparse_conditions: bool = True
+        factor: float = 1.0
+        window_size: int = 8
+        use_shift: bool = True
+        # Condition settings
+        use_visual_condition: bool = True
+        visual_condition_dim: int = 1024
+        use_caption_condition: bool = False
+        caption_condition_dim: int = 1024
+        use_label_condition: bool = False
+        label_condition_dim: int = 1024
+        # Training settings
+        pretrained_model_name_or_path: Optional[str] = None
+        image_attn_mode:Optional[str]='cross'
+        load_ckpt:bool =True
+        version:Optional[str]='V10'
+    cfg: Config
+    def configure(self) -> None:
+        """Configure the SparseDiT model."""
+        # Create the core SparseDiT model
+        self.dit_model = SparseDiTModel(
+            resolution=self.cfg.resolution,
+            in_channels=self.cfg.in_channels,
+            model_channels=self.cfg.model_channels,
+            cond_channels=self.cfg.cond_channels,
+            out_channels=self.cfg.out_channels,
+            num_blocks=self.cfg.num_blocks,
+            num_heads=self.cfg.num_heads,
+            num_kv_heads=self.cfg.num_kv_heads,
+            compression_block_size=self.cfg.compression_block_size,
+            selection_block_size=self.cfg.selection_block_size,
+            topk=self.cfg.topk,
+            compression_version=self.cfg.compression_version,
+            mlp_ratio=self.cfg.mlp_ratio,
+            pe_mode=self.cfg.pe_mode,
+            use_fp16=self.cfg.use_fp16,
+            use_checkpoint=self.cfg.use_checkpoint,
+            sparse_conditions=self.cfg.sparse_conditions,
+            factor=self.cfg.factor,
+            window_size=self.cfg.window_size,
+            use_shift=self.cfg.use_shift,
+            image_attn_mode=self.cfg.image_attn_mode,
+            load_ckpt = self.cfg.load_ckpt,
+            version=self.cfg.version,
+        )
+        # Condition projectors
+        # Each projector is created only when its condition is enabled and its dimension
+        # differs from cond_channels. NOTE(review): forward() below never applies them.
+        if self.cfg.use_visual_condition and self.cfg.visual_condition_dim != self.cfg.cond_channels:
+            self.proj_visual_condition = nn.Sequential(
+                nn.RMSNorm(self.cfg.visual_condition_dim),
+                nn.Linear(self.cfg.visual_condition_dim, self.cfg.cond_channels),
+            )
+        if self.cfg.use_caption_condition and self.cfg.caption_condition_dim != self.cfg.cond_channels:
+            self.proj_caption_condition = nn.Sequential(
+                nn.RMSNorm(self.cfg.caption_condition_dim),
+                nn.Linear(self.cfg.caption_condition_dim, self.cfg.cond_channels),
+            )
+        if self.cfg.use_label_condition and self.cfg.label_condition_dim != self.cfg.cond_channels:
+            self.proj_label_condition = nn.Sequential(
+                nn.RMSNorm(self.cfg.label_condition_dim),
+                nn.Linear(self.cfg.label_condition_dim, self.cfg.cond_channels),
+            )
+        # Load pretrained weights if specified
+        if self.cfg.pretrained_model_name_or_path:
+            print(f"Loading pretrained SparseDiT model from {self.cfg.pretrained_model_name_or_path}")
+            # The path is handed straight to torch.load, i.e. it is treated as a local checkpoint file.
+            ckpt = torch.load(
+                self.cfg.pretrained_model_name_or_path,
+                map_location="cpu",
+                weights_only=True,
+            )
+            # Unwrap checkpoints that nest the weights under "state_dict".
+            if "state_dict" in ckpt.keys():
+                ckpt = ckpt["state_dict"]
+            # Loaded into the wrapper (not self.dit_model) with strict key matching.
+            self.load_state_dict(ckpt, strict=True)
+    def forward(
+        self,
+        x: Any,  # sp.SparseTensor
+        t: torch.Tensor,
+        cond: Optional[Any] = None,
+    ):
+        """
+        Forward pass of the denoiser.
+        Args:
+            x: Input sparse tensor, passed to the model as ``hidden_states``.
+            t: Timestep tensor, passed to the model as ``timestep``.
+            cond: Condition, passed unchanged to the model as ``encoder_hidden_states``.
+        Returns:
+            Whatever the wrapped ``SparseDiTModel`` returns with its default
+            ``return_dict=True``, i.e. a ``SparseDiTModelOutput``.
+        """
+        output = self.dit_model(
+            hidden_states=x,
+            timestep=t,
+            encoder_hidden_states=cond,
+        )
+        return output

pixal3d/modules/__pycache__/norm.cpython-310.pyc ADDED Viewed

Binary file (1.43 kB). View file

pixal3d/modules/__pycache__/spatial.cpython-310.pyc ADDED Viewed

Binary file (2.49 kB). View file

pixal3d/modules/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (1.55 kB). View file

pixal3d/modules/attention/__init__.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from typing import *
+BACKEND = 'flash_attn'
+DEBUG = False
+def __from_env():
+    import os
+    global BACKEND
+    global DEBUG
+    env_attn_backend = os.environ.get('ATTN_BACKEND')
+    env_sttn_debug = os.environ.get('ATTN_DEBUG')
+    if env_attn_backend is not None and env_attn_backend in ['xformers', 'flash_attn', 'sdpa', 'naive']:
+        BACKEND = env_attn_backend
+    if env_sttn_debug is not None:
+        DEBUG = env_sttn_debug == '1'
+    print(f"[ATTENTION] Using backend: {BACKEND}")
+__from_env()
+def set_backend(backend: Literal['xformers', 'flash_attn', 'sdpa', 'naive']):
+    """
+    Overwrite the module-level ``BACKEND`` with ``backend``; the value is not validated.
+    NOTE(review): full_attn.py binds the value with ``from . import DEBUG, BACKEND`` and
+    imports the backend library when it is first loaded, so a call made after that
+    presumably does not change which kernel is dispatched - confirm.
+    """
+    global BACKEND
+    BACKEND = backend
+def set_debug(debug: bool):
+    """Overwrite the module-level ``DEBUG`` flag with ``debug``."""
+    global DEBUG
+    DEBUG = debug
+from .full_attn import *
+from .modules import *

pixal3d/modules/attention/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (940 Bytes). View file

pixal3d/modules/attention/__pycache__/full_attn.cpython-310.pyc ADDED Viewed

Binary file (4.15 kB). View file

pixal3d/modules/attention/__pycache__/modules.cpython-310.pyc ADDED Viewed

Binary file (6.21 kB). View file

pixal3d/modules/attention/full_attn.py ADDED Viewed

	@@ -0,0 +1,140 @@

+from typing import *
+import torch
+import math
+from . import DEBUG, BACKEND
+if BACKEND == 'xformers':
+    import xformers.ops as xops
+elif BACKEND == 'flash_attn':
+    import flash_attn
+elif BACKEND == 'sdpa':
+    from torch.nn.functional import scaled_dot_product_attention as sdpa
+elif BACKEND == 'naive':
+    pass
+else:
+    raise ValueError(f"Unknown attention backend: {BACKEND}")
+__all__ = [
+    'scaled_dot_product_attention',
+]
+def _naive_sdpa(q, k, v):
+    """
+    Naive implementation of scaled dot product attention.
+    """
+    q = q.permute(0, 2, 1, 3)   # [N, H, L, C]
+    k = k.permute(0, 2, 1, 3)   # [N, H, L, C]
+    v = v.permute(0, 2, 1, 3)   # [N, H, L, C]
+    scale_factor = 1 / math.sqrt(q.size(-1))
+    attn_weight = q @ k.transpose(-2, -1) * scale_factor
+    attn_weight = torch.softmax(attn_weight, dim=-1)
+    out = attn_weight @ v
+    out = out.permute(0, 2, 1, 3)   # [N, L, H, C]
+    return out
+@overload
+def scaled_dot_product_attention(qkv: torch.Tensor) -> torch.Tensor:
+    """
+    Apply scaled dot product attention.
+    Args:
+        qkv (torch.Tensor): A [N, L, 3, H, C] tensor containing Qs, Ks, and Vs.
+    """
+    ...
+@overload
+def scaled_dot_product_attention(q: torch.Tensor, kv: torch.Tensor) -> torch.Tensor:
+    """
+    Apply scaled dot product attention.
+    Args:
+        q (torch.Tensor): A [N, L, H, C] tensor containing Qs.
+        kv (torch.Tensor): A [N, L, 2, H, C] tensor containing Ks and Vs.
+    """
+    ...
+@overload
+def scaled_dot_product_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
+    """
+    Apply scaled dot product attention.
+    Args:
+        q (torch.Tensor): A [N, L, H, Ci] tensor containing Qs.
+        k (torch.Tensor): A [N, L, H, Ci] tensor containing Ks.
+        v (torch.Tensor): A [N, L, H, Co] tensor containing Vs.
+    Note:
+        k and v are assumed to have the same coordinate map.
+    """
+    ...
+def scaled_dot_product_attention(*args, **kwargs):
+    arg_names_dict = {
+        1: ['qkv'],
+        2: ['q', 'kv'],
+        3: ['q', 'k', 'v']
+    }
+    num_all_args = len(args) + len(kwargs)
+    assert num_all_args in arg_names_dict, f"Invalid number of arguments, got {num_all_args}, expected 1, 2, or 3"
+    for key in arg_names_dict[num_all_args][len(args):]:
+        assert key in kwargs, f"Missing argument {key}"
+    if num_all_args == 1:
+        qkv = args[0] if len(args) > 0 else kwargs['qkv']
+        assert len(qkv.shape) == 5 and qkv.shape[2] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, L, 3, H, C]"
+        device = qkv.device
+    elif num_all_args == 2:
+        q = args[0] if len(args) > 0 else kwargs['q']
+        kv = args[1] if len(args) > 1 else kwargs['kv']
+        assert q.shape[0] == kv.shape[0], f"Batch size mismatch, got {q.shape[0]} and {kv.shape[0]}"
+        assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, C]"
+        assert len(kv.shape) == 5, f"Invalid shape for kv, got {kv.shape}, expected [N, L, 2, H, C]"
+        device = q.device
+    elif num_all_args == 3:
+        q = args[0] if len(args) > 0 else kwargs['q']
+        k = args[1] if len(args) > 1 else kwargs['k']
+        v = args[2] if len(args) > 2 else kwargs['v']
+        assert q.shape[0] == k.shape[0] == v.shape[0], f"Batch size mismatch, got {q.shape[0]}, {k.shape[0]}, and {v.shape[0]}"
+        assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, Ci]"
+        assert len(k.shape) == 4, f"Invalid shape for k, got {k.shape}, expected [N, L, H, Ci]"
+        assert len(v.shape) == 4, f"Invalid shape for v, got {v.shape}, expected [N, L, H, Co]"
+        device = q.device
+    if BACKEND == 'xformers':
+        if num_all_args == 1:
+            q, k, v = qkv.unbind(dim=2)
+        elif num_all_args == 2:
+            k, v = kv.unbind(dim=2)
+        out = xops.memory_efficient_attention(q, k, v)
+    elif BACKEND == 'flash_attn':
+        if num_all_args == 1:
+            out = flash_attn.flash_attn_qkvpacked_func(qkv)
+        elif num_all_args == 2:
+            out = flash_attn.flash_attn_kvpacked_func(q, kv)
+        elif num_all_args == 3:
+            out = flash_attn.flash_attn_func(q, k, v)
+    elif BACKEND == 'sdpa':
+        if num_all_args == 1:
+            q, k, v = qkv.unbind(dim=2)
+        elif num_all_args == 2:
+            k, v = kv.unbind(dim=2)
+        q = q.permute(0, 2, 1, 3)   # [N, H, L, C]
+        k = k.permute(0, 2, 1, 3)   # [N, H, L, C]
+        v = v.permute(0, 2, 1, 3)   # [N, H, L, C]
+        out = sdpa(q, k, v)         # [N, H, L, C]
+        out = out.permute(0, 2, 1, 3)   # [N, L, H, C]
+    elif BACKEND == 'naive':
+        if num_all_args == 1:
+            q, k, v = qkv.unbind(dim=2)
+        elif num_all_args == 2:
+            k, v = kv.unbind(dim=2)
+        out = _naive_sdpa(q, k, v)
+    else:
+        raise ValueError(f"Unknown attention module: {BACKEND}")
+    return out

pixal3d/modules/attention/modules.py ADDED Viewed

	@@ -0,0 +1,164 @@

+from typing import *
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .full_attn import scaled_dot_product_attention
+class MultiHeadRMSNorm(nn.Module):
+    def __init__(self, dim: int, heads: int):
+        super().__init__()
+        self.scale = dim ** 0.5
+        self.gamma = nn.Parameter(torch.ones(heads, dim))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return (F.normalize(x.float(), dim = -1) * self.gamma * self.scale).to(x.dtype)
+class RotaryPositionEmbedder(nn.Module):
+    def __init__(self, hidden_size: int, in_channels: int = 3):
+        super().__init__()
+        assert hidden_size % 2 == 0, "Hidden size must be divisible by 2"
+        self.hidden_size = hidden_size
+        self.in_channels = in_channels
+        self.freq_dim = hidden_size // in_channels // 2
+        self.freqs = torch.arange(self.freq_dim, dtype=torch.float32) / self.freq_dim
+        self.freqs = 1.0 / (10000 ** self.freqs)
+    def _get_phases(self, indices: torch.Tensor) -> torch.Tensor:
+        self.freqs = self.freqs.to(indices.device)
+        phases = torch.outer(indices, self.freqs)
+        phases = torch.polar(torch.ones_like(phases), phases)
+        return phases
+    def _rotary_embedding(self, x: torch.Tensor, phases: torch.Tensor) -> torch.Tensor:
+        x_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
+        x_rotated = x_complex * phases
+        x_embed = torch.view_as_real(x_rotated).reshape(*x_rotated.shape[:-1], -1).to(x.dtype)
+        return x_embed
+    def forward(self, q: torch.Tensor, k: torch.Tensor, indices: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            q (sp.SparseTensor): [..., N, D] tensor of queries
+            k (sp.SparseTensor): [..., N, D] tensor of keys
+            indices (torch.Tensor): [..., N, C] tensor of spatial positions
+        """
+        if indices is None:
+            indices = torch.arange(q.shape[-2], device=q.device)
+            if len(q.shape) > 2:
+                indices = indices.unsqueeze(0).expand(q.shape[:-2] + (-1,))
+        phases = self._get_phases(indices.reshape(-1)).reshape(*indices.shape[:-1], -1)
+        if phases.shape[1] < self.hidden_size // 2:
+            phases = torch.cat([phases, torch.polar(
+                torch.ones(*phases.shape[:-1], self.hidden_size // 2 - phases.shape[1], device=phases.device),
+                torch.zeros(*phases.shape[:-1], self.hidden_size // 2 - phases.shape[1], device=phases.device)
+            )], dim=-1)
+        q_embed = self._rotary_embedding(q, phases)
+        k_embed = self._rotary_embedding(k, phases)
+        return q_embed, k_embed
+class MultiHeadAttention(nn.Module):
+    def __init__(
+        self,
+        channels: int,
+        num_heads: int,
+        ctx_channels: Optional[int]=None,
+        type: Literal["self", "cross"] = "self",
+        attn_mode: Literal["full", "windowed"] = "full",
+        window_size: Optional[int] = None,
+        shift_window: Optional[Tuple[int, int, int]] = None,
+        qkv_bias: bool = True,
+        use_rope: bool = False,
+        qk_rms_norm: bool = False,
+    ):
+        super().__init__()
+        assert channels % num_heads == 0
+        assert type in ["self", "cross"], f"Invalid attention type: {type}"
+        assert attn_mode in ["full", "windowed"], f"Invalid attention mode: {attn_mode}"
+        assert type == "self" or attn_mode == "full", "Cross-attention only supports full attention"
+        if attn_mode == "windowed":
+            raise NotImplementedError("Windowed attention is not yet implemented")
+        self.channels = channels
+        self.head_dim = channels // num_heads
+        self.ctx_channels = ctx_channels if ctx_channels is not None else channels
+        self.num_heads = num_heads
+        self._type = type
+        self.attn_mode = attn_mode
+        self.window_size = window_size
+        self.shift_window = shift_window
+        self.use_rope = use_rope
+        self.qk_rms_norm = qk_rms_norm
+        if self._type == "self":
+            self.to_qkv = nn.Linear(channels, channels * 3, bias=qkv_bias)
+        else:
+            self.to_q = nn.Linear(channels, channels, bias=qkv_bias)
+            self.to_kv = nn.Linear(self.ctx_channels, channels * 2, bias=qkv_bias)
+        if self.qk_rms_norm:
+            self.q_rms_norm = MultiHeadRMSNorm(self.head_dim, num_heads)
+            self.k_rms_norm = MultiHeadRMSNorm(self.head_dim, num_heads)
+        self.to_out = nn.Linear(channels, channels)
+        if use_rope:
+            self.rope = RotaryPositionEmbedder(channels)
+    def forward(self, x: torch.Tensor, context: Optional[torch.Tensor] = None, indices: Optional[torch.Tensor] = None) -> torch.Tensor:
+        B, L, C = x.shape
+        if self._type == "self":
+            qkv = self.to_qkv(x)
+            qkv = qkv.reshape(B, L, 3, self.num_heads, -1)
+            if self.use_rope:
+                q, k, v = qkv.unbind(dim=2)
+                q, k = self.rope(q, k, indices)
+                qkv = torch.stack([q, k, v], dim=2)
+            if self.attn_mode == "full":
+                if self.qk_rms_norm:
+                    q, k, v = qkv.unbind(dim=2)
+                    q = self.q_rms_norm(q)
+                    k = self.k_rms_norm(k)
+                    h = scaled_dot_product_attention(q, k, v)
+                else:
+                    h = scaled_dot_product_attention(qkv)
+            elif self.attn_mode == "windowed":
+                raise NotImplementedError("Windowed attention is not yet implemented")
+        else:
+            Lkv = context.shape[1]
+            q = self.to_q(x)
+            kv = self.to_kv(context)
+            q = q.reshape(B, L, self.num_heads, -1)
+            kv = kv.reshape(B, Lkv, 2, self.num_heads, -1)
+            if self.qk_rms_norm:
+                q = self.q_rms_norm(q)
+                k, v = kv.unbind(dim=2)
+                k = self.k_rms_norm(k)
+                h = scaled_dot_product_attention(q, k, v)
+            else:
+                h = scaled_dot_product_attention(q, kv)
+        h = h.reshape(B, L, -1)
+        h = self.to_out(h)
+        return h
+class ProjectAttention(nn.Module):
+    def __init__(self,cross_attn_block: nn.Module):
+        super().__init__()
+        self.cross_attn_block = cross_attn_block
+        self.global_token_length = 5
+    def forward(self, x: torch.Tensor, context: Optional[torch.Tensor] = None) -> torch.Tensor:
+        global_context = context[0]
+        proj_context = context[1]
+        global_context = self.cross_attn_block(x, global_context)
+        context = proj_context + global_context
+        return context + x

pixal3d/modules/norm.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import torch
+import torch.nn as nn
+class LayerNorm32(nn.LayerNorm):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return super().forward(x.float()).type(x.dtype)
+class GroupNorm32(nn.GroupNorm):
+    """
+    A GroupNorm layer that converts to float32 before the forward pass.
+    """
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return super().forward(x.float()).type(x.dtype)
+class ChannelLayerNorm32(LayerNorm32):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        DIM = x.dim()
+        x = x.permute(0, *range(2, DIM), 1).contiguous()
+        x = super().forward(x)
+        x = x.permute(0, DIM-1, *range(1, DIM-1)).contiguous()
+        return x

pixal3d/modules/sparse/__init__.py ADDED Viewed

	@@ -0,0 +1,105 @@

+from typing import *
+# Module-level defaults. __from_env() may override them from environment
+# variables at import time; set_backend/set_debug/set_attn override them later.
+# Sparse backend name; __from_env accepts 'spconv' or 'torchsparse'.
+BACKEND = 'torchsparse'
+# Debug flag; __from_env sets it to True only when SPARSE_DEBUG is exactly '1'.
+DEBUG = False
+# Attention backend name; __from_env accepts 'xformers' or 'flash_attn'.
+ATTN = 'flash_attn'
+def __from_env():
+    import os
+    global BACKEND
+    global DEBUG
+    global ATTN
+    env_sparse_backend = os.environ.get('SPARSE_BACKEND')
+    env_sparse_debug = os.environ.get('SPARSE_DEBUG')
+    env_sparse_attn = os.environ.get('SPARSE_ATTN_BACKEND')
+    if env_sparse_attn is None:
+        env_sparse_attn = os.environ.get('ATTN_BACKEND')
+    if env_sparse_backend is not None and env_sparse_backend in ['spconv', 'torchsparse']:
+        BACKEND = env_sparse_backend
+    if env_sparse_debug is not None:
+        DEBUG = env_sparse_debug == '1'
+    if env_sparse_attn is not None and env_sparse_attn in ['xformers', 'flash_attn']:
+        ATTN = env_sparse_attn
+    print(f"[SPARSE] Backend: {BACKEND}, Attention: {ATTN}")
+__from_env()
+def set_backend(backend: Literal['spconv', 'torchsparse']):
+    """
+    Set the module-level BACKEND to ``backend``.
+
+    The value is stored as given; no validation is performed here.
+    """
+    global BACKEND
+    BACKEND = backend
+def set_debug(debug: bool):
+    """
+    Set the module-level DEBUG flag to ``debug``.
+    """
+    global DEBUG
+    DEBUG = debug
+def set_attn(attn: Literal['xformers', 'flash_attn']):
+    """
+    Set the module-level ATTN to ``attn``.
+
+    The value is stored as given; no validation is performed here.
+    """
+    global ATTN
+    ATTN = attn
+import importlib
+# Lazy-export table: maps each public name to the submodule of this package
+# that defines it. The module-level __getattr__ imports that submodule the
+# first time the name is requested.
+__attributes = {
+    'SparseTensor': 'basic',
+    'sparse_batch_broadcast': 'basic',
+    'sparse_batch_op': 'basic',
+    'sparse_cat': 'basic',
+    'sparse_unbind': 'basic',
+    'SparseGroupNorm': 'norm',
+    'SparseLayerNorm': 'norm',
+    'SparseGroupNorm32': 'norm',
+    'SparseLayerNorm32': 'norm',
+    'SparseSigmoid': 'nonlinearity',
+    'SparseReLU': 'nonlinearity',
+    'SparseSiLU': 'nonlinearity',
+    'SparseGELU': 'nonlinearity',
+    'SparseTanh': 'nonlinearity',
+    'SparseActivation': 'nonlinearity',
+    'SparseLinear': 'linear',
+    'sparse_scaled_dot_product_attention': 'attention',
+    'SerializeMode': 'attention',
+    'sparse_serialized_scaled_dot_product_self_attention': 'attention',
+    'sparse_windowed_scaled_dot_product_self_attention': 'attention',
+    'SparseMultiHeadAttention': 'attention',
+    'SparseConv3d': 'conv',
+    'SparseInverseConv3d': 'conv',
+    'sparseconv3d_func': 'conv',
+    'SparseDownsample': 'spatial',
+    'SparseUpsample': 'spatial',
+    'SparseSubdivide' : 'spatial'
+}
+# Submodules that __getattr__ imports and exposes under their own name.
+__submodules = ['transformer']
+# Public API: every lazily exported attribute plus the lazily imported submodules.
+__all__ = list(__attributes.keys()) + __submodules
+def __getattr__(name):
+    if name not in globals():
+        if name in __attributes:
+            module_name = __attributes[name]
+            module = importlib.import_module(f".{module_name}", __name__)
+            globals()[name] = getattr(module, name)
+        elif name in __submodules:
+            module = importlib.import_module(f".{name}", __name__)
+            globals()[name] = module
+        else:
+            raise AttributeError(f"module {__name__} has no attribute {name}")
+    return globals()[name]
+# For Pylance
+if __name__ == '__main__':
+    from .basic import *
+    from .norm import *
+    from .nonlinearity import *
+    from .linear import *
+    from .attention import *
+    from .conv import *
+    from .spatial import *
+    import transformer

pixal3d/modules/sparse/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (2.64 kB). View file

pixal3d/modules/sparse/__pycache__/basic.cpython-310.pyc ADDED Viewed

Binary file (15.2 kB). View file

pixal3d/modules/sparse/__pycache__/linear.cpython-310.pyc ADDED Viewed

Binary file (884 Bytes). View file

pixal3d/modules/sparse/__pycache__/nonlinearity.cpython-310.pyc ADDED Viewed

Binary file (2.17 kB). View file

pixal3d/modules/sparse/__pycache__/norm.cpython-310.pyc ADDED Viewed

Binary file (2.7 kB). View file

pixal3d/modules/sparse/__pycache__/spatial.cpython-310.pyc ADDED Viewed

Binary file (5.11 kB). View file

pixal3d/modules/sparse/attention/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .full_attn import *
+from .serialized_attn import *
+from .windowed_attn import *
+from .modules import *
+from .spatial_sparse_attention.module.spatial_sparse_attention import SpatialSparseAttention

pixal3d/modules/sparse/attention/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (391 Bytes). View file

pixal3d/modules/sparse/attention/__pycache__/full_attn.cpython-310.pyc ADDED Viewed

Binary file (7.3 kB). View file

pixal3d/modules/sparse/attention/__pycache__/modules.cpython-310.pyc ADDED Viewed

Binary file (5.86 kB). View file