JasonYinnnn commited on
Commit
afea36f
·
1 Parent(s): 3d533e5
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +408 -0
  2. .gitmodules +3 -0
  3. README.md +3 -3
  4. app.py +857 -4
  5. requirements.txt +33 -0
  6. scripts/grounding_sam.py +371 -0
  7. scripts/grounding_sam2.py +353 -0
  8. threeDFixer/__init__.py +11 -0
  9. threeDFixer/datasets/__init__.py +107 -0
  10. threeDFixer/datasets/utils.py +631 -0
  11. threeDFixer/models/__init__.py +123 -0
  12. threeDFixer/models/scene_sparse_structure_flow.py +334 -0
  13. threeDFixer/models/scene_structured_latent_flow.py +415 -0
  14. threeDFixer/models/sparse_elastic_mixin.py +29 -0
  15. threeDFixer/models/sparse_structure_flow.py +219 -0
  16. threeDFixer/models/sparse_structure_vae.py +325 -0
  17. threeDFixer/models/structured_latent_flow.py +295 -0
  18. threeDFixer/models/structured_latent_vae/__init__.py +9 -0
  19. threeDFixer/models/structured_latent_vae/base.py +122 -0
  20. threeDFixer/models/structured_latent_vae/decoder_gs.py +150 -0
  21. threeDFixer/models/structured_latent_vae/decoder_mesh.py +189 -0
  22. threeDFixer/models/structured_latent_vae/decoder_rf.py +118 -0
  23. threeDFixer/models/structured_latent_vae/encoder.py +93 -0
  24. threeDFixer/modules/attention/__init__.py +41 -0
  25. threeDFixer/modules/attention/full_attn.py +145 -0
  26. threeDFixer/modules/attention/modules.py +151 -0
  27. threeDFixer/modules/norm.py +30 -0
  28. threeDFixer/modules/sparse/__init__.py +102 -0
  29. threeDFixer/modules/sparse/attention/__init__.py +9 -0
  30. threeDFixer/modules/sparse/attention/full_attn.py +220 -0
  31. threeDFixer/modules/sparse/attention/modules.py +144 -0
  32. threeDFixer/modules/sparse/attention/serialized_attn.py +198 -0
  33. threeDFixer/modules/sparse/attention/windowed_attn.py +140 -0
  34. threeDFixer/modules/sparse/basic.py +464 -0
  35. threeDFixer/modules/sparse/conv/__init__.py +26 -0
  36. threeDFixer/modules/sparse/conv/conv_spconv.py +85 -0
  37. threeDFixer/modules/sparse/conv/conv_torchsparse.py +43 -0
  38. threeDFixer/modules/sparse/linear.py +20 -0
  39. threeDFixer/modules/sparse/nonlinearity.py +40 -0
  40. threeDFixer/modules/sparse/norm.py +63 -0
  41. threeDFixer/modules/sparse/spatial.py +115 -0
  42. threeDFixer/modules/sparse/transformer/__init__.py +7 -0
  43. threeDFixer/modules/sparse/transformer/blocks.py +156 -0
  44. threeDFixer/modules/sparse/transformer/modulated.py +304 -0
  45. threeDFixer/modules/spatial.py +53 -0
  46. threeDFixer/modules/transformer/__init__.py +2 -0
  47. threeDFixer/modules/transformer/blocks.py +187 -0
  48. threeDFixer/modules/transformer/modulated.py +289 -0
  49. threeDFixer/modules/utils.py +59 -0
  50. threeDFixer/moge/__init__.py +5 -0
.gitignore ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Ignore Visual Studio temporary files, build results, and
2
+ ## files generated by popular Visual Studio add-ons.
3
+ ##
4
+ ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
5
+
6
+ # User-specific files
7
+ *.rsuser
8
+ *.suo
9
+ *.user
10
+ *.userosscache
11
+ *.sln.docstates
12
+
13
+ # User-specific files (MonoDevelop/Xamarin Studio)
14
+ *.userprefs
15
+
16
+ # Mono auto generated files
17
+ mono_crash.*
18
+
19
+ # Build results
20
+ [Dd]ebug/
21
+ [Dd]ebugPublic/
22
+ [Rr]elease/
23
+ [Rr]eleases/
24
+ x64/
25
+ x86/
26
+ [Ww][Ii][Nn]32/
27
+ [Aa][Rr][Mm]/
28
+ [Aa][Rr][Mm]64/
29
+ bld/
30
+ [Bb]in/
31
+ [Oo]bj/
32
+ [Ll]og/
33
+ [Ll]ogs/
34
+
35
+ # Visual Studio 2015/2017 cache/options directory
36
+ .vs/
37
+ # Uncomment if you have tasks that create the project's static files in wwwroot
38
+ #wwwroot/
39
+
40
+ # Visual Studio 2017 auto generated files
41
+ Generated\ Files/
42
+
43
+ # MSTest test Results
44
+ [Tt]est[Rr]esult*/
45
+ [Bb]uild[Ll]og.*
46
+
47
+ # NUnit
48
+ *.VisualState.xml
49
+ TestResult.xml
50
+ nunit-*.xml
51
+
52
+ # Build Results of an ATL Project
53
+ [Dd]ebugPS/
54
+ [Rr]eleasePS/
55
+ dlldata.c
56
+
57
+ # Benchmark Results
58
+ BenchmarkDotNet.Artifacts/
59
+
60
+ # .NET Core
61
+ project.lock.json
62
+ project.fragment.lock.json
63
+ artifacts/
64
+
65
+ # ASP.NET Scaffolding
66
+ ScaffoldingReadMe.txt
67
+
68
+ # StyleCop
69
+ StyleCopReport.xml
70
+
71
+ # Files built by Visual Studio
72
+ *_i.c
73
+ *_p.c
74
+ *_h.h
75
+ *.ilk
76
+ *.meta
77
+ *.obj
78
+ *.iobj
79
+ *.pch
80
+ *.pdb
81
+ *.ipdb
82
+ *.pgc
83
+ *.pgd
84
+ *.rsp
85
+ *.sbr
86
+ *.tlb
87
+ *.tli
88
+ *.tlh
89
+ *.tmp
90
+ *.tmp_proj
91
+ *_wpftmp.csproj
92
+ *.log
93
+ *.tlog
94
+ *.vspscc
95
+ *.vssscc
96
+ .builds
97
+ *.pidb
98
+ *.svclog
99
+ *.scc
100
+
101
+ # Chutzpah Test files
102
+ _Chutzpah*
103
+
104
+ # Visual C++ cache files
105
+ ipch/
106
+ *.aps
107
+ *.ncb
108
+ *.opendb
109
+ *.opensdf
110
+ *.sdf
111
+ *.cachefile
112
+ *.VC.db
113
+ *.VC.VC.opendb
114
+
115
+ # Visual Studio profiler
116
+ *.psess
117
+ *.vsp
118
+ *.vspx
119
+ *.sap
120
+
121
+ # Visual Studio Trace Files
122
+ *.e2e
123
+
124
+ # TFS 2012 Local Workspace
125
+ $tf/
126
+
127
+ # Guidance Automation Toolkit
128
+ *.gpState
129
+
130
+ # ReSharper is a .NET coding add-in
131
+ _ReSharper*/
132
+ *.[Rr]e[Ss]harper
133
+ *.DotSettings.user
134
+
135
+ # TeamCity is a build add-in
136
+ _TeamCity*
137
+
138
+ # DotCover is a Code Coverage Tool
139
+ *.dotCover
140
+
141
+ # AxoCover is a Code Coverage Tool
142
+ .axoCover/*
143
+ !.axoCover/settings.json
144
+
145
+ # Coverlet is a free, cross platform Code Coverage Tool
146
+ coverage*.json
147
+ coverage*.xml
148
+ coverage*.info
149
+
150
+ # Visual Studio code coverage results
151
+ *.coverage
152
+ *.coveragexml
153
+
154
+ # NCrunch
155
+ _NCrunch_*
156
+ .*crunch*.local.xml
157
+ nCrunchTemp_*
158
+
159
+ # MightyMoose
160
+ *.mm.*
161
+ AutoTest.Net/
162
+
163
+ # Web workbench (sass)
164
+ .sass-cache/
165
+
166
+ # Installshield output folder
167
+ [Ee]xpress/
168
+
169
+ # DocProject is a documentation generator add-in
170
+ DocProject/buildhelp/
171
+ DocProject/Help/*.HxT
172
+ DocProject/Help/*.HxC
173
+ DocProject/Help/*.hhc
174
+ DocProject/Help/*.hhk
175
+ DocProject/Help/*.hhp
176
+ DocProject/Help/Html2
177
+ DocProject/Help/html
178
+
179
+ # Click-Once directory
180
+ publish/
181
+
182
+ # Publish Web Output
183
+ *.[Pp]ublish.xml
184
+ *.azurePubxml
185
+ # Note: Comment the next line if you want to checkin your web deploy settings,
186
+ # but database connection strings (with potential passwords) will be unencrypted
187
+ *.pubxml
188
+ *.publishproj
189
+
190
+ # Microsoft Azure Web App publish settings. Comment the next line if you want to
191
+ # checkin your Azure Web App publish settings, but sensitive information contained
192
+ # in these scripts will be unencrypted
193
+ PublishScripts/
194
+
195
+ # NuGet Packages
196
+ *.nupkg
197
+ # NuGet Symbol Packages
198
+ *.snupkg
199
+ # The packages folder can be ignored because of Package Restore
200
+ **/[Pp]ackages/*
201
+ # except build/, which is used as an MSBuild target.
202
+ !**/[Pp]ackages/build/
203
+ # Uncomment if necessary however generally it will be regenerated when needed
204
+ #!**/[Pp]ackages/repositories.config
205
+ # NuGet v3's project.json files produces more ignorable files
206
+ *.nuget.props
207
+ *.nuget.targets
208
+
209
+ # Microsoft Azure Build Output
210
+ csx/
211
+ *.build.csdef
212
+
213
+ # Microsoft Azure Emulator
214
+ ecf/
215
+ rcf/
216
+
217
+ # Windows Store app package directories and files
218
+ AppPackages/
219
+ BundleArtifacts/
220
+ Package.StoreAssociation.xml
221
+ _pkginfo.txt
222
+ *.appx
223
+ *.appxbundle
224
+ *.appxupload
225
+
226
+ # Visual Studio cache files
227
+ # files ending in .cache can be ignored
228
+ *.[Cc]ache
229
+ # but keep track of directories ending in .cache
230
+ !?*.[Cc]ache/
231
+
232
+ # Others
233
+ ClientBin/
234
+ ~$*
235
+ *~
236
+ *.dbmdl
237
+ *.dbproj.schemaview
238
+ *.jfm
239
+ *.pfx
240
+ *.publishsettings
241
+ orleans.codegen.cs
242
+
243
+ # Including strong name files can present a security risk
244
+ # (https://github.com/github/gitignore/pull/2483#issue-259490424)
245
+ #*.snk
246
+
247
+ # Since there are multiple workflows, uncomment next line to ignore bower_components
248
+ # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
249
+ #bower_components/
250
+
251
+ # RIA/Silverlight projects
252
+ Generated_Code/
253
+
254
+ # Backup & report files from converting an old project file
255
+ # to a newer Visual Studio version. Backup files are not needed,
256
+ # because we have git ;-)
257
+ _UpgradeReport_Files/
258
+ Backup*/
259
+ UpgradeLog*.XML
260
+ UpgradeLog*.htm
261
+ ServiceFabricBackup/
262
+ *.rptproj.bak
263
+
264
+ # SQL Server files
265
+ *.mdf
266
+ *.ldf
267
+ *.ndf
268
+
269
+ # Business Intelligence projects
270
+ *.rdl.data
271
+ *.bim.layout
272
+ *.bim_*.settings
273
+ *.rptproj.rsuser
274
+ *- [Bb]ackup.rdl
275
+ *- [Bb]ackup ([0-9]).rdl
276
+ *- [Bb]ackup ([0-9][0-9]).rdl
277
+
278
+ # Microsoft Fakes
279
+ FakesAssemblies/
280
+
281
+ # GhostDoc plugin setting file
282
+ *.GhostDoc.xml
283
+
284
+ # Node.js Tools for Visual Studio
285
+ .ntvs_analysis.dat
286
+ node_modules/
287
+
288
+ # Visual Studio 6 build log
289
+ *.plg
290
+
291
+ # Visual Studio 6 workspace options file
292
+ *.opt
293
+
294
+ # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
295
+ *.vbw
296
+
297
+ # Visual Studio 6 auto-generated project file (contains which files were open etc.)
298
+ *.vbp
299
+
300
+ # Visual Studio 6 workspace and project file (working project files containing files to include in project)
301
+ *.dsw
302
+ *.dsp
303
+
304
+ # Visual Studio 6 technical files
305
+ *.ncb
306
+ *.aps
307
+
308
+ # Visual Studio LightSwitch build output
309
+ **/*.HTMLClient/GeneratedArtifacts
310
+ **/*.DesktopClient/GeneratedArtifacts
311
+ **/*.DesktopClient/ModelManifest.xml
312
+ **/*.Server/GeneratedArtifacts
313
+ **/*.Server/ModelManifest.xml
314
+ _Pvt_Extensions
315
+
316
+ # Paket dependency manager
317
+ .paket/paket.exe
318
+ paket-files/
319
+
320
+ # FAKE - F# Make
321
+ .fake/
322
+
323
+ # CodeRush personal settings
324
+ .cr/personal
325
+
326
+ # Python Tools for Visual Studio (PTVS)
327
+ __pycache__/
328
+ *.pyc
329
+
330
+ # Cake - Uncomment if you are using it
331
+ # tools/**
332
+ # !tools/packages.config
333
+
334
+ # Tabs Studio
335
+ *.tss
336
+
337
+ # Telerik's JustMock configuration file
338
+ *.jmconfig
339
+
340
+ # BizTalk build output
341
+ *.btp.cs
342
+ *.btm.cs
343
+ *.odx.cs
344
+ *.xsd.cs
345
+
346
+ # OpenCover UI analysis results
347
+ OpenCover/
348
+
349
+ # Azure Stream Analytics local run output
350
+ ASALocalRun/
351
+
352
+ # MSBuild Binary and Structured Log
353
+ *.binlog
354
+
355
+ # NVidia Nsight GPU debugger configuration file
356
+ *.nvuser
357
+
358
+ # MFractors (Xamarin productivity tool) working folder
359
+ .mfractor/
360
+
361
+ # Local History for Visual Studio
362
+ .localhistory/
363
+
364
+ # Visual Studio History (VSHistory) files
365
+ .vshistory/
366
+
367
+ # BeatPulse healthcheck temp database
368
+ healthchecksdb
369
+
370
+ # Backup folder for Package Reference Convert tool in Visual Studio 2017
371
+ MigrationBackup/
372
+
373
+ # Ionide (cross platform F# VS Code tools) working folder
374
+ .ionide/
375
+
376
+ # Fody - auto-generated XML schema
377
+ FodyWeavers.xsd
378
+
379
+ # VS Code files for those working on multiple tools
380
+ .vscode/*
381
+ !.vscode/settings.json
382
+ !.vscode/tasks.json
383
+ !.vscode/launch.json
384
+ !.vscode/extensions.json
385
+ *.code-workspace
386
+
387
+ # Local History for Visual Studio Code
388
+ .history/
389
+
390
+ # Windows Installer files from build outputs
391
+ *.cab
392
+ *.msi
393
+ *.msix
394
+ *.msm
395
+ *.msp
396
+
397
+ # JetBrains Rider
398
+ *.sln.iml
399
+
400
+ threeDFixer_weights
401
+ threeDFixer_weights/**
402
+
403
+ tmp
404
+ tmp/**
405
+
406
+ gradio_temp
407
+ gradio_temp/**
408
+
.gitmodules ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [submodule "threeDFixer/representations/mesh/flexicubes"]
2
+ path = threeDFixer/representations/mesh/flexicubes
3
+ url = https://github.com/MaxtirError/FlexiCubes.git
README.md CHANGED
@@ -4,12 +4,12 @@ emoji: 🦀
4
  colorFrom: blue
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 6.10.0
8
- python_version: '3.12'
9
  app_file: app.py
10
  pinned: false
11
  license: apache-2.0
12
  short_description: Create 3D Scene from a single image via In-Place Completion.
13
  ---
14
 
15
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
4
  colorFrom: blue
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.44.1
8
+ python_version: '3.10'
9
  app_file: app.py
10
  pinned: false
11
  license: apache-2.0
12
  short_description: Create 3D Scene from a single image via In-Place Completion.
13
  ---
14
 
15
+ This is the interactive demo of [3D-Fixer](https://zx-yin.github.io/3dfixer/).
app.py CHANGED
@@ -1,7 +1,860 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
1
+ # SPDX-FileCopyrightText: 2026 Ze-Xin Yin, Robot labs of Horizon Robotics, and D-Robotics
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ # See the LICENSE file in the project root for full license information.
4
+
5
+ import os
6
+ os.environ["GRADIO_TEMP_DIR"] = os.path.join(os.getcwd(), "gradio_temp")
7
+ os.makedirs(os.environ["GRADIO_TEMP_DIR"], exist_ok=True)
8
+ import uuid
9
+ from typing import Any, List, Optional, Union
10
+
11
+ import cv2
12
+ import torch
13
+ import numpy as np
14
+ from PIL import Image
15
+ import trimesh
16
+ import random
17
+ import imageio
18
+ from einops import repeat
19
+
20
+ from gradio_image_prompter import ImagePrompter
21
  import gradio as gr
22
 
23
+ from threeDFixer.pipelines import ThreeDFixerPipeline
24
+ from threeDFixer.datasets.utils import (
25
+ edge_mask_morph_gradient,
26
+ process_scene_image,
27
+ process_instance_image,
28
+ transform_vertices,
29
+ normalize_vertices,
30
+ project2ply
31
+ )
32
+ from threeDFixer.utils import render_utils, postprocessing_utils
33
+ from scripts.grounding_sam2 import plot_segmentation, segment
34
+ from sam2.build_sam import build_sam2
35
+ from sam2.sam2_image_predictor import SAM2ImagePredictor
36
+ import copy
37
+
38
+ import shutil
39
+ import time
40
+ from concurrent.futures import ThreadPoolExecutor
41
+
42
# User-facing help text rendered at the top of the Gradio app.
# (Typos fixed: "islolated" -> "isolated", "each instances" -> "each instance".)
MARKDOWN = """
## Image to 3D Scene with [3D-Fixer](https://zx-yin.github.io/3dfixer/)
1. Upload an image, and draw bounding boxes for each instance by holding and dragging the mouse. Then click "Run Segmentation" to generate the segmentation result.
2. If you find the generated 3D scene satisfactory, download it by clicking the "Download scene GLB" button, and you can also download each isolated 3D instance.
3. In this implementation, we generate each instance one by one, and update the scene results at the "Generated GLB" area; besides, we display isolated instances below.
4. It may take a while for the first inference due to the usage of ```torch.compile```.
"""
MAX_SEED = np.iinfo(np.int32).max  # upper bound when randomizing seeds
TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp")
EXAMPLE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets/example_data")
DTYPE = torch.float16
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
VALID_RATIO_THRESHOLD = 0.005  # skip instances covering < 0.5% of the image
CROP_SIZE = 518  # crop resolution used when preparing model inputs
# Per-session mutable state, populated by the Gradio callbacks below.
work_space = None            # current working directory under TMP_DIR
dpt_pack = None              # cached depth-estimation outputs (depth, K, c2w, ...)
generated_object_map = {}    # instance label -> exported GLB path
59
+
60
# Prepare models
## Grounding SAM
# NOTE(review): paths are relative to the process working directory — confirm
# the checkpoint and config are present in the deployed Space.
sam2_checkpoint = "./checkpoints/sam2-hiera-large/sam2_hiera_large.pt"
sam2_model_cfg = "configs/sam2/sam2_hiera_l.yaml"
sam2_predictor = SAM2ImagePredictor(
    build_sam2(sam2_model_cfg, sam2_checkpoint),
)

############## 3D-Fixer model
model_dir = 'HorizonRobotics/3D-Fixer'
pipeline = ThreeDFixerPipeline.from_pretrained(
    model_dir, compile=True
)
pipeline.cuda()
############## 3D-Fixer model

# Fixed 4x4 transform (negates X, swaps Y/Z) applied to every exported
# asset so the saved GLB/PLY files share one orientation convention.
rot = np.array([
    [-1.0, 0.0, 0.0, 0.0],
    [ 0.0, 0.0, 1.0, 0.0],
    [ 0.0, 1.0, 0.0, 0.0],
    [ 0.0, 0.0, 0.0, 1.0],
], dtype=np.float32)

# Fixed camera-to-world pose used when back-projecting depth to 3D points.
c2w = torch.tensor([
    [1.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, -1.0, 0.0],
    [0.0, 1.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 1.0],
], dtype=torch.float32, device=DEVICE)

# Export an (N, 3) point array with matching per-point colors to `fpath`.
save_projected_colored_pcd = lambda pts, pts_color, fpath: trimesh.PointCloud(pts.reshape(-1, 3), pts_color.reshape(-1, 3)).export(fpath)
91
+
92
# Gradio example rows. Each row is:
#   (image-prompter payload, segmentation-map path, seed, randomize_seed,
#    num_inference_steps, guidance_scale, cfg_interval_start,
#    cfg_interval_end, t_rescale)
EXAMPLES = [
    [
        {
            "image": "assets/example_data/scene1/rgb.png",
        },
        "assets/example_data/scene1/seg.png",
        1024,
        False,
        25, 5.5, 0.8, 1.0, 5.0
        # num_inference_steps, guidance_scale, cfg_interval_start, cfg_interval_end, t_rescale
    ],
    [
        {
            "image": "assets/example_data/scene2/rgb.png",
        },
        "assets/example_data/scene2/seg.png",
        1,
        False,
        25, 5.0, 0.8, 1.0, 5.0
    ],
    [
        {
            "image": "assets/example_data/scene3/rgb.png",
        },
        "assets/example_data/scene3/seg.png",
        1,
        False,
        25, 5.0, 0.8, 1.0, 5.0
    ],
    [
        {
            "image": "assets/example_data/scene4/rgb.png",
        },
        "assets/example_data/scene4/seg.png",
        42,
        False,
        25, 5.0, 0.8, 1.0, 5.0
    ],
    [
        {
            "image": "assets/example_data/scene5/rgb.png",
        },
        "assets/example_data/scene5/seg.png",
        1,
        False,
        25, 5.0, 0.8, 1.0, 5.0
    ],
    [
        {
            "image": "assets/example_data/scene6/rgb.png",
        },
        "assets/example_data/scene6/seg.png",
        1,
        False,
        25, 5.0, 0.8, 1.0, 5.0
    ]
]
149
+
150
def cleanup_tmp(tmp_root: str = "./tmp", expire_seconds: int = 3600) -> None:
    """Remove stale sub-directories under *tmp_root*.

    A sub-directory is stale when its modification time is older than
    ``expire_seconds``. Loose files directly under ``tmp_root`` are left
    untouched, and a failure to delete one entry is logged and skipped so a
    single bad directory cannot abort the sweep.

    Args:
        tmp_root: Root path of the temporary directory.
        expire_seconds: Expiry age in seconds; defaults to 3600 (one hour).
    """
    tmp_root = os.path.abspath(tmp_root)

    if not os.path.isdir(tmp_root):
        return

    now = time.time()

    for name in os.listdir(tmp_root):
        path = os.path.join(tmp_root, name)

        # Only clean sub-directories; never touch stray files.
        if not os.path.isdir(path):
            continue

        try:
            age = now - os.path.getmtime(path)
            if age > expire_seconds:
                shutil.rmtree(path, ignore_errors=False)
                print(f"[cleanup_tmp] removed old directory: {path}")
        except Exception as e:
            # Best-effort cleanup: report and continue with the next entry.
            print(f"[cleanup_tmp] failed to remove {path}: {e}")
181
+
182
@torch.no_grad()
def run_segmentation(
    image_prompts: Any,
    polygon_refinement: bool = True,
) -> Image.Image:
    """Segment user-drawn boxes with SAM2 and save the colorized label map.

    Args:
        image_prompts: ImagePrompter payload with an "image" (PIL image) and
            "points" — box prompts shaped [x1, y1, flag, x2, y2, flag].
        polygon_refinement: Whether the segmenter refines masks into polygons.

    Returns:
        The colorized segmentation map, also written to
        ``<work_space>/mask.png``.

    Raises:
        gr.Error: If no box prompts were drawn.
    """
    rgb_image = image_prompts["image"].convert("RGB")

    global work_space

    # pre-process the layers and get the xyxy boxes of each layer
    if len(image_prompts["points"]) == 0:
        # BUGFIX: gr.Error must be *raised* to surface in the Gradio UI;
        # previously it was constructed and discarded, silently returning None.
        raise gr.Error("No points provided for segmentation. Please add points to the image.")

    # Keep only the corner coordinates from each [x1, y1, flag, x2, y2, flag] prompt.
    boxes = [
        [
            [int(box[0]), int(box[1]), int(box[3]), int(box[4])]
            for box in image_prompts["points"]
        ]
    ]

    detections = segment(
        sam2_predictor,
        rgb_image,
        boxes=[boxes],
        polygon_refinement=polygon_refinement,
    )
    seg_map_pil = plot_segmentation(rgb_image, detections)

    torch.cuda.empty_cache()

    # Drop stale sessions, then create a fresh per-session workspace.
    cleanup_tmp(TMP_DIR, expire_seconds=3600)

    work_space = os.path.join(TMP_DIR, f"work_space_{uuid.uuid4()}")
    os.makedirs(work_space, exist_ok=True)
    seg_map_pil.save(os.path.join(work_space, 'mask.png'))

    return seg_map_pil
220
+
221
@torch.no_grad()
def run_depth_estimation(
    image_prompts: Any,
    seg_image: Union[str, Image.Image],
) -> str:
    """Estimate scene depth, cache camera/depth state, and export a point cloud.

    Runs the scene conditioning model on the uploaded image to obtain a depth
    map and normalized intrinsics, stores them (plus the foreground
    normalization translation/scale) in the module-level ``dpt_pack`` for the
    later generation step, and writes a normalized colored point cloud of the
    scene to ``<work_space>/scene_pcd.glb``.

    Args:
        image_prompts: ImagePrompter payload; only its "image" entry is used.
        seg_image: Instance segmentation map whose unique RGB colors label the
            foreground instances (pure black = background).
            NOTE(review): despite the ``Union[str, ...]`` annotation, a PIL
            image is required here (``.resize`` is called) — confirm callers.

    Returns:
        Path of the exported scene point-cloud file (changed annotation: the
        original declared ``Image.Image`` but always returned this path).
    """
    rgb_image = image_prompts["image"].convert("RGB")

    # Force a fixed model input resolution.
    rgb_image = rgb_image.resize((1024, 1024), Image.Resampling.LANCZOS)

    global dpt_pack
    global work_space
    if work_space is None:
        work_space = os.path.join(TMP_DIR, f"work_space_{uuid.uuid4()}")
        os.makedirs(work_space, exist_ok=True)
    global generated_object_map

    # A fresh depth estimate invalidates previously generated objects.
    generated_object_map = {}

    # Downscale so the longest side is at most 1024. (A no-op after the fixed
    # 1024x1024 resize above; kept for robustness should that resize change.)
    origin_W, origin_H = rgb_image.size
    if max(origin_H, origin_W) > 1024:
        factor = max(origin_H, origin_W) / 1024
        H = int(origin_H // factor)
        W = int(origin_W // factor)
        rgb_image = rgb_image.resize((W, H), Image.Resampling.LANCZOS)
    W, H = rgb_image.size

    input_image = np.array(rgb_image).astype(np.float32)
    input_image = torch.tensor(input_image / 255, dtype=torch.float32, device=DEVICE).permute(2, 0, 1)

    output = pipeline.models['scene_cond_model'].infer(input_image)
    depth = output['depth']
    intrinsics = output['intrinsics']

    # Mask out NaN/Inf depth predictions and zero them so downstream math is safe.
    invalid_mask = torch.logical_or(torch.isnan(depth), torch.isinf(depth))
    depth_mask = ~invalid_mask

    depth = torch.where(invalid_mask, 0.0, depth)
    # Denormalize intrinsics to pixel units; principal point at the image center.
    K = torch.from_numpy(
        np.array([
            [intrinsics[0, 0].item() * W, 0, 0.5*W],
            [0, intrinsics[1, 1].item() * H, 0.5*H],
            [0, 0, 1]
        ])
    ).to(dtype=torch.float32, device=DEVICE)

    dpt_pack = {
        'c2w': c2w,
        'K': K,
        'depth_mask': depth_mask,
        'depth': depth
    }

    # Collect instance colors BEFORE resizing so no label is lost, then resize
    # with NEAREST. BUGFIX: the previous LANCZOS resampling interpolates a
    # label map, blending colors at instance boundaries and breaking the exact
    # color-equality mask test below.
    instance_labels = np.unique(np.array(seg_image).reshape(-1, 3), axis=0)
    seg_image = seg_image.resize((W, H), Image.Resampling.NEAREST)
    seg_image = np.array(seg_image)

    # One boolean mask per non-background (non-black) instance color.
    mask_pack = []
    for instance_label in instance_labels:
        if (instance_label == np.array([0, 0, 0])).all():
            continue
        else:
            instance_mask = (seg_image.reshape(-1, 3) == instance_label).all(axis=-1).reshape(H, W)
            mask_pack.append(instance_mask)
    fg_mask = torch.from_numpy(np.stack(mask_pack).any(axis=0)).to(DEVICE)

    # Back-project the whole valid depth map to a colored point cloud.
    scene_est_depth_pts, scene_est_depth_pts_colors = \
        project2ply(depth_mask, depth, input_image, K, c2w)
    save_ply_path = os.path.join(work_space, "scene_pcd.glb")

    # Normalization (translation/scale) is computed from the foreground points
    # only, so generated objects later land in the same normalized frame.
    fg_depth_pts, _ = \
        project2ply(fg_mask, depth, input_image, K, c2w)
    _, trans, scale = normalize_vertices(fg_depth_pts)

    if trans.shape[0] == 1:
        trans = trans[0]

    dpt_pack.update(
        {
            "trans": trans,
            "scale": scale,
        }
    )

    trimesh.PointCloud(scene_est_depth_pts.reshape(-1, 3), scene_est_depth_pts_colors.reshape(-1, 3)).\
        apply_translation(-trans).apply_scale(1. / (scale + 1e-6)).\
        apply_transform(rot).export(save_ply_path)

    torch.cuda.empty_cache()

    return save_ply_path
311
+
312
+
313
def save_image(img, save_path):
    """Write a CHW float tensor with values in [0, 1] to *save_path* as an 8-bit image."""
    hwc = img.permute(1, 2, 0).detach().cpu().numpy()
    imageio.v3.imwrite(save_path, (hwc * 255.).astype(np.uint8))
316
+
317
def set_random_seed(seed):
    """Seed numpy, the stdlib RNG, and torch (including CUDA) for reproducibility."""
    # Same seeding order as before: numpy, random, torch, then CUDA devices.
    for seeder in (np.random.seed, random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
323
+
324
def export_single_glb_from_outputs(
    outputs,
    fine_scale,
    fine_trans,
    coarse_scale,
    coarse_trans,
    trans,
    scale,
    rot,
    work_space,
    instance_name,
    run_id
):
    """Bake one generated instance into a GLB file and export it.

    The instance geometry is mapped back into scene coordinates via the
    fine/coarse scale-translation chain, then the exported mesh is normalized
    with (trans, scale) and reoriented with *rot* — the same transform used
    for the scene point cloud — so every export shares one frame.

    Returns:
        (instance_glb_path, glb): absolute path of the written file and the
        trimesh object itself (kept around for assembling the scene GLB).
    """
    def _to_scene_coords(x):
        # scale -> translate (fine), then scale -> translate (coarse).
        return transform_vertices(
            x,
            ops=["scale", "translation", "scale", "translation"],
            params=[fine_scale, fine_trans[None], coarse_scale, coarse_trans[None]],
        )

    # NOTE(review): to_glb is run with gradients enabled — presumably its
    # texture baking needs autograd; confirm before removing.
    with torch.enable_grad():
        glb = postprocessing_utils.to_glb(
            outputs["gaussian"][0],
            outputs["mesh"][0],
            simplify=0.95,
            texture_size=1024,
            transform_fn=_to_scene_coords,
            verbose=False
        )

    instance_glb_path = os.path.abspath(
        os.path.join(work_space, f"{run_id}_{instance_name}.glb")
    )

    normalized = glb.apply_translation(-trans).apply_scale(1.0 / (scale + 1e-6))
    normalized.apply_transform(rot).export(instance_glb_path)

    return instance_glb_path, glb
362
+
363
+
364
def export_scene_glb(trimeshes, work_space, scene_name):
    """Merge *trimeshes* into one scene GLB under *work_space*; return its absolute path."""
    destination = os.path.abspath(os.path.join(work_space, scene_name))
    trimesh.Scene(trimeshes).export(destination)
    return destination
369
+
370
+ @torch.no_grad()
371
+ def run_generation(
372
+ rgb_image: Any,
373
+ seg_image: Union[str, Image.Image],
374
+ seed: int,
375
+ randomize_seed: bool = False,
376
+ num_inference_steps: int = 50,
377
+ guidance_scale: float = 5.0,
378
+ cfg_interval_start: float = 0.5,
379
+ cfg_interval_end: float = 1.0,
380
+ t_rescale: float = 3.0,
381
+ ):
382
+ global dpt_pack
383
+ global work_space
384
+ global generated_object_map
385
+ generated_object_map = {}
386
+ run_id = str(uuid.uuid4())
387
+
388
+ if not isinstance(rgb_image, Image.Image) and "image" in rgb_image:
389
+ rgb_image = rgb_image["image"]
390
+
391
+ instance_labels = np.unique(np.array(seg_image).reshape(-1, 3), axis=0)
392
+ if randomize_seed:
393
+ seed = random.randint(0, MAX_SEED)
394
+ set_random_seed(seed)
395
+
396
+ H, W = dpt_pack['depth_mask'].shape
397
+ rgb_image = rgb_image.resize((W, H), Image.Resampling.LANCZOS)
398
+ seg_image = seg_image.resize((W, H), Image.Resampling.LANCZOS)
399
+
400
+ depth_mask = dpt_pack['depth_mask'].detach().cpu().numpy() > 0
401
+ seg_image = np.array(seg_image)
402
+
403
+ mask_pack = []
404
+ for instance_label in instance_labels:
405
+ if (instance_label == np.array([0, 0, 0])).all():
406
+ continue
407
+ instance_mask = (seg_image.reshape(-1, 3) == instance_label).all(axis=-1).reshape(H, W)
408
+ mask_pack.append(instance_mask)
409
+
410
+ erode_kernel_size = 7
411
+ kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (erode_kernel_size, erode_kernel_size))
412
+ results = []
413
+ trimeshes = []
414
+
415
+ trans = dpt_pack['trans']
416
+ scale = dpt_pack['scale']
417
+
418
+ current_scene_path = None
419
+ pending_exports = []
420
+
421
+ def build_stream_html(status_text: str):
422
+ cards_html = "".join([
423
+ f"""
424
+ <div style="
425
+ width: 220px;
426
+ border: 1px solid #ddd;
427
+ border-radius: 10px;
428
+ padding: 8px;
429
+ background: white;
430
+ box-sizing: border-box;
431
+ ">
432
+ <div style="font-weight: 600; margin-bottom: 6px;">
433
+ {item["name"]}
434
+ </div>
435
+
436
+ <video
437
+ autoplay
438
+ muted
439
+ loop
440
+ playsinline
441
+ preload="metadata"
442
+ poster="/file={item['poster_path']}?v={run_id}"
443
+ style="
444
+ width: 100%;
445
+ border-radius: 8px;
446
+ display: block;
447
+ background: #f5f5f5;
448
+ "
449
+ >
450
+ <source src="/file={item['mp4_path']}?v={run_id}" type="video/mp4">
451
+ </video>
452
+
453
+ <div style="
454
+ margin-top: 6px;
455
+ font-size: 12px;
456
+ color: #666;
457
+ ">
458
+ Status: {item.get("status_text", "Unknown")}
459
+ </div>
460
+
461
+ <div style="
462
+ margin-top: 4px;
463
+ font-size: 13px;
464
+ color: #444;
465
+ word-break: break-all;
466
+ ">
467
+ {os.path.basename(item["glb_path"]) if item["glb_path"] is not None else "GLB not ready yet"}
468
+ </div>
469
+ </div>
470
+ """
471
+ for item in results
472
+ ])
473
+
474
+ return f"""
475
+ <div style="padding: 8px 0;">
476
+ <div style="font-weight: 700; margin-bottom: 8px;">Status: {status_text}</div>
477
+ <div style="font-weight: 700; margin-bottom: 12px;">Generated objects: {len(results)}</div>
478
+ <div style="display: flex; flex-wrap: wrap; gap: 12px; align-items: flex-start;">
479
+ {cards_html}
480
+ </div>
481
+ </div>
482
+ """
483
+
484
+ def build_selector_and_download_updates(default_latest: bool = True):
485
+ object_choices = [item["name"] for item in results if item["glb_path"] is not None]
486
+
487
+ if len(object_choices) == 0:
488
+ return (
489
+ gr.update(choices=[], value=None),
490
+ gr.update(value=None, interactive=False),
491
+ )
492
+
493
+ selected_value = object_choices[-1] if default_latest else object_choices[0]
494
+ selected_path = generated_object_map[selected_value]
495
+
496
+ return (
497
+ gr.update(choices=object_choices, value=selected_value),
498
+ gr.update(value=selected_path, interactive=True),
499
+ )
500
+
501
def flush_finished_exports(status_text: str):
    """Fold every finished GLB-export future into the running scene.

    For each completed future: record the instance GLB path on its
    result entry, append the mesh to the accumulated scene, and
    re-export the combined scene GLB so the viewer stays current.

    Returns a 5-tuple of UI values (scene path, stream HTML, scene
    download update, selector update, single-download update) when
    anything changed, otherwise None.
    """
    nonlocal current_scene_path, trimeshes, pending_exports

    any_update = False
    finished_items = []

    # Snapshot finished items first; removing from pending_exports while
    # iterating it would skip entries.
    for item in pending_exports:
        if item["future"].done():
            finished_items.append(item)

    for item in finished_items:
        pending_exports.remove(item)

        result_index = item["result_index"]
        object_label = item["object_label"]
        future = item["future"]

        try:
            instance_glb_path, glb = future.result()
        except Exception as e:
            # An export failure only marks this one object; generation of
            # the remaining objects continues.
            print(f"[export_glb][error] instance={item['instance_name']}: {e}")
            results[result_index]["status_text"] = "GLB export failed"
            any_update = True
            continue

        results[result_index]["glb_path"] = instance_glb_path
        results[result_index]["status_text"] = "GLB ready"
        generated_object_map[object_label] = instance_glb_path

        # Re-export the whole scene after every new object so each step
        # produces a distinct, viewable snapshot file.
        trimeshes.append(glb)
        current_scene_path = export_scene_glb(
            trimeshes=trimeshes,
            work_space=work_space,
            scene_name=f"{run_id}_scene_step_{len(trimeshes)}.glb",
        )
        any_update = True

    if any_update:
        selector_update, single_download_update = build_selector_and_download_updates(default_latest=True)
        return (
            current_scene_path,
            build_stream_html(status_text),
            gr.update(value=current_scene_path, interactive=(current_scene_path is not None)),
            selector_update,
            single_download_update,
        )

    return None
549
+
550
+ yield (
551
+ None,
552
+ build_stream_html("Generating..."),
553
+ gr.update(value=None, interactive=False),
554
+ gr.update(choices=[], value=None),
555
+ gr.update(value=None, interactive=False),
556
+ )
557
+
558
+ with ThreadPoolExecutor(max_workers=1) as executor:
559
+ for instance_name, object_mask in enumerate(mask_pack):
560
+ try:
561
+ flushed = flush_finished_exports("Generating...")
562
+ if flushed is not None:
563
+ yield flushed
564
+
565
+ est_depth = dpt_pack['depth'].to('cpu')
566
+ c2w = dpt_pack['c2w'].to('cpu')
567
+ K = dpt_pack['K'].to('cpu')
568
+
569
+ intrinsics = dpt_pack['K'].float().to(DEVICE)
570
+ extrinsics = copy.deepcopy(dpt_pack['c2w']).float().to(DEVICE)
571
+ extrinsics[:3, 1:3] *= -1
572
+
573
+ object_mask = object_mask > 0
574
+ instance_mask = np.logical_and(object_mask, depth_mask).astype(np.uint8)
575
+ valid_ratio = np.sum((instance_mask > 0).astype(np.float32)) / (H * W)
576
+ print(f'valid ratio of {instance_name}: {valid_ratio:.4f}')
577
+ if valid_ratio < VALID_RATIO_THRESHOLD:
578
+ continue
579
+
580
+ edge_mask = edge_mask_morph_gradient(instance_mask, kernel, 3)
581
+ fg_mask = (instance_mask > edge_mask).astype(np.uint8)
582
+ color_mask = fg_mask.astype(np.float32) + edge_mask.astype(np.float32) * 0.5
583
+
584
+ image = rgb_image
585
+ scene_image, scene_image_masked = process_scene_image(image, instance_mask, CROP_SIZE)
586
+ instance_image, instance_mask, instance_rays_o, instance_rays_d, instance_rays_c, \
587
+ instance_rays_t = process_instance_image(image, instance_mask, color_mask, est_depth, K, c2w, CROP_SIZE)
588
+
589
+ save_image(scene_image, os.path.join(work_space, f'input_scene_image_{instance_name}.png'))
590
+ save_image(scene_image_masked, os.path.join(work_space, f'input_scene_image_masked_{instance_name}.png'))
591
+ save_image(instance_image, os.path.join(work_space, f'input_instance_image_{instance_name}.png'))
592
+ save_image(
593
+ torch.cat([instance_image, instance_mask]),
594
+ os.path.join(work_space, f'input_instance_image_masked_{instance_name}.png')
595
+ )
596
+
597
+ pcd_points = (
598
+ instance_rays_o.to(DEVICE) +
599
+ instance_rays_d.to(DEVICE) * instance_rays_t[..., None].to(DEVICE)
600
+ ).detach().cpu().numpy()
601
+ pcd_colors = instance_rays_c
602
+
603
+ save_projected_colored_pcd(
604
+ pcd_points,
605
+ repeat(pcd_colors, 'n -> n c', c=3),
606
+ f"{work_space}/instance_est_depth_{instance_name}.ply"
607
+ )
608
+
609
+ outputs, coarse_trans, coarse_scale, fine_trans, fine_scale = pipeline.run(
610
+ torch.cat([instance_image, instance_mask]).to(DEVICE),
611
+ scene_image_masked=scene_image_masked.to(DEVICE),
612
+ seed=seed,
613
+ extrinsics=extrinsics.to(DEVICE),
614
+ intrinsics=intrinsics.to(DEVICE),
615
+ points=pcd_points,
616
+ points_mask=pcd_colors,
617
+ sparse_structure_sampler_params={
618
+ "steps": num_inference_steps,
619
+ "cfg_strength": guidance_scale,
620
+ "cfg_interval": [cfg_interval_start, cfg_interval_end],
621
+ "rescale_t": t_rescale
622
+ },
623
+ slat_sampler_params={
624
+ "steps": num_inference_steps,
625
+ "cfg_strength": guidance_scale,
626
+ "cfg_interval": [cfg_interval_start, cfg_interval_end],
627
+ "rescale_t": t_rescale
628
+ }
629
+ )
630
+
631
+ mp4_path = os.path.abspath(
632
+ os.path.join(work_space, f"{run_id}_instance_gs_fine_{instance_name}.mp4")
633
+ )
634
+ poster_path = os.path.abspath(
635
+ os.path.join(work_space, f"{run_id}_instance_gs_fine_{instance_name}.png")
636
+ )
637
+
638
+ video = render_utils.render_video(
639
+ outputs["gaussian"][0],
640
+ bg_color=(1.0, 1.0, 1.0)
641
+ )["color"]
642
+ imageio.mimsave(mp4_path, video, fps=30)
643
+ imageio.imwrite(poster_path, video[0])
644
+
645
+ object_label = f"Object {len(results) + 1}"
646
+ result_index = len(results)
647
+
648
+ results.append({
649
+ "name": object_label,
650
+ "mp4_path": mp4_path,
651
+ "poster_path": poster_path,
652
+ "glb_path": None,
653
+ "instance_index": instance_name,
654
+ "status_text": "Exporting GLB...",
655
+ })
656
+
657
+ # First update: surface the rendered video right away while keeping the current 3D scene unchanged
658
+ yield (
659
+ current_scene_path,
660
+ build_stream_html("Generating..."),
661
+ gr.update(value=current_scene_path, interactive=(current_scene_path is not None)),
662
+ gr.update(choices=[], value=None),
663
+ gr.update(value=None, interactive=False),
664
+ )
665
+
666
+ future = executor.submit(
667
+ export_single_glb_from_outputs,
668
+ outputs=outputs,
669
+ fine_scale=fine_scale,
670
+ fine_trans=fine_trans,
671
+ coarse_scale=coarse_scale,
672
+ coarse_trans=coarse_trans,
673
+ trans=trans,
674
+ scale=scale,
675
+ rot=rot,
676
+ work_space=work_space,
677
+ instance_name=instance_name,
678
+ run_id=run_id,
679
+ )
680
+
681
+ pending_exports.append({
682
+ "future": future,
683
+ "result_index": result_index,
684
+ "instance_name": instance_name,
685
+ "object_label": object_label,
686
+ })
687
+
688
+ flushed = flush_finished_exports("Generating...")
689
+ if flushed is not None:
690
+ yield flushed
691
+
692
+ except Exception as e:
693
+ print(e)
694
+
695
+ while len(pending_exports) > 0:
696
+ flushed = flush_finished_exports("Generating...")
697
+ if flushed is not None:
698
+ yield flushed
699
+ else:
700
+ time.sleep(0.2)
701
+
702
+ ready_items = [item for item in results if item["glb_path"] is not None]
703
+ if len(ready_items) > 0:
704
+ final_scene_path = export_scene_glb(
705
+ trimeshes=trimeshes,
706
+ work_space=work_space,
707
+ scene_name=f"{run_id}_scene_final.glb",
708
+ )
709
+
710
+ selector_update, single_download_update = build_selector_and_download_updates(default_latest=True)
711
+
712
+ yield (
713
+ final_scene_path,
714
+ build_stream_html("Finished"),
715
+ gr.update(value=final_scene_path, interactive=True),
716
+ selector_update,
717
+ single_download_update,
718
+ )
719
+ else:
720
+ yield (
721
+ None,
722
+ "<div style='padding: 8px 0;'><b>Status:</b> No valid object generated.</div>",
723
+ gr.update(value=None, interactive=False),
724
+ gr.update(choices=[], value=None),
725
+ gr.update(value=None, interactive=False),
726
+ )
727
+
728
def update_single_download(selected_name):
    """Point the single-GLB download button at the selected object's file.

    Disables the button when nothing is selected or the selection has no
    exported GLB registered in ``generated_object_map``.
    """
    global generated_object_map

    has_valid_choice = selected_name is not None and selected_name in generated_object_map
    if not has_valid_choice:
        return gr.update(value=None, interactive=False)

    return gr.update(value=generated_object_map[selected_name], interactive=True)
735
+
736
# Demo: three-step pipeline UI — (1) segmentation, (2) depth estimation,
# (3) per-object 3D generation streamed into a combined scene GLB.
with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)

    with gr.Column():
        with gr.Row():
            # Input image with point/box prompts for segmentation.
            image_prompts = ImagePrompter(label="Input Image", type="pil")
            seg_image = gr.Image(
                label="Segmentation Result", type="pil", format="png"
            )
            with gr.Column():
                with gr.Accordion("Segmentation Settings", open=True):
                    polygon_refinement = gr.Checkbox(label="Polygon Refinement", value=False)
                seg_button = gr.Button("Run Segmentation (step 1)")
                dpt_button = gr.Button("Run Depth estimation (step 2)", variant="primary")
        with gr.Row():
            dpt_model_output = gr.Model3D(label="Estimated depth map", interactive=False)
            model_output = gr.Model3D(label="Generated GLB", interactive=False)
            with gr.Column():
                with gr.Accordion("Generation Settings", open=True):
                    seed = gr.Slider(
                        label="Seed",
                        minimum=0,
                        maximum=MAX_SEED,
                        step=1,
                        value=42,
                    )
                    randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
                    num_inference_steps = gr.Slider(
                        label="Number of inference steps",
                        minimum=1,
                        maximum=50,
                        step=1,
                        value=25,
                    )
                    with gr.Row():
                        # Classifier-free guidance is applied only inside
                        # the [start, end] fraction of sampling steps.
                        cfg_interval_start = gr.Slider(
                            label="CFG interval start",
                            minimum=0.0,
                            maximum=1.0,
                            step=0.01,
                            value=0.8,
                        )
                        cfg_interval_end = gr.Slider(
                            label="CFG interval end",
                            minimum=0.0,
                            maximum=1.0,
                            step=0.01,
                            value=1.0,
                        )
                    t_rescale = gr.Slider(
                        label="t rescale factor",
                        minimum=1.0,
                        maximum=5.0,
                        step=0.1,
                        value=5.0,
                    )
                    guidance_scale = gr.Slider(
                        label="CFG scale",
                        minimum=0.0,
                        maximum=10.0,
                        step=0.1,
                        value=5.0,
                    )
                # Disabled until depth estimation has produced its output.
                gen_button = gr.Button("Run Generation (step 3)", variant="primary", interactive=False)
                download_glb = gr.DownloadButton(label="Download scene GLB", interactive=False)
                with gr.Row():
                    object_selector = gr.Dropdown(label="Choose instance: ")
                    download_single_glb = gr.DownloadButton(label="Download single GLB", interactive=False)

        # Live HTML cards for each generated object (video + status).
        stream_output = gr.HTML(label="Generated Objects Stream")
        with gr.Row():
            # NOTE(review): run_generation yields 5-tuples (scene, stream HTML,
            # scene download, selector, single download) but only 3 outputs are
            # declared here — confirm this mismatch is harmless with
            # cache_examples=False (fn is not invoked on click by default).
            gr.Examples(
                examples=EXAMPLES,
                fn=run_generation,
                inputs=[image_prompts, seg_image, seed, randomize_seed, num_inference_steps, guidance_scale, cfg_interval_start, cfg_interval_end, t_rescale],
                outputs=[model_output, download_glb, seed],
                cache_examples=False,
            )

    # Step 1: segmentation; on completion unlock the depth-estimation button.
    seg_button.click(
        run_segmentation,
        inputs=[
            image_prompts,
            polygon_refinement,
        ],
        outputs=[seg_image],
    ).then(lambda: gr.Button(interactive=True), outputs=[dpt_button])

    # Step 2: depth estimation; on completion unlock the generation button.
    dpt_button.click(
        run_depth_estimation,
        inputs=[
            image_prompts,
            seg_image
        ],
        outputs=[dpt_model_output],
    ).then(lambda: gr.Button(interactive=True), outputs=[gen_button])

    # Step 3: streaming generation; outputs match run_generation's yields.
    gen_button.click(
        run_generation,
        inputs=[
            image_prompts,
            seg_image,
            seed,
            randomize_seed,
            num_inference_steps,
            guidance_scale,
            cfg_interval_start,
            cfg_interval_end,
            t_rescale
        ],
        outputs=[model_output,
                 stream_output,
                 download_glb,
                 object_selector,
                 download_single_glb],
    )

    # Re-point the single-object download whenever the selection changes.
    object_selector.change(
        update_single_download,
        inputs=[object_selector],
        outputs=[download_single_glb],
    )

demo.launch(allowed_paths=[TMP_DIR, EXAMPLE_DIR])
 
requirements.txt ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ packaging
2
+ wheel
3
+ pybind11
4
+ ninja
5
+ Cython
6
+ torch==2.4.0+cu118
7
+ torchvision==0.19.0+cu118
8
+ pillow
9
+ imageio
10
+ imageio-ffmpeg
11
+ tqdm
12
+ easydict
13
+ opencv-python-headless
14
+ scipy
15
+ rembg
16
+ onnxruntime
17
+ trimesh
18
+ open3d
19
+ xatlas
20
+ pyvista
21
+ pymeshfix
22
+ igraph
23
+ transformers
24
+ icecream
25
+ plyfile
26
+ pycocotools
27
+ shapely
28
+ git+https://github.com/EasternJournalist/utils3d.git@9a4eb15e4021b67b12c460c7057d642626897ec8
29
+ flash-attn
30
+ kaolin==0.17.0
31
+ spconv-cu118
32
+ gradio==4.44.1
33
+ gradio_image_prompter
scripts/grounding_sam.py ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from https://github.com/VAST-AI-Research/MIDI-3D
2
+ # Original license: Apache-2.0 license
3
+ # Copyright (c) the MIDI-3D authors
4
+
5
+ import argparse
6
+ import os
7
+ import random
8
+ from dataclasses import dataclass
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
+
11
+ import cv2
12
+ import numpy as np
13
+ import requests
14
+ import torch
15
+ from PIL import Image
16
+ from transformers import AutoModelForMaskGeneration, AutoProcessor, pipeline
17
+
18
+
19
def create_palette():
    """Build a flat 768-value (256-entry) RGB palette for labels 0-23.

    Entries beyond label 23 are zero-filled, so unknown labels render
    as black.
    """
    # (R, G, B) triplets for labels 0-23, flattened below into the
    # PIL "P"-mode layout [R0, G0, B0, R1, G1, B1, ...].
    label_colors = [
        (0, 0, 0),        # Label 0 (black)
        (255, 0, 0),      # Label 1 (red)
        (0, 255, 0),      # Label 2 (green)
        (0, 0, 255),      # Label 3 (blue)
        (255, 255, 0),    # Label 4 (yellow)
        (255, 0, 255),    # Label 5 (magenta)
        (0, 255, 255),    # Label 6 (cyan)
        (128, 0, 0),      # Label 7 (dark red)
        (0, 128, 0),      # Label 8 (dark green)
        (0, 0, 128),      # Label 9 (dark blue)
        (128, 128, 0),    # Label 10
        (128, 0, 128),    # Label 11
        (0, 128, 128),    # Label 12
        (64, 0, 0),       # Label 13
        (0, 64, 0),       # Label 14
        (0, 0, 64),       # Label 15
        (64, 64, 0),      # Label 16
        (64, 0, 64),      # Label 17
        (0, 64, 64),      # Label 18
        (192, 192, 192),  # Label 19 (light gray)
        (128, 128, 128),  # Label 20 (gray)
        (255, 165, 0),    # Label 21 (orange)
        (75, 0, 130),     # Label 22 (indigo)
        (238, 130, 238),  # Label 23 (violet)
    ]
    flat = [channel for color in label_colors for channel in color]
    # Extend the palette to have 768 values (256 * 3)
    flat.extend([0] * (768 - len(flat)))
    return flat
98
+
99
+
100
# Module-level default palette shared by callers that don't need a copy.
PALETTE = create_palette()
101
+
102
+
103
+ # Result Utils
104
@dataclass
class BoundingBox:
    """Axis-aligned box in pixel coordinates (top-left / bottom-right)."""

    xmin: int
    ymin: int
    xmax: int
    ymax: int

    @property
    def xyxy(self) -> List[float]:
        """Return the box as a flat [xmin, ymin, xmax, ymax] list."""
        return list((self.xmin, self.ymin, self.xmax, self.ymax))
114
+
115
+
116
@dataclass
class DetectionResult:
    """One detection: confidence score, text label, box, and optional mask."""

    score: Optional[float] = None
    label: Optional[str] = None
    box: Optional[BoundingBox] = None
    # Fixed annotation: np.ndarray is the array *type*; np.array is a
    # factory function and is not valid as a type hint.
    mask: Optional[np.ndarray] = None

    @classmethod
    def from_dict(cls, detection_dict: Dict) -> "DetectionResult":
        """Build a DetectionResult from a grounding-DINO pipeline result dict."""
        box = detection_dict["box"]
        return cls(
            score=detection_dict["score"],
            label=detection_dict["label"],
            box=BoundingBox(
                xmin=box["xmin"],
                ymin=box["ymin"],
                xmax=box["xmax"],
                ymax=box["ymax"],
            ),
        )
135
+
136
+
137
+ # Utils
138
def mask_to_polygon(mask: np.ndarray) -> List[List[int]]:
    """Return the vertex list of the largest external contour of a binary mask."""
    # External contours only; compressed point chains are sufficient here.
    contours, _ = cv2.findContours(
        mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    # Keep only the biggest region; smaller blobs are treated as noise.
    biggest = max(contours, key=cv2.contourArea)

    # OpenCV returns shape (N, 1, 2); flatten to plain [x, y] pairs.
    return biggest.reshape(-1, 2).tolist()
151
+
152
+
153
def polygon_to_mask(
    polygon: List[Tuple[int, int]], image_shape: Tuple[int, int]
) -> np.ndarray:
    """Rasterize a polygon into a filled uint8 mask.

    Args:
        polygon: (x, y) vertices of the polygon.
        image_shape: (height, width) of the output mask.

    Returns:
        uint8 mask with the polygon interior set to 255, elsewhere 0.
    """
    canvas = np.zeros(image_shape, dtype=np.uint8)
    vertices = np.array(polygon, dtype=np.int32)
    # Fill with white (255) directly on the canvas in place.
    cv2.fillPoly(canvas, [vertices], color=(255,))
    return canvas
176
+
177
+
178
def load_image(image_str: str) -> Image.Image:
    """Open an image from a URL or a local path and convert it to RGB."""
    if image_str.startswith("http"):
        # Stream the response body straight into PIL without buffering.
        raw = requests.get(image_str, stream=True).raw
        return Image.open(raw).convert("RGB")
    return Image.open(image_str).convert("RGB")
185
+
186
+
187
def get_boxes(results: List["DetectionResult"]) -> List[List[List[float]]]:
    """Collect xyxy boxes from detections, batched for the SAM processor.

    Fixed annotation: the parameter is a *list* of DetectionResult (the
    function iterates it), not a single DetectionResult. The extra outer
    list is the single-image batch dimension expected by
    ``processor(..., input_boxes=...)``.
    """
    return [[result.box.xyxy for result in results]]
194
+
195
+
196
def refine_masks(
    masks: torch.BoolTensor, polygon_refinement: bool = False
) -> List[np.ndarray]:
    """Convert (N, C, H, W) mask tensors into a list of uint8 HxW masks.

    Channels are averaged and thresholded at 0. With
    ``polygon_refinement`` each mask is replaced by its filled largest
    contour, removing holes and stray blobs.
    """
    channel_mean = masks.cpu().float().permute(0, 2, 3, 1).mean(axis=-1)
    binary = (channel_mean > 0).int().numpy().astype(np.uint8)
    refined = list(binary)

    if polygon_refinement:
        refined = [
            polygon_to_mask(mask_to_polygon(m), m.shape) for m in refined
        ]

    return refined
214
+
215
+
216
+ # Post-processing Utils
217
def generate_colored_segmentation(label_image):
    """Wrap a 2-D label array in a palettized ("P"-mode) PIL image."""
    paletted = Image.fromarray(label_image.astype(np.uint8), mode="P")
    # Attach the shared label palette so each label id maps to its color.
    paletted.putpalette(create_palette())
    return paletted
226
+
227
+
228
def plot_segmentation(image, detections):
    """Render detections as a palettized label map sized like ``image``.

    Pixel value i marks the i-th detection (1-based); later detections
    overwrite earlier ones where their masks overlap.
    """
    # PIL size is (W, H); the numpy map wants (H, W).
    label_map = np.zeros(image.size[::-1], dtype=np.uint8)
    for index, detection in enumerate(detections, start=1):
        label_map[detection.mask > 0] = index
    return generate_colored_segmentation(label_map)
235
+
236
+
237
+ # Grounded SAM
238
def prepare_model(
    device: str = "cuda",
    detector_id: Optional[str] = None,
    segmenter_id: Optional[str] = None,
):
    """Load the zero-shot detector and the SAM mask-generation model.

    Defaults to "IDEA-Research/grounding-dino-tiny" and
    "facebook/sam-vit-base" when ids are omitted; both models are
    placed on ``device``. Returns (object_detector, processor,
    segmentator).
    """
    detector_id = (
        detector_id if detector_id is not None else "IDEA-Research/grounding-dino-tiny"
    )
    object_detector = pipeline(
        model=detector_id, task="zero-shot-object-detection", device=device
    )

    segmenter_id = segmenter_id if segmenter_id is not None else "facebook/sam-vit-base"
    processor = AutoProcessor.from_pretrained(segmenter_id)
    segmentator = AutoModelForMaskGeneration.from_pretrained(segmenter_id).to(device)

    return object_detector, processor, segmentator
255
+
256
+
257
def detect(
    object_detector: Any,
    image: Image.Image,
    labels: List[str],
    threshold: float = 0.3,
) -> List[Dict[str, Any]]:
    """
    Use Grounding DINO to detect a set of labels in an image in a zero-shot fashion.
    """
    # Grounding DINO prompts are expected to end with a period.
    prompts = [label if label.endswith(".") else label + "." for label in labels]

    raw_results = object_detector(image, candidate_labels=prompts, threshold=threshold)
    return [DetectionResult.from_dict(item) for item in raw_results]
272
+
273
+
274
def segment(
    processor: Any,
    segmentator: Any,
    image: Image.Image,
    boxes: Optional[List[List[List[float]]]] = None,
    detection_results: Optional[List[Dict[str, Any]]] = None,
    polygon_refinement: bool = False,
) -> List[DetectionResult]:
    """
    Use Segment Anything (SAM) to generate masks given an image + a set of bounding boxes.

    Either ``boxes`` or ``detection_results`` must be supplied; when only
    detections are given, their boxes are extracted via ``get_boxes``.
    Masks are written back onto the detections in order.
    """
    if detection_results is None and boxes is None:
        raise ValueError(
            "Either detection_results or detection_boxes must be provided."
        )

    if boxes is None:
        boxes = get_boxes(detection_results)

    inputs = processor(images=image, input_boxes=boxes, return_tensors="pt").to(
        segmentator.device, segmentator.dtype
    )

    outputs = segmentator(**inputs)
    # post_process_masks returns one entry per image; this is a
    # single-image batch, hence the trailing [0].
    masks = processor.post_process_masks(
        masks=outputs.pred_masks,
        original_sizes=inputs.original_sizes,
        reshaped_input_sizes=inputs.reshaped_input_sizes,
    )[0]

    masks = refine_masks(masks, polygon_refinement)

    if detection_results is None:
        detection_results = [DetectionResult() for _ in masks]

    # NOTE(review): assumes detections and masks align one-to-one; zip
    # silently truncates if their lengths differ — confirm upstream.
    for detection_result, mask in zip(detection_results, masks):
        detection_result.mask = mask

    return detection_results
313
+
314
+
315
def grounded_segmentation(
    object_detector,
    processor,
    segmentator,
    image: Union[Image.Image, str],
    labels: Union[str, List[str]],
    threshold: float = 0.3,
    polygon_refinement: bool = False,
) -> Tuple[np.ndarray, List[DetectionResult], Image.Image]:
    """Detect ``labels`` in ``image``, segment each detection, render a map.

    Accepts an image path/URL or a PIL image, and labels either as a
    comma-separated string or a list. Returns the image as an array,
    the detections (with masks attached), and a palettized map.
    """
    # Normalize flexible inputs.
    if isinstance(image, str):
        image = load_image(image)
    if isinstance(labels, str):
        labels = labels.split(",")

    detections = detect(object_detector, image, labels, threshold)
    detections = segment(
        processor,
        segmentator,
        image,
        detection_results=detections,
        polygon_refinement=polygon_refinement,
    )

    segmentation_map = plot_segmentation(image, detections)
    return np.array(image), detections, segmentation_map
341
+
342
+
343
if __name__ == "__main__":
    # CLI entry point: segment the given labels in an image and save
    # the colored segmentation map to <output>/segmentation.png.
    parser = argparse.ArgumentParser()
    parser.add_argument("--image", type=str, required=True)
    parser.add_argument("--labels", type=str, nargs="+", required=True)
    parser.add_argument("--output", type=str, default="./", help="Output directory")
    parser.add_argument("--threshold", type=float, default=0.3)
    parser.add_argument(
        "--detector_id", type=str, default="IDEA-Research/grounding-dino-base"
    )
    parser.add_argument("--segmenter_id", type=str, default="facebook/sam-vit-base")
    args = parser.parse_args()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    object_detector, processor, segmentator = prepare_model(
        device=device, detector_id=args.detector_id, segmenter_id=args.segmenter_id
    )

    # Polygon refinement is always on for the CLI to clean up mask holes.
    image_array, detections, seg_map_pil = grounded_segmentation(
        object_detector,
        processor,
        segmentator,
        image=args.image,
        labels=args.labels,
        threshold=args.threshold,
        polygon_refinement=True,
    )

    os.makedirs(args.output, exist_ok=True)
    seg_map_pil.save(os.path.join(args.output, "segmentation.png"))
scripts/grounding_sam2.py ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from https://github.com/Mengmouxu/SceneGen
2
+ # Original license: MIT license
3
+ # Copyright (c) the SceneGen authors
4
+
5
+ import argparse
6
+ import os
7
+ import random
8
+ from dataclasses import dataclass
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
+
11
+ import cv2
12
+ import numpy as np
13
+ import requests
14
+ import torch
15
+ from PIL import Image
16
+ from transformers import AutoModelForMaskGeneration, AutoProcessor, pipeline
17
+ from contextlib import nullcontext
18
+
19
+
20
def create_palette():
    """Build a flat 768-value (256-entry) RGB palette for labels 0-23.

    Entries beyond label 23 are zero-filled, so unknown labels render
    as black.
    """
    # (R, G, B) triplets for labels 0-23, flattened below into the
    # PIL "P"-mode layout [R0, G0, B0, R1, G1, B1, ...].
    label_colors = [
        (0, 0, 0),        # Label 0 (black)
        (255, 0, 0),      # Label 1 (red)
        (0, 255, 0),      # Label 2 (green)
        (0, 0, 255),      # Label 3 (blue)
        (255, 255, 0),    # Label 4 (yellow)
        (255, 0, 255),    # Label 5 (magenta)
        (0, 255, 255),    # Label 6 (cyan)
        (128, 0, 0),      # Label 7 (dark red)
        (0, 128, 0),      # Label 8 (dark green)
        (0, 0, 128),      # Label 9 (dark blue)
        (128, 128, 0),    # Label 10
        (128, 0, 128),    # Label 11
        (0, 128, 128),    # Label 12
        (64, 0, 0),       # Label 13
        (0, 64, 0),       # Label 14
        (0, 0, 64),       # Label 15
        (64, 64, 0),      # Label 16
        (64, 0, 64),      # Label 17
        (0, 64, 64),      # Label 18
        (192, 192, 192),  # Label 19 (light gray)
        (128, 128, 128),  # Label 20 (gray)
        (255, 165, 0),    # Label 21 (orange)
        (75, 0, 130),     # Label 22 (indigo)
        (238, 130, 238),  # Label 23 (violet)
    ]
    flat = [channel for color in label_colors for channel in color]
    # Extend the palette to have 768 values (256 * 3)
    flat.extend([0] * (768 - len(flat)))
    return flat
99
+
100
+
101
# Module-level default palette shared by callers that don't need a copy.
PALETTE = create_palette()
102
+
103
+
104
+ # Result Utils
105
@dataclass
class BoundingBox:
    """Axis-aligned box in pixel coordinates (top-left / bottom-right)."""

    xmin: int
    ymin: int
    xmax: int
    ymax: int

    @property
    def xyxy(self) -> List[float]:
        """Return the box as a flat [xmin, ymin, xmax, ymax] list."""
        return list((self.xmin, self.ymin, self.xmax, self.ymax))
115
+
116
+
117
@dataclass
class DetectionResult:
    """One detection: confidence score, text label, box, and optional mask."""

    score: Optional[float] = None
    label: Optional[str] = None
    box: Optional[BoundingBox] = None
    # Fixed annotation: np.ndarray is the array *type*; np.array is a
    # factory function and is not valid as a type hint.
    mask: Optional[np.ndarray] = None

    @classmethod
    def from_dict(cls, detection_dict: Dict) -> "DetectionResult":
        """Build a DetectionResult from a grounding-DINO pipeline result dict."""
        box = detection_dict["box"]
        return cls(
            score=detection_dict["score"],
            label=detection_dict["label"],
            box=BoundingBox(
                xmin=box["xmin"],
                ymin=box["ymin"],
                xmax=box["xmax"],
                ymax=box["ymax"],
            ),
        )
136
+
137
+
138
+ # Utils
139
def mask_to_polygon(mask: np.ndarray) -> List[List[int]]:
    """Return the vertex list of the largest external contour of a binary mask."""
    # External contours only; compressed point chains are sufficient here.
    contours, _ = cv2.findContours(
        mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    # Keep only the biggest region; smaller blobs are treated as noise.
    biggest = max(contours, key=cv2.contourArea)

    # OpenCV returns shape (N, 1, 2); flatten to plain [x, y] pairs.
    return biggest.reshape(-1, 2).tolist()
152
+
153
+
154
def polygon_to_mask(
    polygon: List[Tuple[int, int]], image_shape: Tuple[int, int]
) -> np.ndarray:
    """Rasterize a polygon into a filled uint8 mask.

    Args:
        polygon: (x, y) vertices of the polygon.
        image_shape: (height, width) of the output mask.

    Returns:
        uint8 mask with the polygon interior set to 255, elsewhere 0.
    """
    canvas = np.zeros(image_shape, dtype=np.uint8)
    vertices = np.array(polygon, dtype=np.int32)
    # Fill with white (255) directly on the canvas in place.
    cv2.fillPoly(canvas, [vertices], color=(255,))
    return canvas
177
+
178
+
179
def load_image(image_str: str) -> Image.Image:
    """Open an image from a URL or a local path and convert it to RGB."""
    if image_str.startswith("http"):
        # Stream the response body straight into PIL without buffering.
        raw = requests.get(image_str, stream=True).raw
        return Image.open(raw).convert("RGB")
    return Image.open(image_str).convert("RGB")
186
+
187
+
188
def get_boxes(results: List["DetectionResult"]) -> List[List[List[float]]]:
    """Collect xyxy boxes from detections, batched for the segmenter.

    Fixed annotation: the parameter is a *list* of DetectionResult (the
    function iterates it), not a single DetectionResult. The extra outer
    list is the single-image batch dimension, which the SAM2 `segment`
    helper flattens back out.
    """
    return [[result.box.xyxy for result in results]]
195
+
196
+
197
def refine_masks(
    masks: torch.BoolTensor, polygon_refinement: bool = False
) -> List[np.ndarray]:
    """Convert (N, C, H, W) mask tensors into a list of uint8 HxW masks.

    Channels are averaged and thresholded at 0. With
    ``polygon_refinement`` each mask is replaced by its filled largest
    contour, removing holes and stray blobs.
    """
    channel_mean = masks.cpu().float().permute(0, 2, 3, 1).mean(axis=-1)
    binary = (channel_mean > 0).int().numpy().astype(np.uint8)
    refined = list(binary)

    if polygon_refinement:
        refined = [
            polygon_to_mask(mask_to_polygon(m), m.shape) for m in refined
        ]

    return refined
215
+
216
+
217
+ # Post-processing Utils
218
def generate_colored_segmentation(label_image):
    """Wrap a 2-D label array in a palettized ("P"-mode) PIL image."""
    paletted = Image.fromarray(label_image.astype(np.uint8), mode="P")
    # Attach the shared label palette so each label id maps to its color.
    paletted.putpalette(create_palette())
    return paletted
227
+
228
+
229
def plot_segmentation(image, detections):
    """Render detections as a palettized label map sized like ``image``.

    Pixel value i marks the i-th detection (1-based); later detections
    overwrite earlier ones where their masks overlap.
    """
    # PIL size is (W, H); the numpy map wants (H, W).
    label_map = np.zeros(image.size[::-1], dtype=np.uint8)
    for index, detection in enumerate(detections, start=1):
        label_map[detection.mask > 0] = index
    return generate_colored_segmentation(label_map)
236
+
237
+
238
+ # Grounded SAM
239
def prepare_model(
    device: str = "cuda",
    detector_id: Optional[str] = None,
    segmenter_id: Optional[str] = None,
):
    """Load the zero-shot detector and the SAM mask-generation model.

    Defaults to "IDEA-Research/grounding-dino-tiny" and
    "facebook/sam-vit-base" when ids are omitted; both models are
    placed on ``device``. Returns (object_detector, processor,
    segmentator).
    """
    detector_id = (
        detector_id if detector_id is not None else "IDEA-Research/grounding-dino-tiny"
    )
    object_detector = pipeline(
        model=detector_id, task="zero-shot-object-detection", device=device
    )

    segmenter_id = segmenter_id if segmenter_id is not None else "facebook/sam-vit-base"
    processor = AutoProcessor.from_pretrained(segmenter_id)
    segmentator = AutoModelForMaskGeneration.from_pretrained(segmenter_id).to(device)

    return object_detector, processor, segmentator
256
+
257
+
258
def detect(
    object_detector: Any,
    image: Image.Image,
    labels: List[str],
    threshold: float = 0.3,
) -> List[Dict[str, Any]]:
    """
    Use Grounding DINO to detect a set of labels in an image in a zero-shot fashion.
    """
    # Grounding DINO prompts are expected to end with a period.
    prompts = [label if label.endswith(".") else label + "." for label in labels]

    raw_results = object_detector(image, candidate_labels=prompts, threshold=threshold)
    return [DetectionResult.from_dict(item) for item in raw_results]
273
+
274
+
275
def segment(
    predictor: Any,
    image: Image.Image,
    boxes: Optional[List[List[List[float]]]] = None,
    detection_results: Optional[List[Dict[str, Any]]] = None,
    polygon_refinement: bool = False,
) -> List[DetectionResult]:
    """
    Use SAM2 predictor to generate masks given an image + a set of bounding boxes.

    Either ``boxes`` or ``detection_results`` must be supplied. One mask
    is predicted per box (multimask_output=False) and written back onto
    the detections in order.
    """

    if detection_results is None and boxes is None:
        raise ValueError("Either detection_results or detection_boxes must be provided.")

    # Build boxes from detections if not provided
    if boxes is None:
        boxes = get_boxes(detection_results)
        # Flatten potential [[...], ...] -> [...] (get_boxes adds a batch dim)
        if isinstance(boxes, list) and len(boxes) == 1 and isinstance(boxes[0], list):
            boxes = boxes[0]

    # Ensure image is a numpy RGB array (H, W, 3)
    if isinstance(image, Image.Image):
        np_image = np.array(image.convert("RGB"))
    else:
        np_image = np.array(image)

    # Resolve device: prefer predictor.device, then its wrapped model's
    # parameters, finally fall back to CUDA-if-available.
    device = getattr(predictor, "device", None)
    if device is None:
        model = getattr(predictor, "model", None)
        if model is not None:
            device = next(model.parameters()).device
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Prepare autocast context only for CUDA
    amp_ctx = torch.autocast("cuda", dtype=torch.bfloat16) if device.type == "cuda" else nullcontext()

    # Run predictor
    with torch.inference_mode():
        with amp_ctx:
            predictor.set_image(np_image)

            # Boxes to tensor
            boxes_t = torch.tensor(boxes, dtype=torch.float32, device=device)
            # Transform boxes if predictor exposes a transform like SAM/SAM2
            if hasattr(predictor, "transform") and hasattr(predictor.transform, "apply_boxes_torch"):
                boxes_in = predictor.transform.apply_boxes_torch(boxes_t, np_image.shape[:2])
            else:
                boxes_in = boxes_t

            # Predict masks for boxes; request single mask per box
            masks, scores, _ = predictor.predict(
                box=boxes_in,
                multimask_output=False
            )

    # Normalize masks to numpy [N, H, W] boolean
    if isinstance(masks, torch.Tensor):
        masks_np = masks.detach().cpu().numpy()
    else:
        masks_np = np.asarray(masks)

    if masks_np.ndim == 4 and masks_np.shape[1] == 1:
        masks_np = masks_np[:, 0]  # [N, 1, H, W] -> [N, H, W]
    masks_np = (masks_np > 0).astype(np.uint8)

    # Reuse refine_masks to optionally polygon-refine
    masks_torch = torch.from_numpy(masks_np).unsqueeze(1).to(torch.bool)  # [N,1,H,W]
    masks_list = refine_masks(masks_torch, polygon_refinement)

    if detection_results is None:
        detection_results = [DetectionResult() for _ in masks_list]

    # NOTE(review): assumes detections and masks align one-to-one; zip
    # silently truncates if their lengths differ — confirm upstream.
    for detection_result, mask in zip(detection_results, masks_list):
        detection_result.mask = mask

    return detection_results
threeDFixer/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from . import models
7
+ from . import modules
8
+ from . import pipelines
9
+ from . import renderers
10
+ from . import representations
11
+ from . import utils
threeDFixer/datasets/__init__.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is modified from TRELLIS:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+ # Modifications Copyright (c) 2026 Ze-Xin Yin, Robot labs of Horizon Robotics, and D-Robotics.
6
+
7
+ import importlib
8
+
9
+ __attributes = {
10
+ 'SparseStructure': 'sparse_structure',
11
+
12
+ 'SparseFeat2Render': 'sparse_feat2render',
13
+ 'SLat2Render':'structured_latent2render',
14
+ 'Slat2RenderGeo':'structured_latent2render',
15
+
16
+ 'SparseStructureLatent': 'sparse_structure_latent',
17
+ 'TextConditionedSparseStructureLatent': 'sparse_structure_latent',
18
+ 'ImageConditionedSparseStructureLatent': 'sparse_structure_latent',
19
+
20
+ 'SLat': 'structured_latent',
21
+ 'TextConditionedSLat': 'structured_latent',
22
+ 'ImageConditionedSLat': 'structured_latent',
23
+
24
+ 'ImageConditionedSparseStructureLatentRandRot': 'sparse_structure_latent_random_rot',
25
+ 'ImageConditionedSLatRandRot': 'structured_latent_random_rot',
26
+ 'SparseFeat2RenderRandRot': 'sparse_feat2render_random_rot',
27
+ 'Slat2RenderGeoRandRot': 'structured_latent2render_random_rot',
28
+
29
+ 'ObjectImageConditionedSparseStructureVoxel': 'scene_sparse_structure_latent_obj_pretrain',
30
+ 'SceneImageConditionedVoxel': 'scene_sparse_structure_latent',
31
+ 'SceneConditionedSLat': 'scene_structured_latent',
32
+ }
33
+
34
+ __submodules = []
35
+
36
+ __all__ = list(__attributes.keys()) + __submodules
37
+
38
def __getattr__(name):
    """Lazily resolve dataset classes/submodules on first access (PEP 562).

    Keeps `import threeDFixer.datasets` cheap: the heavy dataset modules are
    only imported when one of the names in `__attributes`/`__submodules` is
    actually requested.
    """
    # After the first resolution the object is cached in globals(), so this
    # hook is never hit again for the same name.
    if name not in globals():
        if name in __attributes:
            # Public class name -> defining submodule; import it and pull the
            # attribute out.
            module_name = __attributes[name]
            module = importlib.import_module(f".{module_name}", __name__)
            globals()[name] = getattr(module, name)
        elif name in __submodules:
            # Whole submodules are exposed as-is.
            module = importlib.import_module(f".{name}", __name__)
            globals()[name] = module
        else:
            raise AttributeError(f"module {__name__} has no attribute {name}")
    return globals()[name]
50
+
51
+
52
+ # For Pylance
53
+ if __name__ == '__main__':
54
+ from .sparse_structure import SparseStructure
55
+
56
+ from .sparse_feat2render import SparseFeat2Render
57
+ from .structured_latent2render import (
58
+ SLat2Render,
59
+ Slat2RenderGeo,
60
+ )
61
+
62
+ from .sparse_structure_latent import (
63
+ SparseStructureLatent,
64
+ TextConditionedSparseStructureLatent,
65
+ ImageConditionedSparseStructureLatent,
66
+ )
67
+
68
+ from .structured_latent import (
69
+ SLat,
70
+ TextConditionedSLat,
71
+ ImageConditionedSLat,
72
+ )
73
+
74
+ # rot mesh
75
+ from .sparse_structure_latent_random_rot import (
76
+ ImageConditionedSparseStructureLatentRandRot
77
+ )
78
+
79
+ # rot SLAT
80
+ from .structured_latent_random_rot import (
81
+ ImageConditionedSLatRandRot
82
+ )
83
+
84
+ # VAE gs dec
85
+ from .sparse_feat2render_random_rot import (
86
+ SparseFeat2RenderRandRot
87
+ )
88
+
89
+ # VAE mesh dec
90
+ from .structured_latent2render_random_rot import (
91
+ Slat2RenderGeoRandRot
92
+ )
93
+
94
+ # object-level pre-training
95
+ from .scene_sparse_structure_latent_obj_pretrain import (
96
+ ObjectImageConditionedSparseStructureVoxel
97
+ )
98
+
99
+ # scene-level training dataloader for stage 1
100
+ from .scene_sparse_structure_latent import (
101
+ SceneImageConditionedVoxel
102
+ )
103
+
104
+ # scene-level training dataloader for stage 2
105
+ from .scene_structured_latent import (
106
+ SceneConditionedSLat
107
+ )
threeDFixer/datasets/utils.py ADDED
@@ -0,0 +1,631 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: 2026 Ze-Xin Yin, Robot labs of Horizon Robotics, and D-Robotics
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ # See the LICENSE file in the project root for full license information.
4
+
5
+ import os
6
+ import json
7
+ import cv2
8
+ import torch
9
+ from PIL import Image
10
+ import imageio
11
+ import numpy as np
12
+ import open3d as o3d
13
+ from einops import rearrange
14
+
15
def voxelize_mesh(points, faces, clip_range_first=False, return_mask=True, resolution=64):
    """Voxelize a triangle mesh into a dense occupancy grid over [-0.5, 0.5]^3.

    Args:
        points: (N, 3) vertex positions, expected inside [-0.5, 0.5]^3.
        faces: (M, 3) triangle indices (array-like or an Open3D Vector3iVector).
        clip_range_first: clamp vertices into the unit cube before voxelizing.
        return_mask: also return the grid regrouped into 4x4x4 sub-blocks.
        resolution: grid side length (must be divisible by 4 when
            return_mask=True, because of the fixed 4x4x4 regrouping).

    Returns:
        ss: (1, R, R, R) long occupancy grid, or (ss, ss_mask) when
        return_mask=True, with ss_mask of shape (64, R/4, R/4, R/4).
    """
    if clip_range_first:
        points = np.clip(points, -0.5 + 1e-6, 0.5 - 1e-6)
    mesh = o3d.geometry.TriangleMesh()
    mesh.vertices = o3d.utility.Vector3dVector(points)
    # Use the portable o3d.utility alias: o3d.cuda.pybind.* only exists on
    # CUDA builds of Open3D and raised AttributeError on CPU-only installs.
    if isinstance(faces, o3d.utility.Vector3iVector):
        mesh.triangles = faces
    else:
        mesh.triangles = o3d.utility.Vector3iVector(faces)
    # Voxel size follows `resolution` (the original hard-coded 1/64 and 64,
    # silently ignoring the parameter); default resolution=64 is unchanged.
    voxel_grid = o3d.geometry.VoxelGrid.create_from_triangle_mesh_within_bounds(
        mesh, voxel_size=1 / resolution,
        min_bound=(-0.5, -0.5, -0.5), max_bound=(0.5, 0.5, 0.5))
    vertices = np.array([voxel.grid_index for voxel in voxel_grid.get_voxels()])
    assert np.all(vertices >= 0) and np.all(vertices < resolution), "Some vertices are out of bounds"
    # Voxel centers in [-0.5, 0.5)^3, then back to integer grid coordinates.
    vertices = (vertices + 0.5) / resolution - 0.5
    coords = ((torch.tensor(vertices) + 0.5) * resolution).int().contiguous()
    ss = torch.zeros(1, resolution, resolution, resolution, dtype=torch.long)
    ss[:, coords[:, 0], coords[:, 1], coords[:, 2]] = 1
    if return_mask:
        # Regroup into 4x4x4 interleaved sub-grids (64 channels).
        ss_mask = rearrange(ss, 'c (x n1) (y n2) (z n3) -> (n1 n2 n3 c) x y z', n1=4, n2=4, n3=4).float()
        return ss, ss_mask
    else:
        return ss
36
+
37
def transform_vertices(vertices, ops, params):
    """Apply a sequence of named affine operations to vertices.

    Each entry of `ops` is either 'scale' (multiply by the matching param)
    or 'translation' (add the matching param), applied in order.
    """
    result = vertices
    for op_name, value in zip(ops, params):
        if op_name == 'scale':
            result = result * value
        elif op_name == 'translation':
            result = result + value
        else:
            raise NotImplementedError
    return result
46
+
47
def normalize_vertices(vertices, scale_factor=1.0):
    """Center vertices at the origin and scale the longest axis extent.

    With scale_factor=1.0 the result fits [-0.5, 0.5]; 2.0 fits [-0.25, 0.25].

    Returns:
        (normalized_vertices, translation, scale) so the transform can be
        inverted later.
    """
    lo = np.min(vertices, axis=0)
    hi = np.max(vertices, axis=0)
    trans_pos = (lo + hi)[None] / 2.0
    scale_pos = np.max(hi - lo) * scale_factor
    # Translate to the center, then divide by the (epsilon-guarded) extent.
    normalized = (vertices - trans_pos) * (1.0 / (scale_pos + 1e-6))
    return normalized, trans_pos, scale_pos
55
+
56
def renormalize_vertices(vertices, val_range=0.5, scale_factor=1.25):
    """Re-center/re-scale vertices only if any coordinate leaves ±val_range.

    In-range inputs are returned untouched; out-of-range inputs are centered
    and divided by scale_factor times the longest extent.
    """
    lo = np.min(vertices, axis=0)
    hi = np.max(vertices, axis=0)
    out_of_range = (lo < -val_range).any() or (hi > val_range).any()
    if out_of_range:
        center = (lo + hi)[None] / 2.0
        extent = np.max(hi - lo) * scale_factor
        vertices = (vertices - center) * (1.0 / (extent + 1e-6))
    return vertices
64
+
65
def rot_vertices(vertices, rot_angles, axis_list=['z']):
    """Rotate vertices about the origin by Euler angles applied in order.

    Each (angle, axis) pair from (rot_angles, axis_list) is applied as a
    single-axis rotation via Open3D, in sequence.
    """
    cloud = o3d.geometry.PointCloud()
    cloud.points = o3d.utility.Vector3dVector(vertices)
    for angle, axis in zip(rot_angles, axis_list):
        if axis == 'x':
            euler = (angle, 0, 0)
        elif axis == 'y':
            euler = (0, angle, 0)
        elif axis == 'z':
            euler = (0, 0, angle)
        else:
            raise NotImplementedError
        rotation = cloud.get_rotation_matrix_from_xyz(euler)
        cloud.rotate(rotation, center=(0., 0., 0.))
    return np.array(cloud.points)
86
+
87
+ def _rotmat_x(a: torch.Tensor) -> torch.Tensor:
88
+ # a: scalar tensor
89
+ ca, sa = torch.cos(a), torch.sin(a)
90
+ R = torch.stack([
91
+ torch.stack([torch.ones_like(a), torch.zeros_like(a), torch.zeros_like(a)]),
92
+ torch.stack([torch.zeros_like(a), ca, -sa]),
93
+ torch.stack([torch.zeros_like(a), sa, ca]),
94
+ ])
95
+ return R # [3,3]
96
+
97
+ def _rotmat_y(a: torch.Tensor) -> torch.Tensor:
98
+ ca, sa = torch.cos(a), torch.sin(a)
99
+ R = torch.stack([
100
+ torch.stack([ca, torch.zeros_like(a), sa]),
101
+ torch.stack([torch.zeros_like(a), torch.ones_like(a), torch.zeros_like(a)]),
102
+ torch.stack([-sa, torch.zeros_like(a), ca]),
103
+ ])
104
+ return R
105
+
106
+ def _rotmat_z(a: torch.Tensor) -> torch.Tensor:
107
+ ca, sa = torch.cos(a), torch.sin(a)
108
+ R = torch.stack([
109
+ torch.stack([ca, -sa, torch.zeros_like(a)]),
110
+ torch.stack([sa, ca, torch.zeros_like(a)]),
111
+ torch.stack([torch.zeros_like(a), torch.zeros_like(a), torch.ones_like(a)]),
112
+ ])
113
+ return R
114
+
115
def rot_vertices_torch(vertices, rot_angles, axis_list=('z',), center=(0.0, 0.0, 0.0)):
    """Torch port of `rot_vertices`: rotate points about `center`.

    Args:
        vertices: (N,3) numpy array or torch tensor.
        rot_angles: angles in radians, one per entry of axis_list.
        axis_list: axes ('x'/'y'/'z') applied in order.
        center: rotation pivot (defaults to the origin, matching Open3D use).

    Returns:
        torch.Tensor of shape (N,3).
    """
    pts = torch.as_tensor(vertices)
    device, dtype = pts.device, pts.dtype

    pivot = torch.tensor(center, device=device, dtype=dtype).view(1, 3)
    pts = pts - pivot  # move pivot to the origin

    # Apply rotations sequentially; Open3D treats points as row vectors,
    # so each step is v <- v @ R^T.
    builders = {'x': _rotmat_x, 'y': _rotmat_y, 'z': _rotmat_z}
    for angle, axis in zip(rot_angles, axis_list):
        make_rot = builders.get(axis)
        if make_rot is None:
            raise NotImplementedError(f"Unknown axis {axis}")
        rot = make_rot(torch.as_tensor(angle, device=device, dtype=dtype))
        pts = pts @ rot.T

    return pts + pivot
147
+
148
def get_instance_mask(instance_mask_path):
    """Decode a 16-bit instance-index image into per-pixel object ids.

    Ids were packed as id/100 * 65535 at render time (hard-coded cap of
    100 objects), so decoding rescales and rounds back to integer ids.

    Returns:
        (index_mask, instance_list): float id map and the unique uint8 ids.
    """
    raw = imageio.v3.imread(instance_mask_path)
    index_mask = np.rint(raw.astype(np.float32) / 65535 * 100.0)
    instance_list = np.unique(index_mask).astype(np.uint8)
    return index_mask, instance_list
153
+
154
def get_gt_depth(gt_depth_path, metadata):
    """Load a 16-bit normalized depth image and map it back to metric depth.

    The image stores depth normalized to [0,1]; metadata['depth'] carries
    the original min/max used for normalization.
    """
    normalized = imageio.v3.imread(gt_depth_path).astype(np.float32) / 65535.
    depth_min = metadata['depth']['min']
    depth_max = metadata['depth']['max']
    metric = normalized * (depth_max - depth_min) + depth_min
    return torch.from_numpy(metric).to(dtype=torch.float32)
159
+
160
def get_est_depth(est_depth_path):
    """Load an estimated-depth .npz (keys 'depth', 'mask').

    NaN/Inf depth pixels are zeroed and removed from the validity mask.

    Returns:
        (depth, mask): float32 torch depth and numpy boolean mask.
    """
    data = np.load(est_depth_path)
    depth = torch.from_numpy(data['depth']).to(dtype=torch.float32)
    bad = torch.logical_or(torch.isnan(depth), torch.isinf(depth))
    mask = np.logical_and(data['mask'], ~bad.detach().cpu().numpy())
    depth = torch.where(bad, 0.0, depth)
    return depth, mask
169
+
170
def get_mix_est_depth(est_depth_path, image_size):
    """Load estimated depth from heterogeneous depth estimators.

    The estimator is inferred from the file path ('MoGe', 'DAv2_',
    'ml-depth-pro', 'VGGT_1B'); each branch returns (depth, mask) where
    depth is a float32 torch tensor with NaN/Inf zeroed and mask is a numpy
    boolean validity mask. Paths matching none of the markers fall through
    and return None implicitly.

    Args:
        est_depth_path: path to the .npz produced by the estimator.
        image_size: target side length (only the VGGT branch resizes).
    """
    if 'MoGe' in est_depth_path:
        # MoGe ships its own validity mask alongside the depth.
        npz = np.load(est_depth_path)
        est_depth = npz['depth']
        est_depth_mask = npz['mask']
        est_depth = torch.from_numpy(est_depth).to(dtype=torch.float32)
        ivalid_mask = torch.logical_or(torch.isnan(est_depth), torch.isinf(est_depth))
        est_depth_mask = np.logical_and(est_depth_mask, ~ivalid_mask.detach().cpu().numpy())
        est_depth = torch.where(ivalid_mask, 0.0, est_depth)
        return est_depth, est_depth_mask
    elif 'DAv2_' in est_depth_path or 'ml-depth-pro' in est_depth_path:
        # These estimators provide no mask: derive one from finite pixels.
        npz = np.load(est_depth_path)
        est_depth = npz['depth']
        est_depth_mask = np.logical_not(np.logical_or(
            np.isnan(est_depth),
            np.isinf(est_depth),
        ))
        est_depth = torch.from_numpy(est_depth).to(dtype=torch.float32)
        ivalid_mask = torch.logical_or(torch.isnan(est_depth), torch.isinf(est_depth))
        est_depth_mask = np.logical_and(est_depth_mask, ~ivalid_mask.detach().cpu().numpy())
        est_depth = torch.where(ivalid_mask, 0.0, est_depth)
        return est_depth, est_depth_mask
    elif 'VGGT_1B' in est_depth_path:
        # VGGT: threshold its confidence map, then resize depth and mask to
        # image_size x image_size with NEAREST (depth is normalized to [0,1]
        # for the resize and de-normalized afterwards).
        npz = np.load(est_depth_path)
        est_depth = npz['depth']
        est_depth_mask = npz['depth_conf'] > 2.0  # confidence cutoff (hand-tuned)
        valid_depth_mask = np.logical_not(np.logical_or(
            np.isnan(est_depth),
            np.isinf(est_depth),
        ))
        est_depth_mask = np.logical_and(
            est_depth_mask,
            valid_depth_mask
        )
        est_depth = np.where(valid_depth_mask, est_depth, 0.0)

        depth_min, depth_max = np.min(est_depth), np.max(est_depth)
        est_depth = (est_depth - depth_min) / (depth_max - depth_min + 1e-6)
        est_depth = Image.fromarray(est_depth)
        est_depth = est_depth.resize((image_size, image_size), Image.Resampling.NEAREST)
        est_depth = torch.tensor(np.array(est_depth)).to(dtype=torch.float32)
        est_depth = est_depth * (depth_max - depth_min) + depth_min

        est_depth_mask = Image.fromarray(est_depth_mask.astype(np.float32))
        est_depth_mask = est_depth_mask.resize((image_size, image_size), Image.Resampling.NEAREST)
        est_depth_mask = np.array(est_depth_mask) > 0.5

        ivalid_mask = torch.logical_or(torch.isnan(est_depth), torch.isinf(est_depth))
        est_depth_mask = np.logical_and(est_depth_mask, ~ivalid_mask.detach().cpu().numpy())
        est_depth = torch.where(ivalid_mask, 0.0, est_depth)
        return est_depth, est_depth_mask
221
+
222
def lstsq_align_depth(est_depth, gt_depth, mask):
    """Scale-align estimated depth to ground truth via least squares.

    Solves for the single scalar s minimizing ||s * est - gt|| over the
    pixels selected by `mask`; returns est_depth * s. Falls back to s=1.0
    when the mask selects no pixels.
    """
    coords = torch.nonzero(mask)
    if coords.shape[0] == 0:
        return est_depth * 1.0
    gt_vals = gt_depth[coords[:, 0], coords[:, 1]]
    est_vals = est_depth[coords[:, 0], coords[:, 1]]
    solution = torch.linalg.lstsq(est_vals[None, :, None], gt_vals[None, :, None]).solution
    return est_depth * solution.item()
232
+
233
def get_cam_poses(frame_info, H, W):
    """Build pinhole intrinsics and camera-to-world pose from a frame entry.

    Args:
        frame_info: dict with 'camera_angle_x' (horizontal FoV, radians) and
            'transform_matrix' (4x4 camera-to-world), transforms.json style.
        H, W: image height and width in pixels.

    Returns:
        (K, c2w): float32 torch tensors, K of shape (3,3).
    """
    fov_x = float(frame_info['camera_angle_x'])
    # Focal length from the horizontal field of view.
    focal = 0.5 * W / np.tan(0.5 * fov_x)
    intrinsics = np.array([
        [focal, 0, 0.5 * W],
        [0, focal, 0.5 * H],
        [0, 0, 1],
    ])
    K = torch.from_numpy(intrinsics).float()
    c2w = torch.from_numpy(np.array(frame_info['transform_matrix'])).float()
    return K, c2w
244
+
245
def edge_mask_morph_gradient(mask, kernel, iterations=1):
    """Extract a binary edge band from a mask via the morphological gradient.

    Args:
        mask: HxW bool/uint8 foreground mask.
        kernel: OpenCV structuring element; a larger kernel (or more
            iterations) yields a thicker edge band.
        iterations: number of dilate/erode passes.

    Returns:
        uint8 edge mask with values in {0, 1}.
    """
    # Binarize to {0,1} regardless of input dtype.
    m = (mask.astype(np.uint8) > 0).astype(np.uint8)

    # Gradient = dilation - erosion; zero border padding keeps the image
    # boundary from being reported as an edge.
    dil = cv2.dilate(m, kernel, iterations=iterations, borderType=cv2.BORDER_CONSTANT, borderValue=0.0)
    ero = cv2.erode(m, kernel, iterations=iterations, borderType=cv2.BORDER_CONSTANT, borderValue=0.0)

    edge = (dil - ero)  # per-pixel 0 or 1 (dilation >= erosion for binary input)
    edge = (edge > 0).astype(np.uint8)
    return edge
259
+
260
def process_scene_image(image: Image.Image, instance_mask: np.ndarray, image_size: int,
                        resize_perturb: bool = False, resize_perturb_ratio: float = 0.0):
    """Resize a scene render and its instance-restricted alpha to a square.

    Args:
        image: PIL image, ideally RGBA; without an alpha channel the whole
            frame is treated as opaque.
        instance_mask: HxW mask ANDed into the alpha channel.
        image_size: output side length in pixels.
        resize_perturb: if True, with probability `resize_perturb_ratio`
            apply a down-then-up resampling (random resolution in
            [32, image_size)) as a blur/aliasing augmentation.
        resize_perturb_ratio: probability of applying that perturbation.

    Returns:
        (img_t, img4): RGB tensor (3,S,S) in [0,1] and RGBA tensor (4,S,S).
    """
    image_rgba = image
    try:
        alpha = np.array(image_rgba.getchannel("A")) > 0
    except ValueError:
        # No alpha channel: every pixel counts as foreground.
        alpha = np.ones_like(np.array(image_rgba.getchannel(0))) > 0
    alpha = np.logical_and(alpha, instance_mask).astype(np.uint8) * 255

    # LANCZOS for the color image, NEAREST for the (binary) alpha.
    image_resized = image_rgba.resize((image_size, image_size), Image.Resampling.LANCZOS).convert("RGB")
    alpha_resized = Image.fromarray(alpha, mode="L").resize((image_size, image_size), Image.Resampling.NEAREST)

    if resize_perturb and np.random.rand() < resize_perturb_ratio:
        rand_reso = np.random.randint(32, image_size)

        image_resized = image_resized.resize((rand_reso, rand_reso), Image.Resampling.LANCZOS)
        image_resized = image_resized.resize((image_size, image_size), Image.Resampling.LANCZOS)

        alpha_resized = alpha_resized.resize((rand_reso, rand_reso), Image.Resampling.NEAREST)
        alpha_resized = alpha_resized.resize((image_size, image_size), Image.Resampling.NEAREST)

    img_np = np.array(image_resized, dtype=np.uint8)
    img_t = torch.from_numpy(img_np).permute(2, 0, 1).float() / 255.0

    a_np = np.array(alpha_resized, dtype=np.uint8)
    a_t = torch.from_numpy(a_np).unsqueeze(0).float() / 255.0
    img4 = torch.cat([img_t, a_t], dim=0)  # (4,S,S)
    return img_t, img4
288
+
289
def get_rays(i, j, K, c2w):
    """Back-project pixel coordinates into world-space rays.

    Camera convention: x right, y up, looking down -z (OpenGL/NeRF style).

    Args:
        i, j: pixel column and row coordinate tensors (any shape).
        K: 3x3 intrinsics (indexable as K[r][c]).
        c2w: camera-to-world matrix; uses [:3,:3] and the last column.

    Returns:
        (rays_o, rays_d): origins and unnormalized directions, shape (...,3).
    """
    u = i.float() + 0.5  # sample at pixel centers
    v = j.float() + 0.5
    dirs = torch.stack([
        (u - K[0][2]) / K[0][0],
        -(v - K[1][2]) / K[1][1],
        -torch.ones_like(u),
    ], -1)
    # Rotate directions to world frame; equivalent to R @ dir per pixel.
    rays_d = torch.sum(dirs[..., None, :] * c2w[:3, :3], -1)
    # Every ray starts at the camera center.
    rays_o = c2w[:3, -1].expand(rays_d.shape)
    return rays_o, rays_d
298
+
299
def get_rays_fast(u: torch.Tensor, v: torch.Tensor, K: torch.Tensor, c2w: torch.Tensor):
    """Vectorized ray generation for flat pixel-coordinate tensors.

    Args:
        u, v: 1D pixel coordinates (integer or float dtype).
        K: (3,3) intrinsics (a larger matrix works; only the 3x3 block is read).
        c2w: (4,4) or (3,4) camera-to-world; uses [:3,:3] and [:3,3].

    Returns:
        (rays_o, rays_d): (N,3) origins and unnormalized directions.
    """
    # Shift to pixel centers and ensure float math.
    uu = u.to(dtype=torch.float32) + 0.5
    vv = v.to(dtype=torch.float32) + 0.5

    fx, fy = K[0, 0], K[1, 1]
    cx, cy = K[0, 2], K[1, 2]

    # Camera-frame directions (x right, y up, looking down -z).
    dirs = torch.stack([
        (uu - cx) / fx,
        -(vv - cy) / fy,
        -torch.ones_like(uu),
    ], dim=-1)

    # World frame via the row-vector convention: dirs @ R^T.
    rotation = c2w[:3, :3]
    rays_d = dirs @ rotation.T

    # Origin: camera center broadcast to (N,3).
    translation = c2w[:3, 3]
    rays_o = translation.expand_as(rays_d)
    return rays_o, rays_d
328
+
329
def process_instance_image(image: Image.Image, instance_mask: np.ndarray, color_mask: np.ndarray, depth_map: torch.Tensor,
                           K: torch.Tensor, c2w: torch.Tensor, image_size: int):
    """Crop one instance from a render and gather its per-pixel rays.

    The instance's alpha defines a square crop (bbox padded by 1.2x) that is
    resized to image_size; rays/depth/color are sampled at the ORIGINAL
    (uncropped) foreground pixels.

    Returns:
        img_t: (3,S,S) RGB crop in [0,1].
        a_t: (1,S,S) alpha crop in [0,1].
        rays_o, rays_d: rays for every original foreground pixel.
        rays_color: color_mask values at those pixels (float32).
        rays_t: depth_map values at those pixels.
    """
    image_rgba = image
    try:
        alpha = np.asarray(image_rgba.getchannel("A")) > 0
    except ValueError:
        # No alpha channel: treat the whole frame as opaque.
        alpha = np.ones_like(np.array(image_rgba.getchannel(0))) > 0
    alpha = np.logical_and(alpha, instance_mask).astype(np.uint8) * 255
    valid_mask = np.array(alpha).nonzero()  # (rows, cols) of foreground pixels

    # Square crop box around the foreground, padded by aug_size_ratio.
    bbox = [valid_mask[1].min(), valid_mask[0].min(), valid_mask[1].max(), valid_mask[0].max()]
    center = [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]
    hsize = max(bbox[2] - bbox[0], bbox[3] - bbox[1]) / 2
    aug_size_ratio = 1.2
    aug_hsize = hsize * aug_size_ratio
    aug_center_offset = [0, 0]
    aug_center = [center[0] + aug_center_offset[0], center[1] + aug_center_offset[1]]
    aug_bbox = [int(aug_center[0] - aug_hsize), int(aug_center[1] - aug_hsize), int(aug_center[0] + aug_hsize), int(aug_center[1] + aug_hsize)]

    # Rays are generated at the original pixel grid, not the resized crop.
    i, j = torch.from_numpy(valid_mask[1]), torch.from_numpy(valid_mask[0])
    rays_o, rays_d = get_rays(i, j, K, c2w)
    rays_color = color_mask[valid_mask[0], valid_mask[1]].astype(np.float32)
    rays_t = depth_map[valid_mask[0], valid_mask[1]]

    image_resized = image_rgba.crop(aug_bbox).convert("RGB").resize((image_size, image_size), Image.Resampling.LANCZOS)
    alpha_resized = Image.fromarray(alpha, mode="L").crop(aug_bbox).resize((image_size, image_size), Image.Resampling.NEAREST)

    img_np = np.asarray(image_resized, dtype=np.uint8)
    img_t = torch.from_numpy(img_np).permute(2, 0, 1).float() / 255.0

    a_np = np.asarray(alpha_resized, dtype=np.uint8)
    a_t = torch.from_numpy(a_np).unsqueeze(0).float() / 255.0
    return img_t, a_t, rays_o, rays_d, rays_color, rays_t
362
+
363
def get_crop_area_rays(image: Image.Image, instance_mask: np.ndarray, K: torch.Tensor, c2w: torch.Tensor, image_size):
    """Generate a ray grid covering the (padded) bounding box of a mask.

    The alpha/instance mask is used ONLY to compute the crop box; rays are
    sampled on an image_size x image_size grid spanning that box.

    Returns:
        (rays_o, rays_d) for the sampled grid.
    """
    alpha = np.asarray(image.getchannel("A")) > 0
    if instance_mask is not None:
        alpha = np.logical_and(alpha, instance_mask).astype(np.float32)  # * 255
    else:
        alpha = alpha.astype(np.float32)
    valid_mask = np.array(alpha).nonzero()

    # Square box around the foreground, padded by aug_size_ratio.
    bbox = [valid_mask[1].min(), valid_mask[0].min(), valid_mask[1].max(), valid_mask[0].max()]
    center = [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]
    hsize = max(bbox[2] - bbox[0], bbox[3] - bbox[1]) / 2
    aug_size_ratio = 1.2
    aug_hsize = hsize * aug_size_ratio
    aug_center_offset = [0, 0]
    aug_center = [center[0] + aug_center_offset[0], center[1] + aug_center_offset[1]]
    aug_bbox = [int(aug_center[0] - aug_hsize), int(aug_center[1] - aug_hsize), int(aug_center[0] + aug_hsize), int(aug_center[1] + aug_hsize)]

    # NOTE(review): relies on torch.meshgrid's default 'ij' indexing (emits a
    # deprecation warning on recent torch) — confirm the intended axis order.
    i, j = torch.meshgrid(
        torch.linspace(aug_bbox[0], aug_bbox[2]-1, steps=image_size),
        torch.linspace(aug_bbox[1], aug_bbox[3]-1, steps=image_size)
    )
    rays_o, rays_d = get_rays(i, j, K, c2w)
    return rays_o, rays_d
387
+
388
def process_instance_image_crop(image: Image.Image, instance_mask: np.ndarray, color_mask: np.ndarray,
                                depth_map: torch.Tensor,
                                gt_depth_map: torch.Tensor,
                                K: torch.Tensor, c2w: torch.Tensor, image_size: int,
                                edge_mask_morph_gradient_fn):
    """Crop an instance and resample image/alpha/depth/color into the crop.

    Unlike `process_instance_image`, rays and depth here live on the RESIZED
    crop grid (image_size x image_size), and an edge band around the alpha is
    down-weighted to 0.5 in `rays_color`.

    Args:
        edge_mask_morph_gradient_fn: callable producing a {0,1} edge band
            from a binary mask (e.g. a partial of edge_mask_morph_gradient).

    Returns:
        img_t (3,S,S), a_t (1,S,S), fg_mask (uint8, edge removed),
        rays_o/rays_d for the crop grid, rays_color (1.0 interior /
        0.5 edge / 0.0 background), rays_t (resampled depth tensor),
        valid_mask (nonzero indices of fg_mask), and the resized depth /
        GT depth / color-mask PIL images.
    """
    image_rgba = image
    alpha = np.asarray(image_rgba.getchannel("A")) > 0
    alpha = np.logical_and(alpha, instance_mask).astype(np.float32)  # * 255
    valid_mask = np.array(alpha).nonzero()

    # Square crop box around the foreground, padded by aug_size_ratio.
    bbox = [valid_mask[1].min(), valid_mask[0].min(), valid_mask[1].max(), valid_mask[0].max()]
    center = [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]
    hsize = max(bbox[2] - bbox[0], bbox[3] - bbox[1]) / 2
    aug_size_ratio = 1.2
    aug_hsize = hsize * aug_size_ratio
    aug_center_offset = [0, 0]
    aug_center = [center[0] + aug_center_offset[0], center[1] + aug_center_offset[1]]
    aug_bbox = [int(aug_center[0] - aug_hsize), int(aug_center[1] - aug_hsize), int(aug_center[0] + aug_hsize), int(aug_center[1] + aug_hsize)]

    # Ray grid spanning the crop (see get_crop_area_rays for the same pattern).
    i, j = torch.meshgrid(
        torch.linspace(aug_bbox[0], aug_bbox[2]-1, steps=image_size),
        torch.linspace(aug_bbox[1], aug_bbox[3]-1, steps=image_size)
    )
    rays_o, rays_d = get_rays(i, j, K, c2w)

    # Crop + resize all modalities; NEAREST for masks/depth, LANCZOS for RGB.
    image_resized = image_rgba.crop(aug_bbox).convert("RGB").resize((image_size, image_size), Image.Resampling.LANCZOS)
    alpha_resized = Image.fromarray(alpha, mode="F").crop(aug_bbox).resize((image_size, image_size), Image.Resampling.NEAREST)
    depth_map_resized = Image.fromarray(depth_map.detach().cpu().numpy(), mode="F").crop(aug_bbox).resize((image_size, image_size), Image.Resampling.NEAREST)
    gt_depth_map_resized = Image.fromarray(gt_depth_map.detach().cpu().numpy(), mode="F").crop(aug_bbox).resize((image_size, image_size), Image.Resampling.NEAREST)
    color_mask_resized = Image.fromarray(color_mask.astype(np.float32), mode="F").crop(aug_bbox).resize((image_size, image_size), Image.Resampling.NEAREST)

    img_np = np.asarray(image_resized, dtype=np.uint8)
    img_t = torch.from_numpy(img_np).permute(2, 0, 1).float() / 255.0

    a_np = np.asarray(alpha_resized, dtype=np.float32).astype(dtype=np.uint8)

    # Split the alpha into interior vs edge band; edge pixels get weight 0.5.
    edge_mask = edge_mask_morph_gradient_fn((a_np > 0).astype(np.uint8))
    fg_mask = (a_np > edge_mask).astype(np.uint8)
    rays_color = fg_mask.astype(np.float32) + edge_mask.astype(np.float32) * 0.5

    valid_mask = fg_mask.nonzero()  # reassigned: now indices on the crop grid
    rays_t = torch.from_numpy(np.asarray(depth_map_resized).astype(np.float32))

    a_t = torch.from_numpy(a_np).unsqueeze(0).float()  # / 255.0
    return img_t, a_t, fg_mask, rays_o, rays_d, rays_color, rays_t, valid_mask, depth_map_resized, gt_depth_map_resized, color_mask_resized
433
+
434
def process_instance_image_only(image: Image.Image, instance_mask: np.ndarray, image_size: int):
    """Crop an instance's RGB + alpha only (no rays/depth sampling).

    The instance alpha defines a square crop (padded by 1.2x) which is
    resized to image_size x image_size.

    Returns:
        (img_t, a_t): (3,S,S) RGB in [0,1] and (1,S,S) alpha in [0,1].
    """
    image_rgba = image
    alpha = np.asarray(image_rgba.getchannel("A")) > 0
    alpha = np.logical_and(alpha, instance_mask).astype(np.uint8) * 255
    valid_mask = np.array(alpha).nonzero()

    # Square crop box around the foreground, padded by aug_size_ratio.
    bbox = [valid_mask[1].min(), valid_mask[0].min(), valid_mask[1].max(), valid_mask[0].max()]
    center = [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]
    hsize = max(bbox[2] - bbox[0], bbox[3] - bbox[1]) / 2
    aug_size_ratio = 1.2
    aug_hsize = hsize * aug_size_ratio
    aug_center_offset = [0, 0]
    aug_center = [center[0] + aug_center_offset[0], center[1] + aug_center_offset[1]]
    aug_bbox = [int(aug_center[0] - aug_hsize), int(aug_center[1] - aug_hsize), int(aug_center[0] + aug_hsize), int(aug_center[1] + aug_hsize)]

    image_resized = image_rgba.crop(aug_bbox).convert("RGB").resize((image_size, image_size), Image.Resampling.LANCZOS)
    alpha_resized = Image.fromarray(alpha, mode="L").crop(aug_bbox).resize((image_size, image_size), Image.Resampling.NEAREST)

    img_np = np.asarray(image_resized, dtype=np.uint8)
    img_t = torch.from_numpy(img_np).permute(2, 0, 1).float() / 255.0

    a_np = np.asarray(alpha_resized, dtype=np.uint8)
    a_t = torch.from_numpy(a_np).unsqueeze(0).float() / 255.0
    return img_t, a_t
458
+
459
def crop_depth_image(depth_image, aug_bbox, image_size):
    """Crop a depth tensor to aug_bbox and NEAREST-resize it to a square.

    Args:
        depth_image: 2D torch depth tensor (any device).
        aug_bbox: (left, upper, right, lower) PIL crop box.
        image_size: output side length.

    Returns:
        float32 CPU tensor of shape (image_size, image_size).
    """
    depth_np = depth_image.cpu().numpy().astype(np.float32)
    pil_depth = Image.fromarray(depth_np, mode="F").crop(aug_bbox)
    pil_depth = pil_depth.resize((image_size, image_size), Image.Resampling.NEAREST)
    return torch.from_numpy(np.asarray(pil_depth, dtype=np.float32))
467
+
468
def proj_depth2pcd(mask, depth, image, rays_o, rays_d):
    """Unproject masked pixels into a colored point cloud.

    Args:
        mask: HxW tensor; nonzero entries select the pixels to unproject.
        depth: HxW depth values (ray parameter t per pixel).
        image: (C,H,W) color tensor.
        rays_o, rays_d: (H,W,3) per-pixel ray origins and directions.

    Returns:
        (points, colors): numpy arrays of shape (N,3) and (N,C).
    """
    coords = torch.nonzero(mask)
    rows = coords[:, 0].detach().cpu().numpy()
    cols = coords[:, 1].detach().cpu().numpy()

    pixel_depth = depth[rows, cols]
    pixel_color = image.detach().permute(1, 2, 0)[rows, cols]

    # point = origin + direction * t, per selected pixel.
    pixel_points = rays_o[rows, cols] + rays_d[rows, cols] * pixel_depth[:, None]
    return pixel_points.detach().cpu().numpy(), pixel_color.detach().cpu().numpy()
478
+
479
def vox2pts(ss, resolution=64):
    """Convert an occupancy grid (1,R,R,R) into voxel-center points.

    Returns an (N,3) numpy array of centers in [-0.5, 0.5)^3.
    """
    occupied = torch.nonzero(ss[0] > 0, as_tuple=False)
    centers = (occupied.float() + 0.5) / resolution - 0.5
    return centers.detach().cpu().numpy()
484
+
485
def voxelize_pcd(points, points_color=None, clip_range_first=False, return_mask=True, resolution=64):
    """Voxelize a point cloud into an occupancy grid over [-0.5, 0.5]^3.

    Args:
        points: (N,3) numpy positions inside the unit cube.
        points_color: optional per-point color values; when given, a
            per-voxel mean-color grid replaces occupancy in the returned
            mask. NOTE(review): index_add_ into a flat (R^3,) buffer
            implies a SCALAR color per point — confirm against callers.
        clip_range_first: clamp points into the unit cube first.
        return_mask: also return the grid regrouped into 4x4x4 sub-blocks
            (requires resolution divisible by 4).
        resolution: grid side length.

    Returns:
        ss (1,R,R,R) long occupancy grid, or (ss, ss_mask).
    """
    if clip_range_first:
        points = np.clip(points, -0.5 + 1e-6, 0.5 - 1e-6)
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(points)
    voxel_grid = o3d.geometry.VoxelGrid.create_from_point_cloud_within_bounds(pcd, voxel_size=1/resolution, min_bound=(-0.5, -0.5, -0.5), max_bound=(0.5, 0.5, 0.5))
    vertices = np.array([voxel.grid_index for voxel in voxel_grid.get_voxels()])
    assert np.all(vertices >= 0) and np.all(vertices < resolution), "Some vertices are out of bounds"
    # Voxel centers back to integer grid coordinates, then mark occupancy.
    vertices = (vertices + 0.5) / resolution - 0.5
    coords = ((torch.tensor(vertices) + 0.5) * resolution).int().contiguous()
    ss = torch.zeros(1, resolution, resolution, resolution, dtype=torch.long)
    ss[:, coords[:, 0], coords[:, 1], coords[:, 2]] = 1

    if points_color is not None:
        # Scatter-mean the colors into the grid: accumulate sums and counts
        # per linearized voxel index, then divide.
        points_t = torch.from_numpy(points).to(torch.float32)
        colors_t = torch.from_numpy(points_color).to(torch.float32)

        coords = torch.floor((points_t + 0.5) * resolution).to(torch.long)
        coords = torch.clamp(coords, 0, resolution - 1)
        ix, iy, iz = coords[:, 0], coords[:, 1], coords[:, 2]
        lin = ix * (resolution * resolution) + iy * resolution + iz  # linear index in [0, R^3)

        sum_color = torch.zeros((resolution * resolution * resolution), dtype=torch.float32)
        sum_color.index_add_(0, lin, colors_t)
        count = torch.zeros((resolution * resolution * resolution,), dtype=torch.long)
        ones = torch.ones_like(lin, dtype=torch.long)
        count.index_add_(0, lin, ones)

        count_f = count.to(torch.float32)
        mean_color = sum_color / torch.clamp(count_f, min=1.0)  # empty -> divide by 1 (still 0)
        color_mean = mean_color.view(resolution, resolution, resolution, 1).permute(3, 0, 1, 2).contiguous()
    if return_mask:
        # The mask packs 4x4x4 interleaved sub-grids; colors replace
        # occupancy when points_color was provided.
        ss_mask = rearrange(ss if points_color is None else color_mean, 'c (x n1) (y n2) (z n3) -> (n1 n2 n3 c) x y z', n1=4, n2=4, n3=4).float()
        return ss , ss_mask
    else:
        return ss
521
+
522
def voxelize_pcd_pt(points, points_color=None, clip_range_first=False, return_mask=True, resolution=64):
    """Torch-tensor variant of `voxelize_pcd` (keeps outputs on points.device).

    NaNs in points/colors are zeroed first. Occupancy is still computed by
    round-tripping through Open3D on CPU; only the scatter-mean color step
    stays on the input device. See `voxelize_pcd` for the shared semantics.
    NOTE(review): as in voxelize_pcd, index_add_ into a flat (R^3,) buffer
    implies a scalar color per point — confirm against callers.
    """
    points = torch.nan_to_num(points)
    points_color = torch.nan_to_num(points_color) if isinstance(points_color, torch.Tensor) else points_color
    device = points.device
    if clip_range_first:
        points = torch.clip(points, -0.5 + 1e-6, 0.5 - 1e-6)
    # Open3D requires CPU numpy input for the voxelization itself.
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(points.detach().cpu().numpy())
    voxel_grid = o3d.geometry.VoxelGrid.create_from_point_cloud_within_bounds(pcd, voxel_size=1/resolution, min_bound=(-0.5, -0.5, -0.5), max_bound=(0.5, 0.5, 0.5))
    vertices = np.array([voxel.grid_index for voxel in voxel_grid.get_voxels()])
    assert np.all(vertices >= 0) and np.all(vertices < resolution), "Some vertices are out of bounds"
    vertices = (vertices + 0.5) / resolution - 0.5
    coords = ((torch.tensor(vertices, device=device) + 0.5) * resolution).int().contiguous()
    ss = torch.zeros(1, resolution, resolution, resolution, dtype=torch.long, device=device)
    ss[:, coords[:, 0], coords[:, 1], coords[:, 2]] = 1

    if points_color is not None:
        # Scatter-mean colors per voxel on the original device.
        points_t = points.to(torch.float32)
        colors_t = points_color.to(torch.float32)

        coords = torch.floor((points_t + 0.5) * resolution).to(torch.long)
        coords = torch.clamp(coords, 0, resolution - 1)
        ix, iy, iz = coords[:, 0], coords[:, 1], coords[:, 2]
        lin = ix * (resolution * resolution) + iy * resolution + iz  # linear index in [0, R^3)

        sum_color = torch.zeros((resolution * resolution * resolution), dtype=torch.float32, device=device)
        sum_color.index_add_(0, lin, colors_t)
        count = torch.zeros((resolution * resolution * resolution,), dtype=torch.long, device=device)
        ones = torch.ones_like(lin, dtype=torch.long)
        count.index_add_(0, lin, ones)

        count_f = count.to(torch.float32)
        mean_color = sum_color / torch.clamp(count_f, min=1.0)  # empty -> divide by 1 (still 0)
        color_mean = mean_color.view(resolution, resolution, resolution, 1).permute(3, 0, 1, 2).contiguous()
    if return_mask:
        # 4x4x4 interleaved regrouping; colors replace occupancy when given.
        ss_mask = rearrange(ss if points_color is None else color_mean, 'c (x n1) (y n2) (z n3) -> (n1 n2 n3 c) x y z', n1=4, n2=4, n3=4).float()
        return ss , ss_mask
    else:
        return ss
561
+
562
def get_std_cond(root, instance, crop_size, return_mask=False):
    """Load a random conditioning view of an instance, cropped and masked.

    Looks under `<root>/renders_cond/<instance>` first, falling back to
    `<root>/renders/<instance>`. A random frame is picked, cropped to its
    alpha bounding box (padded by 1.2x), resized, and alpha-premultiplied.

    Returns:
        (3,S,S) RGB tensor in [0,1] with background zeroed, plus the
        (1,S,S) alpha tensor when return_mask=True.
    """
    image_root = os.path.join(root, 'renders_cond', instance)
    if os.path.exists(os.path.join(image_root, 'transforms.json')):
        with open(os.path.join(image_root, 'transforms.json')) as f:
            metadata = json.load(f)
    else:
        # Fallback location used by older renders.
        image_root = os.path.join(root, 'renders', instance)
        with open(os.path.join(image_root, 'transforms.json')) as f:
            metadata = json.load(f)
    n_views = len(metadata['frames'])
    view = np.random.randint(n_views)  # random conditioning view
    metadata = metadata['frames'][view]

    image_path = os.path.join(image_root, metadata['file_path'])
    image = Image.open(image_path)

    # Square crop box around the alpha foreground, padded by aug_size_ratio.
    alpha = np.array(image.getchannel(3))
    bbox = np.array(alpha).nonzero()
    bbox = [bbox[1].min(), bbox[0].min(), bbox[1].max(), bbox[0].max()]
    center = [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]
    hsize = max(bbox[2] - bbox[0], bbox[3] - bbox[1]) / 2
    aug_size_ratio = 1.2
    aug_hsize = hsize * aug_size_ratio
    aug_center_offset = [0, 0]
    aug_center = [center[0] + aug_center_offset[0], center[1] + aug_center_offset[1]]
    aug_bbox = [int(aug_center[0] - aug_hsize), int(aug_center[1] - aug_hsize), int(aug_center[0] + aug_hsize), int(aug_center[1] + aug_hsize)]
    image = image.crop(aug_bbox)

    image = image.resize((crop_size, crop_size), Image.Resampling.LANCZOS)
    alpha = image.getchannel(3)
    image = image.convert('RGB')
    image = torch.tensor(np.array(image)).permute(2, 0, 1).float() / 255.0
    alpha = torch.tensor(np.array(alpha)).float() / 255.0
    # Premultiply: zero out the background.
    image = image * alpha.unsqueeze(0)
    if return_mask:
        return image, alpha.unsqueeze(0)
    else:
        return image
600
+
601
def map_rotated_slat2canonical_pose(vertices, rot_slat_info):
    """Undo a random-rotation augmentation, mapping vertices back to canonical pose.

    `rot_slat_info` records the augmentation: 'scale', 'translation' and the
    Euler angles 'rotate' applied (in x,y,z order) at augmentation time.
    Scale and translation are re-applied, then the INVERSE rotations are
    applied in reverse order (z, then y, then x) with negated angles.

    Returns:
        (N,3) numpy array of canonical-pose vertices.
    """
    vertices_scale = rot_slat_info['scale']
    vertices_trans = np.array(rot_slat_info['translation'])
    rand_rot = rot_slat_info['rotate']
    pcd = o3d.geometry.PointCloud()
    vertices = vertices * vertices_scale
    vertices = vertices + vertices_trans
    pcd.points = o3d.utility.Vector3dVector(vertices)
    # Inverse single-axis rotations (negated angles).
    R1 = pcd.get_rotation_matrix_from_xyz((-rand_rot[0], 0, 0))
    R2 = pcd.get_rotation_matrix_from_xyz((0, -rand_rot[1], 0))
    R3 = pcd.get_rotation_matrix_from_xyz((0, 0, -rand_rot[2]))
    # Applied z -> y -> x, the reverse of the forward x -> y -> z order.
    pcd.rotate(R3, center=(0., 0., 0.))
    pcd.rotate(R2, center=(0., 0., 0.))
    pcd.rotate(R1, center=(0., 0., 0.))
    vertices = np.asarray(pcd.points)

    return vertices
618
+
619
def project2ply(mask, depth, image, K, c2w):
    """Unproject masked pixels into a colored world-space point cloud.

    Rays are generated from intrinsics/pose via get_rays (which expects
    column, row ordering).

    Returns:
        (points, colors): numpy arrays of shape (N,3) and (N,C).
    """
    coords = torch.nonzero(mask)

    # get_rays takes (i=column, j=row).
    rays_o, rays_d = get_rays(coords[:, 1], coords[:, 0], K, c2w)

    rows = coords[:, 0].detach().cpu().numpy()
    cols = coords[:, 1].detach().cpu().numpy()
    pixel_depth = depth[rows, cols]
    pixel_color = image.detach().permute(1, 2, 0).cpu().numpy()[rows, cols]

    # point = origin + direction * t, per selected pixel.
    pixel_points = rays_o + rays_d * pixel_depth[:, None]
    return pixel_points.detach().cpu().numpy(), pixel_color
threeDFixer/models/__init__.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is modified from TRELLIS:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+ # Modifications Copyright (c) 2026 Ze-Xin Yin, Robot labs of Horizon Robotics, and D-Robotics.
6
+
7
+ import importlib
8
+
9
# Lazy-import table: maps each public class name to the submodule (relative to
# this package) that defines it. Entries are resolved on first attribute
# access by __getattr__ below, keeping package import cheap.
__attributes = {
    'SparseStructureEncoder': 'sparse_structure_vae',
    'SparseStructureDecoder': 'sparse_structure_vae',

    'SparseStructureFlowModel': 'sparse_structure_flow',

    'SLatEncoder': 'structured_latent_vae',
    'SLatGaussianDecoder': 'structured_latent_vae',
    'SLatRadianceFieldDecoder': 'structured_latent_vae',
    'SLatMeshDecoder': 'structured_latent_vae',
    'ElasticSLatEncoder': 'structured_latent_vae',
    'ElasticSLatGaussianDecoder': 'structured_latent_vae',
    'ElasticSLatRadianceFieldDecoder': 'structured_latent_vae',
    'ElasticSLatMeshDecoder': 'structured_latent_vae',

    'SLatFlowModel': 'structured_latent_flow',
    'ElasticSLatFlowModel': 'structured_latent_flow',

    'SceneSLatFlowModel': 'scene_structured_latent_flow',
    'ElasticSceneSLatFlowModel': 'scene_structured_latent_flow',
    'SceneSparseStructureFlowModule': 'scene_sparse_structure_flow',
}

# Submodules re-exported as attributes of this package (currently none).
__submodules = []

__all__ = list(__attributes.keys()) + __submodules
35
+
36
def __getattr__(name):
    """Lazily resolve *name* on first access and cache it in this module's globals."""
    cache = globals()
    if name in cache:
        return cache[name]
    if name in __attributes:
        module = importlib.import_module(f".{__attributes[name]}", __name__)
        cache[name] = getattr(module, name)
    elif name in __submodules:
        cache[name] = importlib.import_module(f".{name}", __name__)
    else:
        raise AttributeError(f"module {__name__} has no attribute {name}")
    return cache[name]
48
+
49
+
50
def from_pretrained(path: str, **kwargs):
    """
    Load a model from a pretrained checkpoint.

    Args:
        path: The path to the checkpoint. Can be either local path or a Hugging Face model name.
            NOTE: config file and model file should take the name f'{path}.json' and f'{path}.safetensors' respectively.
        **kwargs: Additional arguments for the model constructor.

    Returns:
        The instantiated model with weights loaded; converted to fp16 when the
        model's declared dtype is float16.
    """
    # Imported lazily so that merely importing this package stays lightweight.
    import os
    import json
    import torch
    from safetensors.torch import load_file
    from ..utils.dist_utils import read_file_dist
    # A local checkpoint needs the .json config plus either a .safetensors or a raw .pt file.
    is_local = os.path.exists(f"{path}.json") and (os.path.exists(f"{path}.safetensors") or os.path.exists(f"{path}.pt"))

    if is_local:
        config_file = f"{path}.json"
        # Prefer safetensors when both formats are present.
        model_file = f"{path}.safetensors" if os.path.exists(f"{path}.safetensors") else f"{path}.pt"
    else:
        # Interpret path as '<org>/<repo>/<model_name...>' on the Hugging Face Hub.
        # NOTE(review): only the .safetensors format is fetched for remote checkpoints.
        from huggingface_hub import hf_hub_download
        path_parts = path.split('/')
        repo_id = f'{path_parts[0]}/{path_parts[1]}'
        model_name = '/'.join(path_parts[2:])
        config_file = hf_hub_download(repo_id, f"{model_name}.json")
        model_file = hf_hub_download(repo_id, f"{model_name}.safetensors")

    with open(config_file, 'r') as f:
        config = json.load(f)
    # config['name'] names one of the classes in __attributes; __getattr__ lazily imports it.
    model = __getattr__(config['name'])(**config['args'], **kwargs)
    if model_file.endswith(".safetensors"):
        model.load_state_dict(load_file(model_file))
    else:
        # weights_only=True restricts unpickling to tensors/containers for safety.
        model_ckpt = torch.load(read_file_dist(model_file), map_location='cpu', weights_only=True)
        model.load_state_dict(model_ckpt)
    if model.dtype == torch.float16:
        model.convert_to_fp16()

    return model
89
+
90
+
91
# For Pylance
# These imports never execute at runtime (the package is imported, not run as a
# script); they exist solely so static analyzers / IDEs can see the names that
# __getattr__ otherwise exports lazily.
if __name__ == '__main__':
    from .sparse_structure_vae import (
        SparseStructureEncoder,
        SparseStructureDecoder,
    )

    from .sparse_structure_flow import SparseStructureFlowModel

    from .structured_latent_vae import (
        SLatEncoder,
        SLatGaussianDecoder,
        SLatRadianceFieldDecoder,
        SLatMeshDecoder,
        ElasticSLatEncoder,
        ElasticSLatGaussianDecoder,
        ElasticSLatRadianceFieldDecoder,
        ElasticSLatMeshDecoder,
    )

    from .structured_latent_flow import (
        SLatFlowModel,
        ElasticSLatFlowModel,
    )

    from .scene_sparse_structure_flow import (
        SceneSparseStructureFlowModule
    )

    from .scene_structured_latent_flow import (
        SceneSLatFlowModel,
        ElasticSceneSLatFlowModel
    )
threeDFixer/models/scene_sparse_structure_flow.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is modified from TRELLIS:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+ # Modifications Copyright (c) 2026 Ze-Xin Yin, Robot labs of Horizon Robotics, and D-Robotics.
6
+
7
+ from typing import *
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ import numpy as np
12
+ from . import from_pretrained
13
+ from ..modules.utils import convert_module_to_f16, convert_module_to_f32
14
+ from ..modules.transformer import SceneModulatedTransformerCrossBlock
15
+ from ..modules.spatial import patchify, unpatchify
16
+ from .sparse_structure_flow import (
17
+ SparseStructureFlowModel,
18
+ TimestepEmbedder
19
+ )
20
+
21
def mean_flat(x):
    """
    Take the mean over all non-batch dimensions.
    """
    non_batch_dims = tuple(range(1, x.dim()))
    return x.mean(dim=non_batch_dims)
26
+
27
+ class SceneSparseStructureFlowModule(nn.Module):
28
+ def __init__(
29
+ self,
30
+ resolution: int,
31
+ in_channels: int,
32
+ model_channels: int,
33
+ cond_channels: int,
34
+ out_channels: int,
35
+ num_blocks: int,
36
+ num_heads: Optional[int] = None,
37
+ num_head_channels: Optional[int] = 64,
38
+ mlp_ratio: float = 4,
39
+ patch_size: int = 2,
40
+ pe_mode: Literal["ape", "rope"] = "ape",
41
+ use_fp16: bool = False,
42
+ use_checkpoint: bool = False,
43
+ share_mod: bool = False,
44
+ qk_rms_norm: bool = False,
45
+ qk_rms_norm_cross: bool = False,
46
+ pretrained_ss_flow_dit: str = None,
47
+ resume_ckpts: str = None,
48
+ ):
49
+ super().__init__()
50
+ self.resolution = resolution
51
+ self.in_channels = in_channels
52
+ self.model_channels = model_channels
53
+ self.cond_channels = cond_channels
54
+ self.out_channels = out_channels
55
+ self.num_blocks = num_blocks
56
+ self.num_heads = num_heads or model_channels // num_head_channels
57
+ self.mlp_ratio = mlp_ratio
58
+ self.patch_size = patch_size
59
+ self.pe_mode = pe_mode
60
+ self.use_fp16 = use_fp16
61
+ self.use_checkpoint = use_checkpoint
62
+ self.share_mod = share_mod
63
+ self.qk_rms_norm = qk_rms_norm
64
+ self.qk_rms_norm_cross = qk_rms_norm_cross
65
+ self.dtype = torch.float16 if use_fp16 else torch.float32
66
+
67
+ self.input_layer_vox_partial = nn.Linear(in_channels * patch_size**3, model_channels)
68
+ self.input_layer_mask_partial = nn.Linear(64, model_channels)
69
+
70
+ self.dpt_ratio_embedder = TimestepEmbedder(model_channels)
71
+
72
+ self.blocks = nn.ModuleList([
73
+ SceneModulatedTransformerCrossBlock(
74
+ model_channels,
75
+ cond_channels,
76
+ num_heads=self.num_heads,
77
+ mlp_ratio=self.mlp_ratio,
78
+ attn_mode='full',
79
+ use_checkpoint=self.use_checkpoint,
80
+ use_rope=(pe_mode == "rope"),
81
+ share_mod=share_mod,
82
+ qk_rms_norm=self.qk_rms_norm,
83
+ qk_rms_norm_cross=self.qk_rms_norm_cross,
84
+ )
85
+ for _ in range(num_blocks)
86
+ ])
87
+ self.control_path = nn.Sequential(*[
88
+ nn.Linear(model_channels, model_channels) for _ in range(num_blocks)
89
+ ])
90
+
91
+ self.neg_cache = {}
92
+ self.cond_vox_cache = None
93
+
94
+ self.initialize_weights()
95
+ if pretrained_ss_flow_dit is not None:
96
+ if pretrained_ss_flow_dit.endswith('.pt'):
97
+ print (f'loading pretrained weight: {pretrained_ss_flow_dit}')
98
+ model_ckpt = torch.load(pretrained_ss_flow_dit, map_location='cpu', weights_only=True)
99
+ self.input_layer_vox_partial.load_state_dict(
100
+ {k.replace('input_layer.', ''): model_ckpt[k] for k in filter(lambda x: 'input_layer' in x, model_ckpt.keys())}
101
+ )
102
+ self.dpt_ratio_embedder.load_state_dict(
103
+ {k.replace('t_embedder.', ''): model_ckpt[k] for k in filter(lambda x: 't_embedder' in x, model_ckpt.keys())}
104
+ )
105
+
106
+ for block_index, module in enumerate(self.blocks):
107
+ module: SceneModulatedTransformerCrossBlock
108
+ module.load_state_dict(
109
+ {k.replace(f'blocks.{block_index}', ''): model_ckpt[k] for k in filter(lambda x: f'blocks.{block_index}' in x, model_ckpt.keys())}, strict=False
110
+ )
111
+ module.norm4.load_state_dict(module.norm1.state_dict())
112
+ module.norm5.load_state_dict(module.norm2.state_dict())
113
+ module.self_attn_dpt_ratio.load_state_dict(module.self_attn.state_dict())
114
+ module.cross_attn_extra.load_state_dict(module.cross_attn.state_dict())
115
+ nn.init.constant_(module.self_attn_dpt_ratio.to_out.weight, 0)
116
+ if module.self_attn_dpt_ratio.to_out.bias is not None:
117
+ nn.init.constant_(module.self_attn_dpt_ratio.to_out.bias, 0)
118
+ nn.init.constant_(module.cross_attn_extra.to_out.weight, 0)
119
+ if module.cross_attn_extra.to_out.bias is not None:
120
+ nn.init.constant_(module.cross_attn_extra.to_out.bias, 0)
121
+ del model_ckpt
122
+ else:
123
+ print (f'loading pretrained weight: {pretrained_ss_flow_dit}')
124
+ pre_trained_models = from_pretrained(pretrained_ss_flow_dit)
125
+ pre_trained_models: SparseStructureFlowModel
126
+
127
+ self.input_layer_vox_partial.load_state_dict(pre_trained_models.input_layer.state_dict())
128
+ self.dpt_ratio_embedder.load_state_dict(pre_trained_models.t_embedder.state_dict())
129
+
130
+ for block_index, module in enumerate(self.blocks):
131
+ module: SceneModulatedTransformerCrossBlock
132
+ module.load_state_dict(pre_trained_models.blocks[block_index].state_dict(), strict=False)
133
+ module.norm4.load_state_dict(module.norm1.state_dict())
134
+ module.norm5.load_state_dict(module.norm2.state_dict())
135
+ module.self_attn_dpt_ratio.load_state_dict(module.self_attn.state_dict())
136
+ module.cross_attn_extra.load_state_dict(module.cross_attn.state_dict())
137
+ nn.init.constant_(module.self_attn_dpt_ratio.to_out.weight, 0)
138
+ if module.self_attn_dpt_ratio.to_out.bias is not None:
139
+ nn.init.constant_(module.self_attn_dpt_ratio.to_out.bias, 0)
140
+ nn.init.constant_(module.cross_attn_extra.to_out.weight, 0)
141
+ if module.cross_attn_extra.to_out.bias is not None:
142
+ nn.init.constant_(module.cross_attn_extra.to_out.bias, 0)
143
+ del pre_trained_models
144
+ if resume_ckpts is not None:
145
+ print (f'loading pretrained weight: {resume_ckpts}')
146
+ model_ckpt = torch.load(resume_ckpts, map_location='cpu', weights_only=True)
147
+ self.load_state_dict(model_ckpt, strict=False)
148
+ del model_ckpt
149
+ if use_fp16:
150
+ self.convert_to_fp16()
151
+
152
    def clear_neg_cache(self):
        """Reset ``neg_cache`` to an empty dict (its entries are populated elsewhere)."""
        self.neg_cache = {}
154
+
155
    def clear_cond_vox_cache(self):
        """Drop the cached condition-voxel tokens (populated elsewhere during inference)."""
        self.cond_vox_cache = None
157
+
158
    @property
    def device(self) -> torch.device:
        """
        Return the device of the model.
        """
        # Uses the first parameter as representative; assumes the module is not
        # sharded across devices.
        return next(self.parameters()).device
164
+
165
    def convert_to_fp16(self) -> None:
        """
        Convert the torso of the model to float16.
        """
        # Only the transformer blocks and the control projections are converted;
        # the input linears and the depth-ratio embedder are left untouched here.
        self.blocks.apply(convert_module_to_f16)
        self.control_path.apply(convert_module_to_f16)
171
+
172
    def convert_to_fp32(self) -> None:
        """
        Convert the torso of the model to float32.
        """
        # Symmetric counterpart of convert_to_fp16: restores the same two
        # module groups to full precision.
        self.blocks.apply(convert_module_to_f32)
        self.control_path.apply(convert_module_to_f32)
178
+
179
    def initialize_weights(self) -> None:
        """Initialize all weights: Xavier for linears, zeros for control/adaLN/mask layers."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

        # Zero the control projections so the control branch initially adds
        # nothing to the frozen denoiser's residual stream.
        for block in self.control_path:
            nn.init.constant_(block.weight, 0)
            nn.init.constant_(block.bias, 0)

        # Zero-out adaLN modulation layers in DiT blocks:
        if self.share_mod:
            # NOTE(review): self.adaLN_modulation is not defined in __init__ of this
            # class — confirm that share_mod=True is actually supported here.
            nn.init.constant_(self.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(self.adaLN_modulation[-1].bias, 0)
        else:
            for block in self.blocks:
                nn.init.constant_(block.adaLN_modulation_dpt[-1].weight, 0)
                nn.init.constant_(block.adaLN_modulation_dpt[-1].bias, 0)

        # Zero-out input layers:
        nn.init.constant_(self.input_layer_mask_partial.weight, 0)
        nn.init.constant_(self.input_layer_mask_partial.bias, 0)
204
+
205
+ def input_voxel(self, x, input_layer, pos_emb):
206
+ ########## voxel tokens
207
+ h = patchify(x, self.patch_size)
208
+ h = h.view(*h.shape[:2], -1).permute(0, 2, 1).contiguous()
209
+
210
+ h = input_layer(h)
211
+ h = h + pos_emb
212
+ ########## voxel tokens
213
+ return h
214
+
215
+ def input_mask(self, x, input_layer):
216
+ h = patchify(x, self.patch_size)
217
+ h = h.view(*h.shape[:2], -1).permute(0, 2, 1).contiguous()
218
+ h = input_layer(h)
219
+ return h
220
+
221
+ def forward(self, *args, **kwargs):
222
+ if kwargs.pop("w_align_loss", False):
223
+ return self._train_forward(*args, **kwargs, w_align_loss=True)
224
+ else:
225
+ return self._infer_forward(*args, **kwargs)
226
+
227
    def _train_forward(self, x: torch.Tensor, t: torch.Tensor, cond: Dict[str,torch.Tensor],
                       forzen_denoiser: SparseStructureFlowModel, est_depth_ratio: torch.Tensor,
                       w_align_loss: bool = False) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Training forward: run the frozen denoiser while injecting control features
        from this module's blocks; optionally also compute an alignment loss
        against an uncontrolled pass driven by cond['std_cond_instance'].

        Args:
            x: Noisy voxel grid of shape (B, in_channels, R, R, R).
            t: Diffusion timesteps, embedded by the frozen denoiser.
            cond: Uses 'cond_partial_vox', 'cond_partial_vox_mask', 'cond_scene',
                'cond_instance', 'cond_instance_masked' and, when w_align_loss is
                True, 'std_cond_instance'.
            forzen_denoiser: The frozen SparseStructureFlowModel being controlled.
            est_depth_ratio: Scalar condition embedded via dpt_ratio_embedder.
            w_align_loss: When True, also return the per-sample alignment loss.

        Returns:
            The denoised grid, or (grid, align_loss) when w_align_loss is True.
        """
        assert [*x.shape] == [x.shape[0], self.in_channels, *[self.resolution] * 3], \
            f"Input shape mismatch, got {x.shape}, expected {[x.shape[0], self.in_channels, *[self.resolution] * 3]}"

        # Tokenize the noisy grid with the frozen input layer + positional embedding.
        h = self.input_voxel(x, forzen_denoiser.input_layer, forzen_denoiser.pos_emb[None])

        # Control tokens: partial voxel observation plus its mask embedding.
        cond_vox = self.input_voxel(cond['cond_partial_vox'], self.input_layer_vox_partial, forzen_denoiser.pos_emb[None]) + \
            self.input_mask(cond['cond_partial_vox_mask'], self.input_layer_mask_partial)

        cond_moge = cond['cond_scene']
        cond_dino = cond['cond_instance']
        cond_dino_masked = cond['cond_instance_masked']
        if w_align_loss:
            # Reference (uncontrolled) branch state, driven by 'std_cond_instance'.
            std_cond_dino = cond['std_cond_instance']
            std_cond_dino = std_cond_dino.type(self.dtype)
            std_h = h
            std_h = std_h.type(self.dtype)

        t_emb = forzen_denoiser.t_embedder(t)
        if self.share_mod:
            t_emb = forzen_denoiser.adaLN_modulation(t_emb)
        t_emb = t_emb.type(self.dtype)
        est_depth_ratio_emb = self.dpt_ratio_embedder(est_depth_ratio)
        est_depth_ratio_emb = est_depth_ratio_emb.type(self.dtype)
        # Cast everything to the torso dtype (fp16 when use_fp16).
        h = h.type(self.dtype)
        cond_control = cond_moge
        cond_control = cond_control.type(self.dtype)
        cond_vox = cond_vox.type(self.dtype)
        cond_dino = cond_dino.type(self.dtype)
        cond_dino_masked = cond_dino_masked.type(self.dtype)

        align_loss = 0.0
        acount = 0
        for block_index, frozen_block in enumerate(forzen_denoiser.blocks):
            h = frozen_block(h, t_emb, cond_dino_masked)
            if block_index < len(self.blocks):
                # Control branch: evolve cond_vox, then add its (zero-initialized)
                # projection into the frozen residual stream.
                cond_vox = self.blocks[block_index](cond_vox, t_emb, est_depth_ratio_emb, cond_dino, cond_control)
                ctrl_feats = self.control_path[block_index](cond_vox)
                h = h + ctrl_feats

            if w_align_loss:
                with torch.no_grad():
                    std_h = frozen_block(std_h, t_emb, std_cond_dino)
                acount += 1
                reference = std_h
                source = h

                # Negative cosine similarity between controlled and reference features
                # (per sample, averaged over tokens).
                z_tilde_j = torch.nn.functional.normalize(source, dim=-1, eps=1e-6)
                z_j = torch.nn.functional.normalize(reference, dim=-1, eps=1e-6)
                align_loss += mean_flat(-(z_j * z_tilde_j).sum(dim=-1))

        h = h.type(x.dtype)

        h = F.layer_norm(h, h.shape[-1:])
        h = forzen_denoiser.out_layer(h)

        # Tokens back to a dense voxel grid.
        h = h.permute(0, 2, 1).view(h.shape[0], h.shape[2], *[self.resolution // self.patch_size] * 3)
        h = unpatchify(h, self.patch_size).contiguous()

        if w_align_loss:
            return h, align_loss / acount
        else:
            return h
292
+
293
    def _infer_forward(self, x: torch.Tensor, t: torch.Tensor, cond: Dict[str,torch.Tensor],
                       forzen_denoiser: SparseStructureFlowModel, est_depth_ratio: torch.Tensor) -> torch.Tensor:
        """
        Inference forward: identical to the training path but without the
        reference branch / alignment loss. See _train_forward for argument details.
        """
        assert [*x.shape] == [x.shape[0], self.in_channels, *[self.resolution] * 3], \
            f"Input shape mismatch, got {x.shape}, expected {[x.shape[0], self.in_channels, *[self.resolution] * 3]}"

        # Tokenize the noisy grid and the partial-voxel control condition.
        h = self.input_voxel(x, forzen_denoiser.input_layer, forzen_denoiser.pos_emb[None])
        cond_vox = self.input_voxel(cond['cond_partial_vox'], self.input_layer_vox_partial, forzen_denoiser.pos_emb[None]) + \
            self.input_mask(cond['cond_partial_vox_mask'], self.input_layer_mask_partial)

        cond_moge = cond['cond_scene']
        cond_dino = cond['cond_instance']
        cond_dino_masked = cond['cond_instance_masked']

        t_emb = forzen_denoiser.t_embedder(t)
        if self.share_mod:
            t_emb = forzen_denoiser.adaLN_modulation(t_emb)
        t_emb = t_emb.type(self.dtype)
        est_depth_ratio_emb = self.dpt_ratio_embedder(est_depth_ratio)
        est_depth_ratio_emb = est_depth_ratio_emb.type(self.dtype)
        # Cast everything to the torso dtype (fp16 when use_fp16).
        h = h.type(self.dtype)
        cond_control = cond_moge
        cond_control = cond_control.type(self.dtype)
        cond_vox = cond_vox.type(self.dtype)
        cond_dino = cond_dino.type(self.dtype)
        cond_dino_masked = cond_dino_masked.type(self.dtype)

        for block_index, frozen_block in enumerate(forzen_denoiser.blocks):
            h = frozen_block(h, t_emb, cond_dino_masked)
            if block_index < len(self.blocks):
                # Control branch injection, mirroring _train_forward.
                cond_vox = self.blocks[block_index](cond_vox, t_emb, est_depth_ratio_emb, cond_dino, cond_control)
                ctrl_feats = self.control_path[block_index](cond_vox)
                h = h + ctrl_feats

        h = h.type(x.dtype)

        h = F.layer_norm(h, h.shape[-1:])
        h = forzen_denoiser.out_layer(h)

        # Tokens back to a dense voxel grid.
        h = h.permute(0, 2, 1).view(h.shape[0], h.shape[2], *[self.resolution // self.patch_size] * 3)
        h = unpatchify(h, self.patch_size).contiguous()

        return h
threeDFixer/models/scene_structured_latent_flow.py ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is modified from TRELLIS:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+ # Modifications Copyright (c) 2026 Ze-Xin Yin, Robot labs of Horizon Robotics, and D-Robotics.
6
+
7
+ from typing import *
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ import numpy as np
12
+ from ..modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
13
+ from ..modules.transformer import AbsolutePositionEmbedder
14
+ from ..modules.norm import LayerNorm32
15
+ from ..modules import sparse as sp
16
+ from ..modules.sparse.transformer import ModulatedSparseTransformerCrossBlock, ModulatedSceneSparseTransformerCrossBlock
17
+ from .sparse_structure_flow import TimestepEmbedder
18
+ from .scene_sparse_structure_flow import mean_flat
19
+ from .structured_latent_flow import SparseResBlock3d, SLatFlowModel
20
+ from .sparse_elastic_mixin import SparseTransformerElasticMixin
21
+ from . import from_pretrained
22
+
23
+ class SceneSLatFlowModel(nn.Module):
24
+ def __init__(
25
+ self,
26
+ resolution: int,
27
+ in_channels: int,
28
+ cond_slat_channels: int,
29
+ model_channels: int,
30
+ cond_channels: int,
31
+ out_channels: int,
32
+ num_blocks: int,
33
+ num_heads: Optional[int] = None,
34
+ num_head_channels: Optional[int] = 64,
35
+ mlp_ratio: float = 4,
36
+ patch_size: int = 2,
37
+ num_io_res_blocks: int = 2,
38
+ io_block_channels: List[int] = None,
39
+ pe_mode: Literal["ape", "rope"] = "ape",
40
+ use_fp16: bool = False,
41
+ use_checkpoint: bool = False,
42
+ use_skip_connection: bool = True,
43
+ share_mod: bool = False,
44
+ qk_rms_norm: bool = False,
45
+ qk_rms_norm_cross: bool = False,
46
+ pretrained_flow_dit: str = None,
47
+ ):
48
+ super().__init__()
49
+ self.resolution = resolution
50
+ self.in_channels = in_channels
51
+ self.cond_slat_channels = cond_slat_channels
52
+ self.model_channels = model_channels
53
+ self.cond_channels = cond_channels
54
+ self.out_channels = out_channels
55
+ self.num_blocks = num_blocks
56
+ self.num_heads = num_heads or model_channels // num_head_channels
57
+ self.mlp_ratio = mlp_ratio
58
+ self.patch_size = patch_size
59
+ self.num_io_res_blocks = num_io_res_blocks
60
+ self.io_block_channels = io_block_channels
61
+ self.pe_mode = pe_mode
62
+ self.use_fp16 = use_fp16
63
+ self.use_checkpoint = use_checkpoint
64
+ self.use_skip_connection = use_skip_connection
65
+ self.share_mod = share_mod
66
+ self.qk_rms_norm = qk_rms_norm
67
+ self.qk_rms_norm_cross = qk_rms_norm_cross
68
+ self.dtype = torch.float16 if use_fp16 else torch.float32
69
+
70
+ if self.io_block_channels is not None:
71
+ assert int(np.log2(patch_size)) == np.log2(patch_size), "Patch size must be a power of 2"
72
+ assert np.log2(patch_size) == len(io_block_channels), "Number of IO ResBlocks must match the number of stages"
73
+
74
+ self.vis_ratio_embedder = TimestepEmbedder(model_channels)
75
+
76
+ self.input_layer = sp.SparseLinear(in_channels, model_channels if io_block_channels is None else io_block_channels[0])
77
+ self.input_layer_cond = sp.SparseLinear(cond_slat_channels, model_channels if io_block_channels is None else io_block_channels[0])
78
+
79
+ self.input_blocks = nn.ModuleList([])
80
+ if io_block_channels is not None:
81
+ for chs, next_chs in zip(io_block_channels, io_block_channels[1:] + [model_channels]):
82
+ self.input_blocks.extend([
83
+ SparseResBlock3d(
84
+ chs,
85
+ model_channels,
86
+ out_channels=chs,
87
+ )
88
+ for _ in range(num_io_res_blocks-1)
89
+ ])
90
+ self.input_blocks.append(
91
+ SparseResBlock3d(
92
+ chs,
93
+ model_channels,
94
+ out_channels=next_chs,
95
+ downsample=True,
96
+ )
97
+ )
98
+
99
+ self.blocks = nn.ModuleList([
100
+ ModulatedSceneSparseTransformerCrossBlock(
101
+ model_channels,
102
+ cond_channels,
103
+ num_heads=self.num_heads,
104
+ mlp_ratio=self.mlp_ratio,
105
+ attn_mode='full',
106
+ use_checkpoint=self.use_checkpoint,
107
+ use_rope=(pe_mode == "rope"),
108
+ share_mod=self.share_mod,
109
+ qk_rms_norm=self.qk_rms_norm,
110
+ qk_rms_norm_cross=self.qk_rms_norm_cross,
111
+ )
112
+ for _ in range(num_blocks)
113
+ ])
114
+
115
+ self.control_path = nn.Sequential(*[
116
+ sp.SparseLinear(model_channels, model_channels) for _ in range(num_blocks)
117
+ ])
118
+
119
+ self.initialize_weights()
120
+ if pretrained_flow_dit is not None:
121
+ if pretrained_flow_dit.endswith('.pt'):
122
+ print (f'loading pretrained weight: {pretrained_flow_dit}')
123
+ model_ckpt = torch.load(pretrained_flow_dit, map_location='cpu', weights_only=True)
124
+ self.input_layer.load_state_dict(
125
+ {k.replace('input_layer.', ''): model_ckpt[k] for k in filter(lambda x: 'input_layer' in x, model_ckpt.keys())}
126
+ )
127
+ self.vis_ratio_embedder.load_state_dict(
128
+ {k.replace('t_embedder.', ''): model_ckpt[k] for k in filter(lambda x: 't_embedder' in x, model_ckpt.keys())}
129
+ )
130
+ self.input_blocks.load_state_dict(
131
+ {k.replace('input_blocks.', ''): model_ckpt[k] for k in filter(lambda x: 'input_blocks' in x, model_ckpt.keys())}
132
+ )
133
+
134
+ for block_index, module in enumerate(self.blocks):
135
+ module: ModulatedSceneSparseTransformerCrossBlock
136
+ module.load_state_dict(
137
+ {k.replace(f'blocks.{block_index}', ''): model_ckpt[k] for k in filter(lambda x: f'blocks.{block_index}' in x, model_ckpt.keys())}, strict=False
138
+ )
139
+ module.norm4.load_state_dict(module.norm1.state_dict())
140
+ module.norm5.load_state_dict(module.norm2.state_dict())
141
+ module.self_attn_vis_ratio.load_state_dict(module.self_attn.state_dict())
142
+ module.cross_attn_extra.load_state_dict(module.cross_attn.state_dict())
143
+ nn.init.constant_(module.self_attn_vis_ratio.to_out.weight, 0)
144
+ if module.self_attn_vis_ratio.to_out.bias is not None:
145
+ nn.init.constant_(module.self_attn_vis_ratio.to_out.bias, 0)
146
+ nn.init.constant_(module.cross_attn_extra.to_out.weight, 0)
147
+ if module.cross_attn_extra.to_out.bias is not None:
148
+ nn.init.constant_(module.cross_attn_extra.to_out.bias, 0)
149
+ del model_ckpt
150
+ else:
151
+ print (f'loading pretrained weight: {pretrained_flow_dit}')
152
+ pre_trained_models = from_pretrained(pretrained_flow_dit)
153
+ pre_trained_models: SLatFlowModel
154
+
155
+ self.input_layer.load_state_dict(pre_trained_models.input_layer.state_dict())
156
+ self.vis_ratio_embedder.load_state_dict(pre_trained_models.t_embedder.state_dict())
157
+ self.input_blocks.load_state_dict(pre_trained_models.input_blocks.state_dict())
158
+
159
+ for block_index, module in enumerate(self.blocks):
160
+ module: ModulatedSceneSparseTransformerCrossBlock
161
+ module.load_state_dict(pre_trained_models.blocks[block_index].state_dict(), strict=False)
162
+ module.norm4.load_state_dict(module.norm1.state_dict())
163
+ module.norm5.load_state_dict(module.norm2.state_dict())
164
+ module.self_attn_vis_ratio.load_state_dict(module.self_attn.state_dict())
165
+ module.cross_attn_extra.load_state_dict(module.cross_attn.state_dict())
166
+ nn.init.constant_(module.self_attn_vis_ratio.to_out.weight, 0)
167
+ if module.self_attn_vis_ratio.to_out.bias is not None:
168
+ nn.init.constant_(module.self_attn_vis_ratio.to_out.bias, 0)
169
+ nn.init.constant_(module.cross_attn_extra.to_out.weight, 0)
170
+ if module.cross_attn_extra.to_out.bias is not None:
171
+ nn.init.constant_(module.cross_attn_extra.to_out.bias, 0)
172
+ del pre_trained_models
173
+ if use_fp16:
174
+ self.convert_to_fp16()
175
+
176
    @property
    def device(self) -> torch.device:
        """
        Return the device of the model.
        """
        # Uses the first parameter as representative; assumes the module is not
        # sharded across devices.
        return next(self.parameters()).device
182
+
183
    def convert_to_fp16(self) -> None:
        """
        Convert the torso of the model to float16.
        """
        # The IO ResBlocks, transformer blocks, and control projections are
        # converted; the input linears and vis_ratio_embedder stay untouched here.
        self.input_blocks.apply(convert_module_to_f16)
        self.blocks.apply(convert_module_to_f16)
        self.control_path.apply(convert_module_to_f16)
190
+
191
+ def convert_to_fp32(self) -> None:
192
+ """
193
+ Convert the torso of the model to float32.
194
+ """
195
+ self.input_blocks.apply(convert_module_to_f16)
196
+ self.blocks.apply(convert_module_to_f32)
197
+ self.control_path.apply(convert_module_to_f32)
198
+
199
    def initialize_weights(self) -> None:
        """Initialize all weights: Xavier for linears, zeros for adaLN and control layers."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.vis_ratio_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.vis_ratio_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        if self.share_mod:
            # NOTE(review): self.adaLN_modulation is not defined in __init__ of this
            # class — confirm that share_mod=True is actually supported here.
            nn.init.constant_(self.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(self.adaLN_modulation[-1].bias, 0)
        else:
            for block in self.blocks:
                nn.init.constant_(block.adaLN_modulation_vis[-1].weight, 0)
                nn.init.constant_(block.adaLN_modulation_vis[-1].bias, 0)

        # Zero the control projections so the control branch initially adds
        # nothing to the frozen denoiser's residual stream.
        for block in self.control_path:
            nn.init.constant_(block.weight, 0)
            nn.init.constant_(block.bias, 0)
224
+
225
+ def forward(self, *args, **kwargs):
226
+ stage = kwargs.pop('stage', None)
227
+ if stage == 'train':
228
+ return self._train_forward(*args, **kwargs)
229
+ elif stage == 'infer':
230
+ return self._infer_forward(*args, **kwargs)
231
+ elif stage == 'infer_std':
232
+ return self._infer_std_forward(*args, **kwargs)
233
+
234
    def _input_slat(self, x: sp.SparseTensor, emb: torch.Tensor,
                    input_layer: Callable, input_blocks: Callable,
                    pos_embedder: Callable, residual_h: Callable = None
                    ):
        """
        Project a sparse latent through an input layer and a stack of IO ResBlocks.

        Args:
            x: Input sparse tensor.
            emb: Conditioning embedding passed to each ResBlock.
            input_layer: Linear projection applied first (cast to the torso dtype).
            input_blocks: Iterable of (possibly downsampling) ResBlocks.
            pos_embedder: Optional absolute positional embedder (used when pe_mode == "ape").
            residual_h: Optional callable applied to the final features.

        Returns:
            (h, skips): the processed sparse tensor and the per-block feature
            tensors collected for skip connections.
        """
        h = input_layer(x).type(self.dtype)
        skips = []
        # pack with input blocks
        for block in input_blocks:
            h = block(h, emb)
            skips.append(h.feats)

        if self.pe_mode == "ape" and pos_embedder is not None:
            # Coordinates exclude column 0 (the batch index).
            h = h + pos_embedder(h.coords[:, 1:]).type(self.dtype)

        if residual_h is not None:
            h = residual_h(h)

        return h, skips
252
+
253
    def _train_forward(self, x: sp.SparseTensor, t: torch.Tensor, cond: Dict[str,torch.Tensor], vis_ratio: torch.Tensor,
                       forzen_denoiser: SLatFlowModel) -> Tuple[sp.SparseTensor, torch.Tensor]:
        """
        Training forward: run the frozen SLat denoiser while injecting control
        features, and compute an alignment loss against a reference pass driven
        by cond['std_cond_instance'].

        Args:
            x: Noisy sparse latent.
            t: Diffusion timesteps, embedded by the frozen denoiser.
            cond: Uses 'cond_scene', 'cond_instance', 'cond_instance_masked',
                'std_cond_instance', and 'cond_voxel_feats'.
            vis_ratio: Scalar condition embedded via vis_ratio_embedder.
            forzen_denoiser: The frozen SLatFlowModel being controlled.

        Returns:
            (denoised sparse latent, alignment loss averaged over block/batch pairs).
        """

        t_emb = forzen_denoiser.t_embedder(t)
        if forzen_denoiser.share_mod:
            t_emb = forzen_denoiser.adaLN_modulation(t_emb)
        t_emb = t_emb.type(self.dtype)

        # moge feats and image mask
        cond_moge = cond['cond_scene']
        cond_dino = cond['cond_instance']
        cond_dino_masked = cond['cond_instance_masked']
        std_cond_dino = cond['std_cond_instance']
        # voxels with projected feats
        x_feat = cond['cond_voxel_feats']

        # Cast conditions to the torso dtype (fp16 when use_fp16).
        cond_control = cond_moge
        cond_control = cond_control.type(self.dtype)
        cond_dino_masked = cond_dino_masked.type(self.dtype)
        cond_dino = cond_dino.type(self.dtype)
        std_cond_dino = std_cond_dino.type(self.dtype)

        vis_ratio_emb = self.vis_ratio_embedder(vis_ratio)
        vis_ratio_emb = vis_ratio_emb.type(self.dtype)

        # input layer of frozen part
        h, skips = self._input_slat(x, t_emb, self.input_layer,
                                    forzen_denoiser.input_blocks,
                                    forzen_denoiser.pos_embedder if self.pe_mode == "ape" else None)
        # input layer of frozen part

        # condition branch
        ctrl_h, _ = self._input_slat(x_feat, vis_ratio_emb,
                                     self.input_layer_cond, self.input_blocks,
                                     forzen_denoiser.pos_embedder if self.pe_mode == "ape" else None)
        # condition branch

        std_h = h
        align_loss = 0.0
        acount = 0
        for block_index, block in enumerate(forzen_denoiser.blocks):
            h = block(h, t_emb, cond_dino_masked)
            if block_index < self.num_blocks:
                # Control branch: evolve ctrl_h and add its projection to the stream.
                ctrl_h = self.blocks[block_index](ctrl_h, t_emb, vis_ratio_emb, cond_dino, cond_control)
                h = h + self.control_path[block_index](ctrl_h)

            # Reference (uncontrolled) branch.
            # NOTE(review): unlike the dense-grid counterpart, this is NOT under
            # torch.no_grad() — confirm whether gradients should flow here.
            std_h = block(std_h, t_emb, std_cond_dino)

            std_h: sp.SparseTensor
            h: sp.SparseTensor
            # Negative cosine similarity between controlled and reference features,
            # accumulated per batch element.
            for batch_std_h, batch_h in zip(sp.sparse_unbind(std_h, dim=0), sp.sparse_unbind(h, dim=0)):
                acount += 1
                reference_feats = batch_std_h.feats
                source_feats = batch_h.feats
                z_tilde_j = torch.nn.functional.normalize(source_feats, dim=-1, eps=1e-6)
                z_j = torch.nn.functional.normalize(reference_feats, dim=-1, eps=1e-6)
                align_loss += mean_flat(-(z_j * z_tilde_j).sum(dim=-1))
        align_loss /= acount

        # unpack with output blocks
        for block, skip in zip(forzen_denoiser.out_blocks, reversed(skips)):
            if self.use_skip_connection:
                h = block(h.replace(torch.cat([h.feats, skip], dim=1)), t_emb)
            else:
                h = block(h, t_emb)

        h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))
        h = forzen_denoiser.out_layer(h.type(x.dtype))
        return h, align_loss
322
+
323
+ def _infer_forward(self, x: sp.SparseTensor, t: torch.Tensor, cond: Dict[str,torch.Tensor], vis_ratio: torch.Tensor,
324
+ forzen_denoiser: SLatFlowModel) -> sp.SparseTensor:
325
+
326
+ t_emb = forzen_denoiser.t_embedder(t)
327
+ if forzen_denoiser.share_mod:
328
+ t_emb = forzen_denoiser.adaLN_modulation(t_emb)
329
+ t_emb = t_emb.type(self.dtype)
330
+
331
+ # moge feats and image mask
332
+ cond_moge = cond['cond_scene']
333
+ cond_dino = cond['cond_instance']
334
+ cond_dino_masked = cond['cond_instance_masked']
335
+ # voxels with projected feats
336
+ x_feat = cond['cond_voxel_feats']
337
+
338
+ neg_infer = cond.pop("neg_infer", False)
339
+
340
+ cond_control = cond_moge
341
+ cond_control = cond_control.type(self.dtype)
342
+ cond_dino = cond_dino.type(self.dtype)
343
+ cond_dino_masked = cond_dino_masked.type(self.dtype)
344
+
345
+ vis_ratio_emb = self.vis_ratio_embedder(vis_ratio)
346
+ vis_ratio_emb = vis_ratio_emb.type(self.dtype)
347
+
348
+ # input layer of frozen part
349
+ h, skips = self._input_slat(x, t_emb, self.input_layer,
350
+ forzen_denoiser.input_blocks,
351
+ forzen_denoiser.pos_embedder if self.pe_mode == "ape" else None)
352
+ # input layer of frozen part
353
+
354
+ # condition branch
355
+ if not neg_infer:
356
+ ctrl_h, _ = self._input_slat(x_feat, vis_ratio_emb, self.input_layer_cond,
357
+ forzen_denoiser.input_blocks,
358
+ forzen_denoiser.pos_embedder if self.pe_mode == "ape" else None)
359
+ # condition branch
360
+
361
+ for block_index, block in enumerate(forzen_denoiser.blocks):
362
+ h = block(h, t_emb, cond_dino_masked)
363
+ if not neg_infer:
364
+ if block_index < self.num_blocks:
365
+ ctrl_h = self.blocks[block_index](ctrl_h, t_emb, vis_ratio_emb, cond_dino, cond_control)
366
+ h = h + self.control_path[block_index](ctrl_h)
367
+
368
+ # unpack with output blocks
369
+ for block, skip in zip(forzen_denoiser.out_blocks, reversed(skips)):
370
+ if self.use_skip_connection:
371
+ h = block(h.replace(torch.cat([h.feats, skip], dim=1)), t_emb)
372
+ else:
373
+ h = block(h, t_emb)
374
+
375
+ h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))
376
+ h = forzen_denoiser.out_layer(h.type(x.dtype))
377
+ return h
378
+
379
+ def _infer_std_forward(self, x: sp.SparseTensor, t: torch.Tensor, cond: Dict[str,torch.Tensor], vis_ratio: torch.Tensor,
380
+ forzen_denoiser: SLatFlowModel) -> sp.SparseTensor:
381
+
382
+ t_emb = forzen_denoiser.t_embedder(t)
383
+ if forzen_denoiser.share_mod:
384
+ t_emb = forzen_denoiser.adaLN_modulation(t_emb)
385
+ t_emb = t_emb.type(self.dtype)
386
+
387
+ cond_dino = cond['std_cond_instance']
388
+ cond_dino = cond_dino.type(self.dtype)
389
+
390
+ # input layer of frozen part
391
+ h, skips = self._input_slat(x, t_emb, forzen_denoiser.input_layer,
392
+ forzen_denoiser.input_blocks,
393
+ forzen_denoiser.pos_embedder if self.pe_mode == "ape" else None)
394
+ # input layer of frozen part
395
+
396
+ for block_index, block in enumerate(forzen_denoiser.blocks):
397
+ h = block(h, t_emb, cond_dino)
398
+
399
+ # unpack with output blocks
400
+ for block, skip in zip(forzen_denoiser.out_blocks, reversed(skips)):
401
+ if self.use_skip_connection:
402
+ h = block(h.replace(torch.cat([h.feats, skip], dim=1)), t_emb)
403
+ else:
404
+ h = block(h, t_emb)
405
+
406
+ h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))
407
+ h = forzen_denoiser.out_layer(h.type(x.dtype))
408
+ return h
409
+
410
class ElasticSceneSLatFlowModel(SparseTransformerElasticMixin, SceneSLatFlowModel):
    """
    Scene SLat Flow Model with elastic memory management.

    Mixes SparseTransformerElasticMixin into SceneSLatFlowModel so gradient
    checkpointing can be enabled on a fraction of the transformer blocks,
    trading compute for VRAM during training. No new behavior is added here.
    """
    pass
threeDFixer/models/sparse_elastic_mixin.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from contextlib import contextmanager
7
+ from typing import *
8
+ import math
9
+ from ..modules import sparse as sp
10
+ from ..utils.elastic_utils import ElasticModuleMixin
11
+
12
+
13
class SparseTransformerElasticMixin(ElasticModuleMixin):
    """
    Elastic-memory mixin for sparse transformer models.

    Reports the model's "input size" as the number of active voxels and
    temporarily enables gradient checkpointing on a leading fraction of
    ``self.blocks`` proportional to the requested memory reduction.
    """

    def _get_input_size(self, x: sp.SparseTensor, *args, **kwargs):
        # Memory scales with the number of sparse entries, not batch size.
        return x.feats.shape[0]

    @contextmanager
    def with_mem_ratio(self, mem_ratio=1.0):
        """
        Context manager that checkpoints enough blocks to approximate
        ``mem_ratio`` of full activation memory; yields the ratio actually
        achieved (quantized to whole blocks).
        """
        if mem_ratio == 1.0:
            yield 1.0
            return
        num_blocks = len(self.blocks)
        # At least one block is checkpointed once mem_ratio < 1.
        num_checkpoint_blocks = min(math.ceil((1 - mem_ratio) * num_blocks) + 1, num_blocks)
        exact_mem_ratio = 1 - (num_checkpoint_blocks - 1) / num_blocks
        for i in range(num_blocks):
            self.blocks[i].use_checkpoint = i < num_checkpoint_blocks
        # BUGFIX: restore flags in a finally block so an exception raised
        # inside the `with` body cannot leave checkpointing permanently on.
        try:
            yield exact_mem_ratio
        finally:
            for i in range(num_blocks):
                self.blocks[i].use_checkpoint = False
threeDFixer/models/sparse_structure_flow.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from https://github.com/microsoft/TRELLIS
2
+ # Original license: MIT
3
+ # Copyright (c) the TRELLIS authors
4
+ # Minor modifications by Ze-Xin Yin and Robot labs of Horizon Robotics, 2026.
5
+
6
+ from typing import *
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ import numpy as np
11
+ from . import from_pretrained
12
+ from ..modules.utils import convert_module_to_f16, convert_module_to_f32
13
+ from ..modules.transformer import AbsolutePositionEmbedder, ModulatedTransformerCrossBlock
14
+ from ..modules.spatial import patchify, unpatchify
15
+
16
+
17
+ class TimestepEmbedder(nn.Module):
18
+ """
19
+ Embeds scalar timesteps into vector representations.
20
+ """
21
+ def __init__(self, hidden_size, frequency_embedding_size=256):
22
+ super().__init__()
23
+ self.mlp = nn.Sequential(
24
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
25
+ nn.SiLU(),
26
+ nn.Linear(hidden_size, hidden_size, bias=True),
27
+ )
28
+ self.frequency_embedding_size = frequency_embedding_size
29
+
30
+ @staticmethod
31
+ def timestep_embedding(t, dim, max_period=10000):
32
+ """
33
+ Create sinusoidal timestep embeddings.
34
+
35
+ Args:
36
+ t: a 1-D Tensor of N indices, one per batch element.
37
+ These may be fractional.
38
+ dim: the dimension of the output.
39
+ max_period: controls the minimum frequency of the embeddings.
40
+
41
+ Returns:
42
+ an (N, D) Tensor of positional embeddings.
43
+ """
44
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
45
+ half = dim // 2
46
+ freqs = torch.exp(
47
+ -np.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
48
+ ).to(device=t.device)
49
+ args = t[:, None].float() * freqs[None]
50
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
51
+ if dim % 2:
52
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
53
+ return embedding
54
+
55
+ def forward(self, t):
56
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
57
+ t_emb = self.mlp(t_freq)
58
+ return t_emb
59
+
60
+
61
class SparseStructureFlowModel(nn.Module):
    """
    Flow-matching transformer over dense patchified sparse-structure latents.

    The input volume (B, in_channels, R, R, R) is patchified into tokens,
    processed by timestep-modulated cross-attention transformer blocks
    conditioned on `cond`, and un-patchified back to a volume of
    out_channels. Optionally loads pretrained weights and runs the torso
    in fp16.
    """
    def __init__(
        self,
        resolution: int,
        in_channels: int,
        model_channels: int,
        cond_channels: int,
        out_channels: int,
        num_blocks: int,
        num_heads: Optional[int] = None,
        num_head_channels: Optional[int] = 64,
        mlp_ratio: float = 4,
        patch_size: int = 2,
        pe_mode: Literal["ape", "rope"] = "ape",
        use_fp16: bool = False,
        use_checkpoint: bool = False,
        share_mod: bool = False,
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
        pretrained_ss_flow_dit: str = None,  # .pt checkpoint path, or an id resolvable by from_pretrained
    ):
        super().__init__()
        self.resolution = resolution
        self.in_channels = in_channels
        self.model_channels = model_channels
        self.cond_channels = cond_channels
        self.out_channels = out_channels
        self.num_blocks = num_blocks
        # If num_heads is not given, derive it from per-head channel width.
        self.num_heads = num_heads or model_channels // num_head_channels
        self.mlp_ratio = mlp_ratio
        self.patch_size = patch_size
        self.pe_mode = pe_mode
        self.use_fp16 = use_fp16
        self.use_checkpoint = use_checkpoint
        self.share_mod = share_mod
        self.qk_rms_norm = qk_rms_norm
        self.qk_rms_norm_cross = qk_rms_norm_cross
        self.dtype = torch.float16 if use_fp16 else torch.float32

        self.t_embedder = TimestepEmbedder(model_channels)
        # share_mod: one adaLN modulation shared across all blocks instead of
        # per-block modulation layers.
        if share_mod:
            self.adaLN_modulation = nn.Sequential(
                nn.SiLU(),
                nn.Linear(model_channels, 6 * model_channels, bias=True)
            )

        if pe_mode == "ape":
            # Absolute positional embedding, precomputed once for the full
            # patch grid and stored as a (non-trainable) buffer.
            pos_embedder = AbsolutePositionEmbedder(model_channels, 3)
            coords = torch.meshgrid(*[torch.arange(res, device=self.device) for res in [resolution // patch_size] * 3], indexing='ij')
            coords = torch.stack(coords, dim=-1).reshape(-1, 3)
            pos_emb = pos_embedder(coords)
            self.register_buffer("pos_emb", pos_emb)

        # One patch (patch_size^3 voxels) -> one token.
        self.input_layer = nn.Linear(in_channels * patch_size**3, model_channels)

        self.blocks = nn.ModuleList([
            ModulatedTransformerCrossBlock(
                model_channels,
                cond_channels,
                num_heads=self.num_heads,
                mlp_ratio=self.mlp_ratio,
                attn_mode='full',
                use_checkpoint=self.use_checkpoint,
                use_rope=(pe_mode == "rope"),
                share_mod=share_mod,
                qk_rms_norm=self.qk_rms_norm,
                qk_rms_norm_cross=self.qk_rms_norm_cross,
            )
            for _ in range(num_blocks)
        ])

        self.out_layer = nn.Linear(model_channels, out_channels * patch_size**3)

        # Order matters: random init first, then (optionally) overwrite with
        # pretrained weights, then fp16 conversion.
        self.initialize_weights()
        if pretrained_ss_flow_dit is not None:
            if pretrained_ss_flow_dit.endswith('.pt'):
                print (f'loading pretrained weight: {pretrained_ss_flow_dit}')
                model_ckpt = torch.load(pretrained_ss_flow_dit, map_location='cpu', weights_only=True)
                self.load_state_dict(model_ckpt)
                del model_ckpt
            else:
                print (f'loading pretrained weight: {pretrained_ss_flow_dit}')
                pre_trained_models = from_pretrained(pretrained_ss_flow_dit)
                pre_trained_models: SparseStructureFlowModel
                self.load_state_dict(pre_trained_models.state_dict())
                del pre_trained_models
        if use_fp16:
            self.convert_to_fp16()

    @property
    def device(self) -> torch.device:
        """
        Return the device of the model.
        """
        return next(self.parameters()).device

    def convert_to_fp16(self) -> None:
        """
        Convert the torso of the model to float16.
        """
        self.blocks.apply(convert_module_to_f16)

    def convert_to_fp32(self) -> None:
        """
        Convert the torso of the model to float32.
        """
        self.blocks.apply(convert_module_to_f32)

    def initialize_weights(self) -> None:
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        if self.share_mod:
            nn.init.constant_(self.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(self.adaLN_modulation[-1].bias, 0)
        else:
            for block in self.blocks:
                nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
                nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.out_layer.weight, 0)
        nn.init.constant_(self.out_layer.bias, 0)

    def forward(self, x: torch.Tensor, t: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        """
        Predict the flow for volume `x` at timesteps `t` under condition `cond`.
        Input and output are (B, C, R, R, R) volumes in x's dtype.
        """
        assert [*x.shape] == [x.shape[0], self.in_channels, *[self.resolution] * 3], \
            f"Input shape mismatch, got {x.shape}, expected {[x.shape[0], self.in_channels, *[self.resolution] * 3]}"

        # (B, C, R, R, R) -> (B, C*p^3, N) -> (B, N, C*p^3) token sequence.
        h = patchify(x, self.patch_size)
        h = h.view(*h.shape[:2], -1).permute(0, 2, 1).contiguous()

        h = self.input_layer(h)
        h = h + self.pos_emb[None]
        t_emb = self.t_embedder(t)
        if self.share_mod:
            t_emb = self.adaLN_modulation(t_emb)
        t_emb = t_emb.type(self.dtype)
        h = h.type(self.dtype)
        cond = cond.type(self.dtype)
        for block in self.blocks:
            h = block(h, t_emb, cond)
        # Back to input dtype before the (fp32) output head.
        h = h.type(x.dtype)
        h = F.layer_norm(h, h.shape[-1:])
        h = self.out_layer(h)

        # Tokens -> volume. Note: view here only *splits* the token dim, which
        # is stride-compatible even after permute.
        h = h.permute(0, 2, 1).view(h.shape[0], h.shape[2], *[self.resolution // self.patch_size] * 3)
        h = unpatchify(h, self.patch_size).contiguous()

        return h
threeDFixer/models/sparse_structure_vae.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from https://github.com/microsoft/TRELLIS
2
+ # Original license: MIT
3
+ # Copyright (c) the TRELLIS authors
4
+ # Minor modifications by Ze-Xin Yin and Robot labs of Horizon Robotics, 2026.
5
+
6
+ from typing import *
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from ..modules.norm import GroupNorm32, ChannelLayerNorm32
11
+ from ..modules.spatial import pixel_shuffle_3d
12
+ from ..modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
13
+ from . import from_pretrained
14
+
15
+
16
def norm_layer(norm_type: str, *args, **kwargs) -> nn.Module:
    """
    Build a normalization layer by name.

    "group" yields a 32-group GroupNorm32; "layer" yields a channel-wise
    ChannelLayerNorm32. Extra args/kwargs are forwarded to the layer.

    Raises:
        ValueError: if `norm_type` is neither "group" nor "layer".
    """
    if norm_type == "layer":
        return ChannelLayerNorm32(*args, **kwargs)
    if norm_type == "group":
        # GroupNorm32 fixes the group count at 32.
        return GroupNorm32(32, *args, **kwargs)
    raise ValueError(f"Invalid norm type {norm_type}")
26
+
27
+
28
class ResBlock3d(nn.Module):
    """
    Pre-activation 3D residual block: two (norm -> SiLU -> conv3x3) stages
    plus a skip path, with the second conv zero-initialized so the block
    starts as an identity-plus-skip mapping.
    """
    def __init__(
        self,
        channels: int,
        out_channels: Optional[int] = None,
        norm_type: Literal["group", "layer"] = "layer",
    ):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels

        self.norm1 = norm_layer(norm_type, channels)
        self.norm2 = norm_layer(norm_type, self.out_channels)
        self.conv1 = nn.Conv3d(channels, self.out_channels, 3, padding=1)
        # Zero-init keeps the residual branch silent at the start of training.
        self.conv2 = zero_module(nn.Conv3d(self.out_channels, self.out_channels, 3, padding=1))
        # 1x1 conv only when the channel count changes; identity otherwise.
        self.skip_connection = nn.Conv3d(channels, self.out_channels, 1) if channels != self.out_channels else nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply both conv stages and add the (possibly projected) input."""
        out = self.conv1(F.silu(self.norm1(x)))
        out = self.conv2(F.silu(self.norm2(out)))
        return out + self.skip_connection(x)
54
+
55
+
56
class DownsampleBlock3d(nn.Module):
    """
    Halve every spatial dimension of a 5D volume, either with a strided
    2x2x2 convolution ("conv", may change channel count) or with average
    pooling ("avgpool", channel-preserving only).
    """
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        mode: Literal["conv", "avgpool"] = "conv",
    ):
        assert mode in ["conv", "avgpool"], f"Invalid mode {mode}"

        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels

        if mode == "conv":
            # kernel == stride == 2: non-overlapping learned downsampling.
            self.conv = nn.Conv3d(in_channels, out_channels, 2, stride=2)
        elif mode == "avgpool":
            assert in_channels == out_channels, "Pooling mode requires in_channels to be equal to out_channels"

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return x downsampled by 2 along each spatial axis."""
        conv = getattr(self, "conv", None)
        if conv is None:
            return F.avg_pool3d(x, 2)
        return conv(x)
79
+
80
+
81
class UpsampleBlock3d(nn.Module):
    """
    Double every spatial dimension of a 5D volume, either with a conv +
    3D pixel-shuffle ("conv", may change channel count) or with nearest
    interpolation ("nearest", channel-preserving only).
    """
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        mode: Literal["conv", "nearest"] = "conv",
    ):
        assert mode in ["conv", "nearest"], f"Invalid mode {mode}"

        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels

        if mode == "conv":
            # 8x channels so a factor-2 3D pixel shuffle lands on out_channels.
            self.conv = nn.Conv3d(in_channels, out_channels*8, 3, padding=1)
        elif mode == "nearest":
            assert in_channels == out_channels, "Nearest mode requires in_channels to be equal to out_channels"

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return x upsampled by 2 along each spatial axis."""
        if not hasattr(self, "conv"):
            return F.interpolate(x, scale_factor=2, mode="nearest")
        return pixel_shuffle_3d(self.conv(x), 2)
105
+
106
+
107
class SparseStructureEncoder(nn.Module):
    """
    Encoder for Sparse Structure (\mathcal{E}_S in the paper Sec. 3.3).

    Maps a dense occupancy volume to a Gaussian latent (mean/logvar), with
    optional reparameterized sampling.

    Args:
        in_channels (int): Channels of the input.
        latent_channels (int): Channels of the latent representation.
        num_res_blocks (int): Number of residual blocks at each resolution.
        channels (List[int]): Channels of the encoder blocks.
        num_res_blocks_middle (int): Number of residual blocks in the middle.
        norm_type (Literal["group", "layer"]): Type of normalization layer.
        use_fp16 (bool): Whether to use FP16.
        pretrained_ss_enc (str): Optional pretrained weights — a .pt
            checkpoint path, or an identifier resolvable by from_pretrained.
    """
    def __init__(
        self,
        in_channels: int,
        latent_channels: int,
        num_res_blocks: int,
        channels: List[int],
        num_res_blocks_middle: int = 2,
        norm_type: Literal["group", "layer"] = "layer",
        use_fp16: bool = False,
        pretrained_ss_enc: str = None,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.latent_channels = latent_channels
        self.num_res_blocks = num_res_blocks
        self.channels = channels
        self.num_res_blocks_middle = num_res_blocks_middle
        self.norm_type = norm_type
        self.use_fp16 = use_fp16
        self.dtype = torch.float16 if use_fp16 else torch.float32

        self.input_layer = nn.Conv3d(in_channels, channels[0], 3, padding=1)

        # Per-resolution residual stacks, each stage followed by a 2x
        # downsample except the last.
        self.blocks = nn.ModuleList([])
        for i, ch in enumerate(channels):
            self.blocks.extend([
                ResBlock3d(ch, ch)
                for _ in range(num_res_blocks)
            ])
            if i < len(channels) - 1:
                self.blocks.append(
                    DownsampleBlock3d(ch, channels[i+1])
                )

        self.middle_block = nn.Sequential(*[
            ResBlock3d(channels[-1], channels[-1])
            for _ in range(num_res_blocks_middle)
        ])

        # Head emits 2*latent_channels: mean and logvar stacked on dim 1.
        self.out_layer = nn.Sequential(
            norm_layer(norm_type, channels[-1]),
            nn.SiLU(),
            nn.Conv3d(channels[-1], latent_channels*2, 3, padding=1)
        )

        if pretrained_ss_enc is not None:
            if pretrained_ss_enc.endswith('.pt'):
                print (f'loading pretrained weight: {pretrained_ss_enc}')
                model_ckpt = torch.load(pretrained_ss_enc, map_location='cpu', weights_only=True)
                self.load_state_dict(model_ckpt)
                del model_ckpt
            else:
                print (f'loading pretrained weight: {pretrained_ss_enc}')
                pre_trained_models = from_pretrained(pretrained_ss_enc)
                pre_trained_models: SparseStructureEncoder
                self.load_state_dict(pre_trained_models.state_dict())
                del pre_trained_models
        if use_fp16:
            self.convert_to_fp16()

    @property
    def device(self) -> torch.device:
        """
        Return the device of the model.
        """
        return next(self.parameters()).device

    def convert_to_fp16(self) -> None:
        """
        Convert the torso of the model to float16.
        """
        self.use_fp16 = True
        self.dtype = torch.float16
        self.blocks.apply(convert_module_to_f16)
        self.middle_block.apply(convert_module_to_f16)

    def convert_to_fp32(self) -> None:
        """
        Convert the torso of the model to float32.
        """
        self.use_fp16 = False
        self.dtype = torch.float32
        self.blocks.apply(convert_module_to_f32)
        self.middle_block.apply(convert_module_to_f32)

    def forward(self, x: torch.Tensor, sample_posterior: bool = False, return_raw: bool = False) -> torch.Tensor:
        """
        Encode `x` to a latent.

        Args:
            x: input volume (channels-first 5D tensor).
            sample_posterior: if True, reparameterized sample; else the mean.
            return_raw: if True, also return (mean, logvar).
        """
        h = self.input_layer(x)
        # Torso runs in self.dtype (fp16 when converted); in/out layers stay
        # in the input dtype.
        h = h.type(self.dtype)

        for block in self.blocks:
            h = block(h)
        h = self.middle_block(h)

        h = h.type(x.dtype)
        h = self.out_layer(h)

        mean, logvar = h.chunk(2, dim=1)

        if sample_posterior:
            # Reparameterization trick: z = mu + sigma * eps.
            std = torch.exp(0.5 * logvar)
            z = mean + std * torch.randn_like(std)
        else:
            z = mean

        if return_raw:
            return z, mean, logvar
        return z
227
+
228
+
229
class SparseStructureDecoder(nn.Module):
    """
    Decoder for Sparse Structure (\mathcal{D}_S in the paper Sec. 3.3).

    Mirror of the encoder: maps a latent volume back to an occupancy volume,
    upsampling by 2x between channel stages.

    Args:
        out_channels (int): Channels of the output.
        latent_channels (int): Channels of the latent representation.
        num_res_blocks (int): Number of residual blocks at each resolution.
        channels (List[int]): Channels of the decoder blocks.
        num_res_blocks_middle (int): Number of residual blocks in the middle.
        norm_type (Literal["group", "layer"]): Type of normalization layer.
        use_fp16 (bool): Whether to use FP16.
    """
    def __init__(
        self,
        out_channels: int,
        latent_channels: int,
        num_res_blocks: int,
        channels: List[int],
        num_res_blocks_middle: int = 2,
        norm_type: Literal["group", "layer"] = "layer",
        use_fp16: bool = False,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.latent_channels = latent_channels
        self.num_res_blocks = num_res_blocks
        self.channels = channels
        self.num_res_blocks_middle = num_res_blocks_middle
        self.norm_type = norm_type
        self.use_fp16 = use_fp16
        self.dtype = torch.float16 if use_fp16 else torch.float32

        self.input_layer = nn.Conv3d(latent_channels, channels[0], 3, padding=1)

        self.middle_block = nn.Sequential(*[
            ResBlock3d(channels[0], channels[0])
            for _ in range(num_res_blocks_middle)
        ])

        # Per-resolution residual stacks, each stage followed by a 2x
        # upsample except the last.
        self.blocks = nn.ModuleList([])
        for i, ch in enumerate(channels):
            self.blocks.extend([
                ResBlock3d(ch, ch)
                for _ in range(num_res_blocks)
            ])
            if i < len(channels) - 1:
                self.blocks.append(
                    UpsampleBlock3d(ch, channels[i+1])
                )

        self.out_layer = nn.Sequential(
            norm_layer(norm_type, channels[-1]),
            nn.SiLU(),
            nn.Conv3d(channels[-1], out_channels, 3, padding=1)
        )

        if use_fp16:
            self.convert_to_fp16()

    @property
    def device(self) -> torch.device:
        """
        Return the device of the model.
        """
        return next(self.parameters()).device

    def convert_to_fp16(self) -> None:
        """
        Convert the torso of the model to float16.
        """
        self.use_fp16 = True
        self.dtype = torch.float16
        self.blocks.apply(convert_module_to_f16)
        self.middle_block.apply(convert_module_to_f16)

    def convert_to_fp32(self) -> None:
        """
        Convert the torso of the model to float32.
        """
        self.use_fp16 = False
        self.dtype = torch.float32
        self.blocks.apply(convert_module_to_f32)
        self.middle_block.apply(convert_module_to_f32)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Decode latent volume `x` to an out_channels volume in x's dtype."""
        h = self.input_layer(x)

        # Torso runs in self.dtype; in/out layers stay in the input dtype.
        h = h.type(self.dtype)

        h = self.middle_block(h)
        for block in self.blocks:
            h = block(h)

        h = h.type(x.dtype)
        h = self.out_layer(h)
        return h
threeDFixer/models/structured_latent_flow.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from https://github.com/microsoft/TRELLIS
2
+ # Original license: MIT
3
+ # Copyright (c) the TRELLIS authors
4
+ # Minor modifications by Ze-Xin Yin and Robot labs of Horizon Robotics, 2026.
5
+
6
+ from typing import *
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ import numpy as np
11
+ from ..modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
12
+ from ..modules.transformer import AbsolutePositionEmbedder
13
+ from ..modules.norm import LayerNorm32
14
+ from ..modules import sparse as sp
15
+ from ..modules.sparse.transformer import ModulatedSparseTransformerCrossBlock
16
+ from .sparse_structure_flow import TimestepEmbedder
17
+ from .sparse_elastic_mixin import SparseTransformerElasticMixin
18
+ from . import from_pretrained
19
+
20
+
21
class SparseResBlock3d(nn.Module):
    """
    Residual block on sparse 3D tensors with timestep-embedding modulation.

    Structure: optional 2x down/upsample, then norm -> SiLU -> conv,
    scale/shift modulation from the embedding (FiLM-style), norm -> SiLU ->
    zero-init conv, plus a (linear) skip connection.
    """
    def __init__(
        self,
        channels: int,
        emb_channels: int,
        out_channels: Optional[int] = None,
        downsample: bool = False,
        upsample: bool = False,
    ):
        super().__init__()
        self.channels = channels
        self.emb_channels = emb_channels
        self.out_channels = out_channels or channels
        self.downsample = downsample
        self.upsample = upsample

        assert not (downsample and upsample), "Cannot downsample and upsample at the same time"

        # norm2 has no affine params: its scale/shift come from emb_layers.
        self.norm1 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
        self.norm2 = LayerNorm32(self.out_channels, elementwise_affine=False, eps=1e-6)
        self.conv1 = sp.SparseConv3d(channels, self.out_channels, 3)
        # Zero-init so the residual branch is silent at initialization.
        self.conv2 = zero_module(sp.SparseConv3d(self.out_channels, self.out_channels, 3))
        self.emb_layers = nn.Sequential(
            nn.SiLU(),
            nn.Linear(emb_channels, 2 * self.out_channels, bias=True),
        )
        self.skip_connection = sp.SparseLinear(channels, self.out_channels) if channels != self.out_channels else nn.Identity()
        self.updown = None
        if self.downsample:
            self.updown = sp.SparseDownsample(2)
        elif self.upsample:
            self.updown = sp.SparseUpsample(2)

    def _updown(self, x: sp.SparseTensor) -> sp.SparseTensor:
        # Apply the resolution change (if any) before both branches.
        if self.updown is not None:
            x = self.updown(x)
        return x

    def forward(self, x: sp.SparseTensor, emb: torch.Tensor) -> sp.SparseTensor:
        """
        Args:
            x: sparse input features.
            emb: per-sample embedding (e.g. timestep) producing scale/shift.
        """
        emb_out = self.emb_layers(emb).type(x.dtype)
        scale, shift = torch.chunk(emb_out, 2, dim=1)

        x = self._updown(x)
        h = x.replace(self.norm1(x.feats))
        h = h.replace(F.silu(h.feats))
        h = self.conv1(h)
        # Modulate the (affine-free) normalized features; SparseTensor
        # arithmetic presumably broadcasts scale/shift per sample — see the
        # sparse module for the exact semantics.
        h = h.replace(self.norm2(h.feats)) * (1 + scale) + shift
        h = h.replace(F.silu(h.feats))
        h = self.conv2(h)
        h = h + self.skip_connection(x)

        return h
73
+
74
+
75
class SLatFlowModel(nn.Module):
    """
    Flow-matching transformer over structured (sparse) latents.

    A U-Net-shaped model: optional sparse ResBlock input stages downsample
    by the patch size, a torso of timestep-modulated cross-attention
    transformer blocks conditions on `cond`, and mirrored output stages
    upsample back, optionally with skip connections. Supports loading
    pretrained weights and fp16 torso execution.
    """
    def __init__(
        self,
        resolution: int,
        in_channels: int,
        model_channels: int,
        cond_channels: int,
        out_channels: int,
        num_blocks: int,
        num_heads: Optional[int] = None,
        num_head_channels: Optional[int] = 64,
        mlp_ratio: float = 4,
        patch_size: int = 2,
        num_io_res_blocks: int = 2,
        io_block_channels: List[int] = None,
        pe_mode: Literal["ape", "rope"] = "ape",
        use_fp16: bool = False,
        use_checkpoint: bool = False,
        use_skip_connection: bool = True,
        share_mod: bool = False,
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
        pretrained_flow_dit: str = None,  # .pt checkpoint path, or an id resolvable by from_pretrained
    ):
        super().__init__()
        self.resolution = resolution
        self.in_channels = in_channels
        self.model_channels = model_channels
        self.cond_channels = cond_channels
        self.out_channels = out_channels
        self.num_blocks = num_blocks
        # If num_heads is not given, derive it from per-head channel width.
        self.num_heads = num_heads or model_channels // num_head_channels
        self.mlp_ratio = mlp_ratio
        self.patch_size = patch_size
        self.num_io_res_blocks = num_io_res_blocks
        self.io_block_channels = io_block_channels
        self.pe_mode = pe_mode
        self.use_fp16 = use_fp16
        self.use_checkpoint = use_checkpoint
        self.use_skip_connection = use_skip_connection
        self.share_mod = share_mod
        self.qk_rms_norm = qk_rms_norm
        self.qk_rms_norm_cross = qk_rms_norm_cross
        self.dtype = torch.float16 if use_fp16 else torch.float32

        if self.io_block_channels is not None:
            # One IO stage per factor-of-2 in patch_size.
            assert int(np.log2(patch_size)) == np.log2(patch_size), "Patch size must be a power of 2"
            assert np.log2(patch_size) == len(io_block_channels), "Number of IO ResBlocks must match the number of stages"

        self.t_embedder = TimestepEmbedder(model_channels)
        # share_mod: one adaLN modulation shared across all blocks.
        if share_mod:
            self.adaLN_modulation = nn.Sequential(
                nn.SiLU(),
                nn.Linear(model_channels, 6 * model_channels, bias=True)
            )

        if pe_mode == "ape":
            self.pos_embedder = AbsolutePositionEmbedder(model_channels)

        self.input_layer = sp.SparseLinear(in_channels, model_channels if io_block_channels is None else io_block_channels[0])

        # Downsampling input stages: (num_io_res_blocks-1) same-res blocks,
        # then one downsampling block per stage.
        self.input_blocks = nn.ModuleList([])
        if io_block_channels is not None:
            for chs, next_chs in zip(io_block_channels, io_block_channels[1:] + [model_channels]):
                self.input_blocks.extend([
                    SparseResBlock3d(
                        chs,
                        model_channels,
                        out_channels=chs,
                    )
                    for _ in range(num_io_res_blocks-1)
                ])
                self.input_blocks.append(
                    SparseResBlock3d(
                        chs,
                        model_channels,
                        out_channels=next_chs,
                        downsample=True,
                    )
                )

        self.blocks = nn.ModuleList([
            ModulatedSparseTransformerCrossBlock(
                model_channels,
                cond_channels,
                num_heads=self.num_heads,
                mlp_ratio=self.mlp_ratio,
                attn_mode='full',
                use_checkpoint=self.use_checkpoint,
                use_rope=(pe_mode == "rope"),
                share_mod=self.share_mod,
                qk_rms_norm=self.qk_rms_norm,
                qk_rms_norm_cross=self.qk_rms_norm_cross,
            )
            for _ in range(num_blocks)
        ])

        # Mirrored upsampling output stages; channel widths doubled when
        # skip connections concatenate encoder features.
        self.out_blocks = nn.ModuleList([])
        if io_block_channels is not None:
            for chs, prev_chs in zip(reversed(io_block_channels), [model_channels] + list(reversed(io_block_channels[1:]))):
                self.out_blocks.append(
                    SparseResBlock3d(
                        prev_chs * 2 if self.use_skip_connection else prev_chs,
                        model_channels,
                        out_channels=chs,
                        upsample=True,
                    )
                )
                self.out_blocks.extend([
                    SparseResBlock3d(
                        chs * 2 if self.use_skip_connection else chs,
                        model_channels,
                        out_channels=chs,
                    )
                    for _ in range(num_io_res_blocks-1)
                ])

        self.out_layer = sp.SparseLinear(model_channels if io_block_channels is None else io_block_channels[0], out_channels)

        # Order matters: random init first, then (optionally) overwrite with
        # pretrained weights, then fp16 conversion.
        self.initialize_weights()
        if pretrained_flow_dit is not None:
            if pretrained_flow_dit.endswith('.pt'):
                print (f'loading pretrained weight: {pretrained_flow_dit}')
                model_ckpt = torch.load(pretrained_flow_dit, map_location='cpu', weights_only=True)
                self.load_state_dict(model_ckpt)
                del model_ckpt
            else:
                print (f'loading pretrained weight: {pretrained_flow_dit}')
                pre_trained_models: SLatFlowModel
                pre_trained_models = from_pretrained(pretrained_flow_dit)
                self.load_state_dict(pre_trained_models.state_dict())
                del pre_trained_models
        if use_fp16:
            self.convert_to_fp16()

    @property
    def device(self) -> torch.device:
        """
        Return the device of the model.
        """
        return next(self.parameters()).device

    def convert_to_fp16(self) -> None:
        """
        Convert the torso of the model to float16.
        """
        self.input_blocks.apply(convert_module_to_f16)
        self.blocks.apply(convert_module_to_f16)
        self.out_blocks.apply(convert_module_to_f16)

    def convert_to_fp32(self) -> None:
        """
        Convert the torso of the model to float32.
        """
        self.input_blocks.apply(convert_module_to_f32)
        self.blocks.apply(convert_module_to_f32)
        self.out_blocks.apply(convert_module_to_f32)

    def initialize_weights(self) -> None:
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        if self.share_mod:
            nn.init.constant_(self.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(self.adaLN_modulation[-1].bias, 0)
        else:
            for block in self.blocks:
                nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
                nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.out_layer.weight, 0)
        nn.init.constant_(self.out_layer.bias, 0)

    def forward(self, x: sp.SparseTensor, t: torch.Tensor, cond: torch.Tensor) -> sp.SparseTensor:
        """
        Predict the flow for sparse latent `x` at timesteps `t` under `cond`.
        Returns a sparse tensor in x's dtype.
        """
        h = self.input_layer(x).type(self.dtype)
        t_emb = self.t_embedder(t)
        if self.share_mod:
            t_emb = self.adaLN_modulation(t_emb)
        t_emb = t_emb.type(self.dtype)
        cond = cond.type(self.dtype)

        skips = []
        # pack with input blocks
        for block in self.input_blocks:
            h = block(h, t_emb)
            skips.append(h.feats)

        if self.pe_mode == "ape":
            # coords[:, 0] is the batch index; positions use coords[:, 1:].
            h = h + self.pos_embedder(h.coords[:, 1:]).type(self.dtype)
        for block in self.blocks:
            h = block(h, t_emb, cond)

        # unpack with output blocks
        for block, skip in zip(self.out_blocks, reversed(skips)):
            if self.use_skip_connection:
                h = block(h.replace(torch.cat([h.feats, skip], dim=1)), t_emb)
            else:
                h = block(h, t_emb)

        h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))
        h = self.out_layer(h.type(x.dtype))
        return h
288
+
289
+
290
class ElasticSLatFlowModel(SparseTransformerElasticMixin, SLatFlowModel):
    """
    SLat Flow Model with elastic memory management.

    Mixes SparseTransformerElasticMixin into SLatFlowModel so gradient
    checkpointing can be enabled on a fraction of the transformer blocks,
    trading compute for VRAM. Used for training with low VRAM.
    """
    pass
threeDFixer/models/structured_latent_vae/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from .encoder import SLatEncoder, ElasticSLatEncoder
7
+ from .decoder_gs import SLatGaussianDecoder, ElasticSLatGaussianDecoder
8
+ from .decoder_rf import SLatRadianceFieldDecoder, ElasticSLatRadianceFieldDecoder
9
+ from .decoder_mesh import SLatMeshDecoder, ElasticSLatMeshDecoder
threeDFixer/models/structured_latent_vae/base.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from typing import *
7
+ import torch
8
+ import torch.nn as nn
9
+ from ...modules.utils import convert_module_to_f16, convert_module_to_f32
10
+ from ...modules import sparse as sp
11
+ from ...modules.transformer import AbsolutePositionEmbedder
12
+ from ...modules.sparse.transformer import SparseTransformerBlock
13
+
14
+
15
def block_attn_config(self):
    """
    Yield the per-block attention configuration of the model.

    For each of ``self.num_blocks`` transformer blocks, yields a tuple
    ``(attn_mode, window_size, shift_sequence, shift_window, serialize_mode)``
    suitable for constructing a ``SparseTransformerBlock``.

    Raises:
        ValueError: If ``self.attn_mode`` is not one of the supported modes.
    """
    for i in range(self.num_blocks):
        if self.attn_mode == "shift_window":
            # Alternate blocks shift the 3D window by 16 voxels on each axis.
            yield "serialized", self.window_size, 0, (16 * (i % 2),) * 3, sp.SerializeMode.Z_ORDER
        elif self.attn_mode == "shift_sequence":
            # Alternate blocks shift the serialized sequence by half a window.
            yield "serialized", self.window_size, self.window_size // 2 * (i % 2), (0, 0, 0), sp.SerializeMode.Z_ORDER
        elif self.attn_mode == "shift_order":
            # Cycle through the four serialization orders across blocks.
            yield "serialized", self.window_size, 0, (0, 0, 0), sp.SerializeModes[i % 4]
        elif self.attn_mode == "full":
            yield "full", None, None, None, None
        elif self.attn_mode == "swin":
            # Swin-style windowed attention with alternating half-window shifts.
            yield "windowed", self.window_size, None, self.window_size // 2 * (i % 2), None
        else:
            # Previously an unknown mode silently yielded nothing, producing a
            # model with zero transformer blocks; fail loudly instead.
            raise ValueError(f"Unknown attention mode: {self.attn_mode}")
30
+
31
+
32
class SparseTransformerBase(nn.Module):
    """
    Sparse Transformer without output layers.
    Serve as the base class for encoder and decoder.
    """
    def __init__(
        self,
        in_channels: int,
        model_channels: int,
        num_blocks: int,
        num_heads: Optional[int] = None,
        num_head_channels: Optional[int] = 64,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
        window_size: Optional[int] = None,
        pe_mode: Literal["ape", "rope"] = "ape",
        use_fp16: bool = False,
        use_checkpoint: bool = False,
        qk_rms_norm: bool = False,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.model_channels = model_channels
        self.num_blocks = num_blocks
        self.window_size = window_size
        # Derive the head count from the per-head width when not given explicitly.
        self.num_heads = num_heads or model_channels // num_head_channels
        self.mlp_ratio = mlp_ratio
        self.attn_mode = attn_mode
        self.pe_mode = pe_mode
        self.use_fp16 = use_fp16
        self.use_checkpoint = use_checkpoint
        self.qk_rms_norm = qk_rms_norm
        self.dtype = torch.float16 if use_fp16 else torch.float32

        if pe_mode == "ape":
            self.pos_embedder = AbsolutePositionEmbedder(model_channels)

        self.input_layer = sp.SparseLinear(in_channels, model_channels)
        # block_attn_config yields one (attn_mode, window_size, shift_sequence,
        # shift_window, serialize_mode) tuple per block; the loop variables in
        # the comprehension below come from that generator and shadow the
        # constructor parameters of the same names on purpose.
        self.blocks = nn.ModuleList([
            SparseTransformerBlock(
                model_channels,
                num_heads=self.num_heads,
                mlp_ratio=self.mlp_ratio,
                attn_mode=attn_mode,
                window_size=window_size,
                shift_sequence=shift_sequence,
                shift_window=shift_window,
                serialize_mode=serialize_mode,
                use_checkpoint=self.use_checkpoint,
                use_rope=(pe_mode == "rope"),
                qk_rms_norm=self.qk_rms_norm,
            )
            for attn_mode, window_size, shift_sequence, shift_window, serialize_mode in block_attn_config(self)
        ])

    @property
    def device(self) -> torch.device:
        """
        Return the device of the model.
        """
        return next(self.parameters()).device

    def convert_to_fp16(self) -> None:
        """
        Convert the torso of the model to float16.
        """
        self.blocks.apply(convert_module_to_f16)

    def convert_to_fp32(self) -> None:
        """
        Convert the torso of the model to float32.
        """
        self.blocks.apply(convert_module_to_f32)

    def initialize_weights(self) -> None:
        """Xavier-initialize every Linear layer; zero its bias if present."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

    def forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
        """Embed the input, add positional information, and run all blocks."""
        h = self.input_layer(x)
        if self.pe_mode == "ape":
            # coords[:, 0] is the batch index; embed only the xyz coordinates.
            h = h + self.pos_embedder(x.coords[:, 1:])
        h = h.type(self.dtype)
        for block in self.blocks:
            h = block(h)
        return h
threeDFixer/models/structured_latent_vae/decoder_gs.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from typing import *
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from ...modules import sparse as sp
11
+ from ...utils.random_utils import hammersley_sequence
12
+ from .base import SparseTransformerBase
13
+ from ...representations import Gaussian
14
+ from ..sparse_elastic_mixin import SparseTransformerElasticMixin
15
+ from .. import from_pretrained
16
+
17
+
18
class SLatGaussianDecoder(SparseTransformerBase):
    """
    Sparse transformer decoder mapping structured latents to per-voxel
    3D Gaussian splat parameters.
    """
    def __init__(
        self,
        resolution: int,
        model_channels: int,
        latent_channels: int,
        num_blocks: int,
        num_heads: Optional[int] = None,
        num_head_channels: Optional[int] = 64,
        mlp_ratio: float = 4,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "swin",
        window_size: int = 8,
        pe_mode: Literal["ape", "rope"] = "ape",
        use_fp16: bool = False,
        use_checkpoint: bool = False,
        qk_rms_norm: bool = False,
        representation_config: dict = None,
        pretrained_gs_dec: str = None,
    ):
        super().__init__(
            in_channels=latent_channels,
            model_channels=model_channels,
            num_blocks=num_blocks,
            num_heads=num_heads,
            num_head_channels=num_head_channels,
            mlp_ratio=mlp_ratio,
            attn_mode=attn_mode,
            window_size=window_size,
            pe_mode=pe_mode,
            use_fp16=use_fp16,
            use_checkpoint=use_checkpoint,
            qk_rms_norm=qk_rms_norm,
        )
        self.resolution = resolution
        self.rep_config = representation_config
        self._calc_layout()
        self.out_layer = sp.SparseLinear(model_channels, self.out_channels)
        self._build_perturbation()

        self.initialize_weights()
        if pretrained_gs_dec is not None:
            if pretrained_gs_dec.endswith('.pt'):
                # A '.pt' path is loaded as a raw state-dict checkpoint.
                print (f'loading pretrained weight: {pretrained_gs_dec}')
                model_ckpt = torch.load(pretrained_gs_dec, map_location='cpu', weights_only=True)
                self.load_state_dict(model_ckpt)
                del model_ckpt
            else:
                # Otherwise the string is treated as a pretrained-model id.
                print (f'loading pretrained weight: {pretrained_gs_dec}')
                pre_trained_models: SLatGaussianDecoder
                pre_trained_models = from_pretrained(pretrained_gs_dec)
                self.load_state_dict(pre_trained_models.state_dict())
                del pre_trained_models
        if use_fp16:
            self.convert_to_fp16()

    def initialize_weights(self) -> None:
        super().initialize_weights()
        # Zero-out output layers:
        nn.init.constant_(self.out_layer.weight, 0)
        nn.init.constant_(self.out_layer.bias, 0)

    def _build_perturbation(self) -> None:
        # Low-discrepancy (Hammersley) points in [-1, 1]^3, one per gaussian,
        # stored in pre-tanh space so the forward path can add them before
        # the tanh in to_representation.
        perturbation = [hammersley_sequence(3, i, self.rep_config['num_gaussians']) for i in range(self.rep_config['num_gaussians'])]
        perturbation = torch.tensor(perturbation).float() * 2 - 1
        perturbation = perturbation / self.rep_config['voxel_size']
        perturbation = torch.atanh(perturbation).to(self.device)
        self.register_buffer('offset_perturbation', perturbation)

    def _calc_layout(self) -> None:
        # Channel layout of the flat per-voxel output vector; each entry
        # records the tensor shape and its [start, end) slice in the channels.
        self.layout = {
            '_xyz' : {'shape': (self.rep_config['num_gaussians'], 3), 'size': self.rep_config['num_gaussians'] * 3},
            '_features_dc' : {'shape': (self.rep_config['num_gaussians'], 1, 3), 'size': self.rep_config['num_gaussians'] * 3},
            '_scaling' : {'shape': (self.rep_config['num_gaussians'], 3), 'size': self.rep_config['num_gaussians'] * 3},
            '_rotation' : {'shape': (self.rep_config['num_gaussians'], 4), 'size': self.rep_config['num_gaussians'] * 4},
            '_opacity' : {'shape': (self.rep_config['num_gaussians'], 1), 'size': self.rep_config['num_gaussians']},
        }
        start = 0
        for k, v in self.layout.items():
            v['range'] = (start, start + v['size'])
            start += v['size']
        self.out_channels = start

    def to_representation(self, x: sp.SparseTensor) -> List[Gaussian]:
        """
        Convert a batch of network outputs to 3D representations.

        Args:
            x: The [N x * x C] sparse tensor output by the network.

        Returns:
            list of representations
        """
        ret = []
        for i in range(x.shape[0]):
            representation = Gaussian(
                sh_degree=0,
                aabb=[-0.5, -0.5, -0.5, 1.0, 1.0, 1.0],
                # NOTE: 'mininum_kernel_size' spelling follows the Gaussian API.
                mininum_kernel_size = self.rep_config['3d_filter_kernel_size'],
                scaling_bias = self.rep_config['scaling_bias'],
                opacity_bias = self.rep_config['opacity_bias'],
                scaling_activation = self.rep_config['scaling_activation']
            )
            # Voxel centers, normalized to [0, 1].
            xyz = (x.coords[x.layout[i]][:, 1:].float() + 0.5) / self.resolution
            for k, v in self.layout.items():
                if k == '_xyz':
                    offset = x.feats[x.layout[i]][:, v['range'][0]:v['range'][1]].reshape(-1, *v['shape'])
                    offset = offset * self.rep_config['lr'][k]
                    if self.rep_config['perturb_offset']:
                        # Spread gaussians inside each voxel with the fixed
                        # offsets built in _build_perturbation.
                        offset = offset + self.offset_perturbation
                    # tanh bounds each gaussian to half a voxel of its center.
                    offset = torch.tanh(offset) / self.resolution * 0.5 * self.rep_config['voxel_size']
                    _xyz = xyz.unsqueeze(1) + offset
                    setattr(representation, k, _xyz.flatten(0, 1))
                else:
                    feats = x.feats[x.layout[i]][:, v['range'][0]:v['range'][1]].reshape(-1, *v['shape']).flatten(0, 1)
                    feats = feats * self.rep_config['lr'][k]
                    setattr(representation, k, feats)
            ret.append(representation)
        return ret

    def forward(self, x: sp.SparseTensor) -> List[Gaussian]:
        """Decode a sparse latent into a list of Gaussian representations."""
        h = super().forward(x)
        h = h.type(x.dtype)
        h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))
        h = self.out_layer(h)
        return self.to_representation(h)
143
+
144
+
145
class ElasticSLatGaussianDecoder(SparseTransformerElasticMixin, SLatGaussianDecoder):
    """
    Slat VAE Gaussian decoder with elastic memory management.
    Used for training with low VRAM.
    """
    pass
threeDFixer/models/structured_latent_vae/decoder_mesh.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from typing import *
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ import numpy as np
11
+ from ...modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
12
+ from ...modules import sparse as sp
13
+ from .base import SparseTransformerBase
14
+ from ...representations import MeshExtractResult
15
+ from ...representations.mesh import SparseFeatures2Mesh
16
+ from ..sparse_elastic_mixin import SparseTransformerElasticMixin
17
+ from .. import from_pretrained
18
+
19
+
20
class SparseSubdivideBlock3d(nn.Module):
    """
    A 3D subdivide block that can subdivide the sparse tensor.

    Args:
        channels: channels in the inputs and outputs.
        resolution: input grid resolution; the output is at twice this resolution.
        out_channels: if specified, the number of output channels.
        num_groups: the number of groups for the group norm.
    """
    def __init__(
        self,
        channels: int,
        resolution: int,
        out_channels: Optional[int] = None,
        num_groups: int = 32
    ):
        super().__init__()
        self.channels = channels
        self.resolution = resolution
        self.out_resolution = resolution * 2
        self.out_channels = out_channels or channels

        self.act_layers = nn.Sequential(
            sp.SparseGroupNorm32(num_groups, channels),
            sp.SparseSiLU()
        )

        self.sub = sp.SparseSubdivide()

        self.out_layers = nn.Sequential(
            sp.SparseConv3d(channels, self.out_channels, 3, indice_key=f"res_{self.out_resolution}"),
            sp.SparseGroupNorm32(num_groups, self.out_channels),
            sp.SparseSiLU(),
            # Zero-init the final conv so the block starts as a near-identity.
            zero_module(sp.SparseConv3d(self.out_channels, self.out_channels, 3, indice_key=f"res_{self.out_resolution}")),
        )

        if self.out_channels == channels:
            self.skip_connection = nn.Identity()
        else:
            # 1x1 conv to match channel counts on the residual path.
            self.skip_connection = sp.SparseConv3d(channels, self.out_channels, 1, indice_key=f"res_{self.out_resolution}")

    def forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
        """
        Apply the block to a sparse tensor.

        Args:
            x: a sparse tensor of features at ``self.resolution``.
        Returns:
            a sparse tensor of outputs at ``self.out_resolution``.
        """
        h = self.act_layers(x)
        h = self.sub(h)
        # Subdivide the input too so the residual is at the same resolution.
        x = self.sub(x)
        h = self.out_layers(h)
        h = h + self.skip_connection(x)
        return h
76
+
77
+
78
class SLatMeshDecoder(SparseTransformerBase):
    """
    Sparse transformer decoder mapping structured latents to mesh-extraction
    features at 4x the latent resolution (two subdivision stages).
    """
    def __init__(
        self,
        resolution: int,
        model_channels: int,
        latent_channels: int,
        num_blocks: int,
        num_heads: Optional[int] = None,
        num_head_channels: Optional[int] = 64,
        mlp_ratio: float = 4,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "swin",
        window_size: int = 8,
        pe_mode: Literal["ape", "rope"] = "ape",
        use_fp16: bool = False,
        use_checkpoint: bool = False,
        qk_rms_norm: bool = False,
        representation_config: dict = None,
        pretrained_mesh_dec: str = None,
    ):
        super().__init__(
            in_channels=latent_channels,
            model_channels=model_channels,
            num_blocks=num_blocks,
            num_heads=num_heads,
            num_head_channels=num_head_channels,
            mlp_ratio=mlp_ratio,
            attn_mode=attn_mode,
            window_size=window_size,
            pe_mode=pe_mode,
            use_fp16=use_fp16,
            use_checkpoint=use_checkpoint,
            qk_rms_norm=qk_rms_norm,
        )
        self.resolution = resolution
        self.rep_config = representation_config
        self.mesh_extractor = SparseFeatures2Mesh(res=self.resolution*4, use_color=self.rep_config.get('use_color', False))
        self.out_channels = self.mesh_extractor.feats_channels
        # Two subdivision stages: resolution -> 2x -> 4x, shrinking channels.
        self.upsample = nn.ModuleList([
            SparseSubdivideBlock3d(
                channels=model_channels,
                resolution=resolution,
                out_channels=model_channels // 4
            ),
            SparseSubdivideBlock3d(
                channels=model_channels // 4,
                resolution=resolution * 2,
                out_channels=model_channels // 8
            )
        ])
        self.out_layer = sp.SparseLinear(model_channels // 8, self.out_channels)

        self.initialize_weights()
        if pretrained_mesh_dec is not None:
            print (f'loading pretrained weight: {pretrained_mesh_dec}')
            pre_trained_models: SLatMeshDecoder
            pre_trained_models = from_pretrained(pretrained_mesh_dec)
            self.load_state_dict(pre_trained_models.state_dict())
            del pre_trained_models
        if use_fp16:
            self.convert_to_fp16()

    def initialize_weights(self) -> None:
        super().initialize_weights()
        # Zero-out output layers:
        nn.init.constant_(self.out_layer.weight, 0)
        nn.init.constant_(self.out_layer.bias, 0)

    def convert_to_fp16(self) -> None:
        """
        Convert the torso of the model to float16.
        """
        super().convert_to_fp16()
        # The upsample blocks are part of the torso and must match dtype.
        self.upsample.apply(convert_module_to_f16)

    def convert_to_fp32(self) -> None:
        """
        Convert the torso of the model to float32.
        """
        super().convert_to_fp32()
        self.upsample.apply(convert_module_to_f32)

    def to_representation(self, x: sp.SparseTensor) -> List[MeshExtractResult]:
        """
        Convert a batch of network outputs to 3D representations.

        Args:
            x: The [N x * x C] sparse tensor output by the network.

        Returns:
            list of representations
        """
        ret = []
        for i in range(x.shape[0]):
            mesh = self.mesh_extractor(x[i], training=self.training)
            ret.append(mesh)
        return ret

    def forward(self, x: sp.SparseTensor) -> List[MeshExtractResult]:
        """Decode a sparse latent into a list of extracted meshes."""
        h = super().forward(x)
        for block in self.upsample:
            h = block(h)
        h = h.type(x.dtype)
        h = self.out_layer(h)
        return self.to_representation(h)
182
+
183
+
184
class ElasticSLatMeshDecoder(SparseTransformerElasticMixin, SLatMeshDecoder):
    """
    Slat VAE Mesh decoder with elastic memory management.
    Used for training with low VRAM.
    """
    pass
threeDFixer/models/structured_latent_vae/decoder_rf.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from typing import *
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ import numpy as np
11
+ from ...modules import sparse as sp
12
+ from .base import SparseTransformerBase
13
+ from ...representations import Strivec
14
+ from ..sparse_elastic_mixin import SparseTransformerElasticMixin
15
+
16
+
17
class SLatRadianceFieldDecoder(SparseTransformerBase):
    """
    Sparse transformer decoder mapping structured latents to Strivec
    radiance-field representations.
    """
    def __init__(
        self,
        resolution: int,
        model_channels: int,
        latent_channels: int,
        num_blocks: int,
        num_heads: Optional[int] = None,
        num_head_channels: Optional[int] = 64,
        mlp_ratio: float = 4,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "swin",
        window_size: int = 8,
        pe_mode: Literal["ape", "rope"] = "ape",
        use_fp16: bool = False,
        use_checkpoint: bool = False,
        qk_rms_norm: bool = False,
        representation_config: dict = None,
    ):
        super().__init__(
            in_channels=latent_channels,
            model_channels=model_channels,
            num_blocks=num_blocks,
            num_heads=num_heads,
            num_head_channels=num_head_channels,
            mlp_ratio=mlp_ratio,
            attn_mode=attn_mode,
            window_size=window_size,
            pe_mode=pe_mode,
            use_fp16=use_fp16,
            use_checkpoint=use_checkpoint,
            qk_rms_norm=qk_rms_norm,
        )
        self.resolution = resolution
        self.rep_config = representation_config
        self._calc_layout()
        self.out_layer = sp.SparseLinear(model_channels, self.out_channels)

        self.initialize_weights()
        if use_fp16:
            self.convert_to_fp16()

    def initialize_weights(self) -> None:
        super().initialize_weights()
        # Zero-out output layers:
        nn.init.constant_(self.out_layer.weight, 0)
        nn.init.constant_(self.out_layer.bias, 0)

    def _calc_layout(self) -> None:
        # Channel layout of the flat per-voxel output vector; each entry
        # records the tensor shape and its [start, end) slice in the channels.
        self.layout = {
            'trivec': {'shape': (self.rep_config['rank'], 3, self.rep_config['dim']), 'size': self.rep_config['rank'] * 3 * self.rep_config['dim']},
            'density': {'shape': (self.rep_config['rank'],), 'size': self.rep_config['rank']},
            'features_dc': {'shape': (self.rep_config['rank'], 1, 3), 'size': self.rep_config['rank'] * 3},
        }
        start = 0
        for k, v in self.layout.items():
            v['range'] = (start, start + v['size'])
            start += v['size']
        self.out_channels = start

    def to_representation(self, x: sp.SparseTensor) -> List[Strivec]:
        """
        Convert a batch of network outputs to 3D representations.

        Args:
            x: The [N x * x C] sparse tensor output by the network.

        Returns:
            list of representations
        """
        ret = []
        for i in range(x.shape[0]):
            # NOTE(review): device is hard-coded to 'cuda' here and below;
            # consider self.device instead — confirm with callers.
            representation = Strivec(
                sh_degree=0,
                resolution=self.resolution,
                aabb=[-0.5, -0.5, -0.5, 1, 1, 1],
                rank=self.rep_config['rank'],
                dim=self.rep_config['dim'],
                device='cuda',
            )
            representation.density_shift = 0.0
            # Voxel centers, normalized to [0, 1].
            representation.position = (x.coords[x.layout[i]][:, 1:].float() + 0.5) / self.resolution
            representation.depth = torch.full((representation.position.shape[0], 1), int(np.log2(self.resolution)), dtype=torch.uint8, device='cuda')
            for k, v in self.layout.items():
                setattr(representation, k, x.feats[x.layout[i]][:, v['range'][0]:v['range'][1]].reshape(-1, *v['shape']))
            representation.trivec = representation.trivec + 1
            ret.append(representation)
        return ret

    def forward(self, x: sp.SparseTensor) -> List[Strivec]:
        """Decode a sparse latent into a list of Strivec representations."""
        h = super().forward(x)
        h = h.type(x.dtype)
        h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))
        h = self.out_layer(h)
        return self.to_representation(h)
111
+
112
+
113
class ElasticSLatRadianceFieldDecoder(SparseTransformerElasticMixin, SLatRadianceFieldDecoder):
    """
    Slat VAE Radiance Field Decoder with elastic memory management.
    Used for training with low VRAM.
    """
    pass
threeDFixer/models/structured_latent_vae/encoder.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from typing import *
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from ...modules import sparse as sp
11
+ from .base import SparseTransformerBase
12
+ from ..sparse_elastic_mixin import SparseTransformerElasticMixin
13
+ from .. import from_pretrained
14
+
15
+
16
class SLatEncoder(SparseTransformerBase):
    """
    Sparse transformer VAE encoder producing a diagonal-Gaussian posterior
    over structured latents.
    """
    def __init__(
        self,
        resolution: int,
        in_channels: int,
        model_channels: int,
        latent_channels: int,
        num_blocks: int,
        num_heads: Optional[int] = None,
        num_head_channels: Optional[int] = 64,
        mlp_ratio: float = 4,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "swin",
        window_size: int = 8,
        pe_mode: Literal["ape", "rope"] = "ape",
        use_fp16: bool = False,
        use_checkpoint: bool = False,
        qk_rms_norm: bool = False,
        pretrained_slat_enc: str = None,
    ):
        super().__init__(
            in_channels=in_channels,
            model_channels=model_channels,
            num_blocks=num_blocks,
            num_heads=num_heads,
            num_head_channels=num_head_channels,
            mlp_ratio=mlp_ratio,
            attn_mode=attn_mode,
            window_size=window_size,
            pe_mode=pe_mode,
            use_fp16=use_fp16,
            use_checkpoint=use_checkpoint,
            qk_rms_norm=qk_rms_norm,
        )
        self.resolution = resolution
        # Twice the latent width: first half is the mean, second half log-variance.
        self.out_layer = sp.SparseLinear(model_channels, 2 * latent_channels)

        self.initialize_weights()
        if pretrained_slat_enc is not None:
            print (f'loading pretrained weight: {pretrained_slat_enc}')
            pre_trained_models: SLatEncoder
            pre_trained_models = from_pretrained(pretrained_slat_enc)
            self.load_state_dict(pre_trained_models.state_dict())
            del pre_trained_models
        if use_fp16:
            self.convert_to_fp16()

    def initialize_weights(self) -> None:
        super().initialize_weights()
        # Zero-out output layers:
        nn.init.constant_(self.out_layer.weight, 0)
        nn.init.constant_(self.out_layer.bias, 0)

    def forward(self, x: sp.SparseTensor, sample_posterior: bool = True, return_raw: bool = False):
        """
        Encode a sparse input into a latent sample.

        Args:
            x: Sparse input tensor.
            sample_posterior: If True, draw a reparameterized sample; otherwise
                return the posterior mean.
            return_raw: If True, also return (mean, logvar) of the posterior.

        Returns:
            The latent sparse tensor, or ``(z, mean, logvar)`` when
            ``return_raw`` is set.
        """
        h = super().forward(x)
        h = h.type(x.dtype)
        h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))
        h = self.out_layer(h)

        # Sample from the posterior distribution
        mean, logvar = h.feats.chunk(2, dim=-1)
        if sample_posterior:
            # Reparameterization trick: z = mean + std * eps.
            std = torch.exp(0.5 * logvar)
            z = mean + std * torch.randn_like(std)
        else:
            z = mean
        z = h.replace(z)

        if return_raw:
            return z, mean, logvar
        else:
            return z
87
+
88
+
89
class ElasticSLatEncoder(SparseTransformerElasticMixin, SLatEncoder):
    """
    SLat VAE encoder with elastic memory management.
    Used for training with low VRAM.
    """
threeDFixer/modules/attention/__init__.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from typing import *
7
+
8
BACKEND = 'flash_attn'
DEBUG = False


def __from_env():
    """Override BACKEND/DEBUG from the ATTN_BACKEND / ATTN_DEBUG env vars."""
    import os

    global BACKEND, DEBUG

    backend_override = os.environ.get('ATTN_BACKEND')
    debug_override = os.environ.get('ATTN_DEBUG')

    # Only accept a known backend name; anything else keeps the default.
    if backend_override in ('xformers', 'flash_attn', 'sdpa', 'naive'):
        BACKEND = backend_override
    # Any non-empty setting other than '1' disables debug mode.
    if debug_override is not None:
        DEBUG = debug_override == '1'

    print(f"[ATTENTION] Using backend: {BACKEND}")


__from_env()
29
+
30
+
31
def set_backend(backend: Literal['xformers', 'flash_attn', 'sdpa', 'naive']):
    """
    Select the attention backend at runtime.

    Args:
        backend: One of 'xformers', 'flash_attn', 'sdpa', or 'naive' — the
            same set of backends recognized by the ATTN_BACKEND env var.
            (The previous hint listed only two of the four valid values.)
    """
    global BACKEND
    BACKEND = backend
34
+
35
def set_debug(debug: bool):
    """Enable or disable attention debug mode (module-global DEBUG flag)."""
    global DEBUG
    DEBUG = debug
38
+
39
+
40
+ from .full_attn import *
41
+ from .modules import *
threeDFixer/modules/attention/full_attn.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from typing import *
7
+ import torch
8
+ import math
9
+ from . import DEBUG, BACKEND
10
+
11
+ if BACKEND == 'xformers':
12
+ import xformers.ops as xops
13
+ elif BACKEND == 'flash_attn':
14
+ import flash_attn
15
+ elif BACKEND == 'sdpa':
16
+ from torch.nn.functional import scaled_dot_product_attention as sdpa
17
+ elif BACKEND == 'naive':
18
+ pass
19
+ else:
20
+ raise ValueError(f"Unknown attention backend: {BACKEND}")
21
+
22
+
23
+ __all__ = [
24
+ 'scaled_dot_product_attention',
25
+ ]
26
+
27
+
28
+ def _naive_sdpa(q, k, v):
29
+ """
30
+ Naive implementation of scaled dot product attention.
31
+ """
32
+ q = q.permute(0, 2, 1, 3) # [N, H, L, C]
33
+ k = k.permute(0, 2, 1, 3) # [N, H, L, C]
34
+ v = v.permute(0, 2, 1, 3) # [N, H, L, C]
35
+ scale_factor = 1 / math.sqrt(q.size(-1))
36
+ attn_weight = q @ k.transpose(-2, -1) * scale_factor
37
+ attn_weight = torch.softmax(attn_weight, dim=-1)
38
+ out = attn_weight @ v
39
+ out = out.permute(0, 2, 1, 3) # [N, L, H, C]
40
+ return out
41
+
42
+
43
# typing.overload stubs documenting the three accepted calling conventions;
# the runtime dispatch lives in the implementation that follows.
@overload
def scaled_dot_product_attention(qkv: torch.Tensor) -> torch.Tensor:
    """
    Apply scaled dot product attention.

    Args:
        qkv (torch.Tensor): A [N, L, 3, H, C] tensor containing Qs, Ks, and Vs.
    """
    ...

@overload
def scaled_dot_product_attention(q: torch.Tensor, kv: torch.Tensor) -> torch.Tensor:
    """
    Apply scaled dot product attention.

    Args:
        q (torch.Tensor): A [N, L, H, C] tensor containing Qs.
        kv (torch.Tensor): A [N, L, 2, H, C] tensor containing Ks and Vs.
    """
    ...

@overload
def scaled_dot_product_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    """
    Apply scaled dot product attention.

    Args:
        q (torch.Tensor): A [N, L, H, Ci] tensor containing Qs.
        k (torch.Tensor): A [N, L, H, Ci] tensor containing Ks.
        v (torch.Tensor): A [N, L, H, Co] tensor containing Vs.

    Note:
        k and v are assumed to have the same coordinate map.
    """
    ...
78
+
79
def scaled_dot_product_attention(*args, **kwargs):
    """
    Dispatch scaled dot product attention to the module-level BACKEND.

    Accepts one of three calling conventions (see the overloads above):
    packed ``qkv``, ``q`` plus packed ``kv``, or separate ``q``/``k``/``v``.
    """
    # Map argument count to the expected keyword names for validation.
    arg_names_dict = {
        1: ['qkv'],
        2: ['q', 'kv'],
        3: ['q', 'k', 'v']
    }
    num_all_args = len(args) + len(kwargs)
    assert num_all_args in arg_names_dict, f"Invalid number of arguments, got {num_all_args}, expected 1, 2, or 3"
    for key in arg_names_dict[num_all_args][len(args):]:
        assert key in kwargs, f"Missing argument {key}"

    if num_all_args == 1:
        qkv = args[0] if len(args) > 0 else kwargs['qkv']
        assert len(qkv.shape) == 5 and qkv.shape[2] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, L, 3, H, C]"
        # NOTE(review): `device` is assigned in each branch but never used —
        # confirm before removing.
        device = qkv.device

    elif num_all_args == 2:
        q = args[0] if len(args) > 0 else kwargs['q']
        kv = args[1] if len(args) > 1 else kwargs['kv']
        assert q.shape[0] == kv.shape[0], f"Batch size mismatch, got {q.shape[0]} and {kv.shape[0]}"
        assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, C]"
        assert len(kv.shape) == 5, f"Invalid shape for kv, got {kv.shape}, expected [N, L, 2, H, C]"
        device = q.device

    elif num_all_args == 3:
        q = args[0] if len(args) > 0 else kwargs['q']
        k = args[1] if len(args) > 1 else kwargs['k']
        v = args[2] if len(args) > 2 else kwargs['v']
        assert q.shape[0] == k.shape[0] == v.shape[0], f"Batch size mismatch, got {q.shape[0]}, {k.shape[0]}, and {v.shape[0]}"
        assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, Ci]"
        assert len(k.shape) == 4, f"Invalid shape for k, got {k.shape}, expected [N, L, H, Ci]"
        assert len(v.shape) == 4, f"Invalid shape for v, got {v.shape}, expected [N, L, H, Co]"
        device = q.device

    if BACKEND == 'xformers':
        # xformers expects separate q/k/v; unpack the packed forms first.
        if num_all_args == 1:
            q, k, v = qkv.unbind(dim=2)
        elif num_all_args == 2:
            k, v = kv.unbind(dim=2)
        out = xops.memory_efficient_attention(q, k, v)
    elif BACKEND == 'flash_attn':
        # flash_attn has dedicated entry points for each packing convention.
        if num_all_args == 1:
            out = flash_attn.flash_attn_qkvpacked_func(qkv)
        elif num_all_args == 2:
            out = flash_attn.flash_attn_kvpacked_func(q, kv)
        elif num_all_args == 3:
            out = flash_attn.flash_attn_func(q, k, v)
    elif BACKEND == 'sdpa':
        if num_all_args == 1:
            q, k, v = qkv.unbind(dim=2)
        elif num_all_args == 2:
            k, v = kv.unbind(dim=2)
        # torch sdpa uses [N, H, L, C]; permute in and back out.
        q = q.permute(0, 2, 1, 3)   # [N, H, L, C]
        k = k.permute(0, 2, 1, 3)   # [N, H, L, C]
        v = v.permute(0, 2, 1, 3)   # [N, H, L, C]
        out = sdpa(q, k, v)         # [N, H, L, C]
        out = out.permute(0, 2, 1, 3)   # [N, L, H, C]
    elif BACKEND == 'naive':
        if num_all_args == 1:
            q, k, v = qkv.unbind(dim=2)
        elif num_all_args == 2:
            k, v = kv.unbind(dim=2)
        out = _naive_sdpa(q, k, v)
    else:
        raise ValueError(f"Unknown attention module: {BACKEND}")

    return out
threeDFixer/modules/attention/modules.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from typing import *
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from .full_attn import scaled_dot_product_attention
11
+
12
+
13
class MultiHeadRMSNorm(nn.Module):
    """Per-head RMS normalization with a learnable per-head gain.

    The last (channel) dimension of each head is L2-normalized, then
    rescaled by sqrt(dim) and a learned gamma of shape [heads, dim].
    Computation happens in float32; the result is cast back to the
    input dtype.
    """

    def __init__(self, dim: int, heads: int):
        super().__init__()
        self.scale = dim ** 0.5
        self.gamma = nn.Parameter(torch.ones(heads, dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize in float32 for numerical stability under mixed precision.
        normed = F.normalize(x.float(), dim=-1)
        scaled = normed * self.gamma * self.scale
        return scaled.to(x.dtype)
21
+
22
+
23
class RotaryPositionEmbedder(nn.Module):
    """Rotary position embedding (RoPE) over (multi-axis) integer positions.

    Each pair of channels is treated as a complex number and rotated by a
    position-dependent phase. `hidden_size` channels are split across
    `in_channels` coordinate axes, `freq_dim` frequencies per axis; any
    remaining channel pairs are padded with the identity rotation (phase 0).
    """

    def __init__(self, hidden_size: int, in_channels: int = 3):
        super().__init__()
        assert hidden_size % 2 == 0, "Hidden size must be divisible by 2"
        self.hidden_size = hidden_size
        self.in_channels = in_channels
        self.freq_dim = hidden_size // in_channels // 2
        exponents = torch.arange(self.freq_dim, dtype=torch.float32) / self.freq_dim
        self.freqs = 1.0 / (10000 ** exponents)

    def _get_phases(self, indices: torch.Tensor) -> torch.Tensor:
        # Keep the frequency table on the same device as the indices.
        self.freqs = self.freqs.to(indices.device)
        angles = torch.outer(indices, self.freqs)
        # Unit-magnitude complex numbers e^{i*angle}.
        return torch.polar(torch.ones_like(angles), angles)

    def _rotary_embedding(self, x: torch.Tensor, phases: torch.Tensor) -> torch.Tensor:
        # View consecutive channel pairs as complex numbers and rotate.
        pairs = x.float().reshape(*x.shape[:-1], -1, 2)
        rotated = torch.view_as_complex(pairs) * phases
        flat = torch.view_as_real(rotated).reshape(*rotated.shape[:-1], -1)
        return flat.to(x.dtype)

    def forward(self, q: torch.Tensor, k: torch.Tensor, indices: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            q (sp.SparseTensor): [..., N, D] tensor of queries
            k (sp.SparseTensor): [..., N, D] tensor of keys
            indices (torch.Tensor): [..., N, C] tensor of spatial positions
        """
        if indices is None:
            # Fall back to sequential positions along the token axis.
            indices = torch.arange(q.shape[-2], device=q.device)
            if len(q.shape) > 2:
                indices = indices.unsqueeze(0).expand(q.shape[:-2] + (-1,))

        phases = self._get_phases(indices.reshape(-1)).reshape(*indices.shape[:-1], -1)
        pad = self.hidden_size // 2 - phases.shape[1]
        if pad > 0:
            # Pad the remaining channel pairs with the identity rotation.
            filler = torch.polar(
                torch.ones(*phases.shape[:-1], pad, device=phases.device),
                torch.zeros(*phases.shape[:-1], pad, device=phases.device),
            )
            phases = torch.cat([phases, filler], dim=-1)
        return self._rotary_embedding(q, phases), self._rotary_embedding(k, phases)
66
+
67
+
68
class MultiHeadAttention(nn.Module):
    """Dense multi-head attention with optional RoPE and QK RMS-normalization.

    Supports self-attention (fused QKV projection) and cross-attention
    (separate Q and fused KV projections). Only ``attn_mode="full"`` is
    implemented; ``"windowed"`` raises NotImplementedError.
    """

    def __init__(
        self,
        channels: int,
        num_heads: int,
        ctx_channels: Optional[int]=None,
        type: Literal["self", "cross"] = "self",
        attn_mode: Literal["full", "windowed"] = "full",
        window_size: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        qkv_bias: bool = True,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
    ):
        super().__init__()
        assert channels % num_heads == 0
        assert type in ["self", "cross"], f"Invalid attention type: {type}"
        assert attn_mode in ["full", "windowed"], f"Invalid attention mode: {attn_mode}"
        assert type == "self" or attn_mode == "full", "Cross-attention only supports full attention"

        if attn_mode == "windowed":
            raise NotImplementedError("Windowed attention is not yet implemented")

        self.channels = channels
        self.head_dim = channels // num_heads
        # Cross-attention context may have a different channel width than x.
        self.ctx_channels = ctx_channels if ctx_channels is not None else channels
        self.num_heads = num_heads
        self._type = type
        self.attn_mode = attn_mode
        # window_size / shift_window are stored but unused while "windowed"
        # mode is unimplemented.
        self.window_size = window_size
        self.shift_window = shift_window
        self.use_rope = use_rope
        self.qk_rms_norm = qk_rms_norm

        if self._type == "self":
            self.to_qkv = nn.Linear(channels, channels * 3, bias=qkv_bias)
        else:
            self.to_q = nn.Linear(channels, channels, bias=qkv_bias)
            self.to_kv = nn.Linear(self.ctx_channels, channels * 2, bias=qkv_bias)

        if self.qk_rms_norm:
            self.q_rms_norm = MultiHeadRMSNorm(self.head_dim, num_heads)
            self.k_rms_norm = MultiHeadRMSNorm(self.head_dim, num_heads)

        self.to_out = nn.Linear(channels, channels)

        if use_rope:
            self.rope = RotaryPositionEmbedder(channels)

    def forward(self, x: torch.Tensor, context: Optional[torch.Tensor] = None, indices: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run attention over x ([B, L, C]).

        Args:
            x: [B, L, C] input tokens (queries; also keys/values for self-attn).
            context: [B, Lkv, ctx_channels] context tokens; required for
                cross-attention, ignored for self-attention.
            indices: optional positions forwarded to RoPE (self-attn only).

        Returns:
            [B, L, C] attended and output-projected tokens.
        """
        B, L, C = x.shape
        if self._type == "self":
            qkv = self.to_qkv(x)
            qkv = qkv.reshape(B, L, 3, self.num_heads, -1)
            if self.use_rope:
                # Rotate q and k only; v passes through unchanged.
                q, k, v = qkv.unbind(dim=2)
                q, k = self.rope(q, k, indices)
                qkv = torch.stack([q, k, v], dim=2)
            if self.attn_mode == "full":
                if self.qk_rms_norm:
                    q, k, v = qkv.unbind(dim=2)
                    q = self.q_rms_norm(q)
                    k = self.k_rms_norm(k)
                    h = scaled_dot_product_attention(q, k, v)
                else:
                    # Fused-packed path: backend unpacks qkv itself.
                    h = scaled_dot_product_attention(qkv)
            elif self.attn_mode == "windowed":
                raise NotImplementedError("Windowed attention is not yet implemented")
        else:
            Lkv = context.shape[1]
            q = self.to_q(x)
            kv = self.to_kv(context)
            q = q.reshape(B, L, self.num_heads, -1)
            kv = kv.reshape(B, Lkv, 2, self.num_heads, -1)
            if self.qk_rms_norm:
                q = self.q_rms_norm(q)
                k, v = kv.unbind(dim=2)
                k = self.k_rms_norm(k)
                h = scaled_dot_product_attention(q, k, v)
            else:
                h = scaled_dot_product_attention(q, kv)
        # Merge heads back into the channel dimension and project out.
        h = h.reshape(B, L, -1)
        h = self.to_out(h)
        return h
threeDFixer/modules/norm.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+
10
class LayerNorm32(nn.LayerNorm):
    """LayerNorm evaluated in float32 regardless of the input dtype.

    The input is upcast to float32 for the normalization (numerical
    stability under mixed precision) and the output is cast back to the
    original dtype.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        original_dtype = x.dtype
        normed = super().forward(x.float())
        return normed.type(original_dtype)
13
+
14
+
15
class GroupNorm32(nn.GroupNorm):
    """
    A GroupNorm layer that converts to float32 before the forward pass.

    The normalization runs in float32 and the result is cast back to the
    input's original dtype.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        original_dtype = x.dtype
        normed = super().forward(x.float())
        return normed.type(original_dtype)
21
+
22
+
23
class ChannelLayerNorm32(LayerNorm32):
    """Float32 LayerNorm over the channel axis of channel-first tensors.

    Accepts [N, C, *spatial] input: the channel axis is moved to the end,
    LayerNorm32 is applied over it, and the axis is moved back.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        ndim = x.dim()
        # [N, C, *spatial] -> [N, *spatial, C]
        channels_last = x.permute(0, *range(2, ndim), 1).contiguous()
        normed = super().forward(channels_last)
        # [N, *spatial, C] -> [N, C, *spatial]
        return normed.permute(0, ndim - 1, *range(1, ndim - 1)).contiguous()
30
+
threeDFixer/modules/sparse/__init__.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import *

# Module configuration; defaults below may be overridden from the
# environment by __from_env() at import time.
BACKEND = 'spconv'
DEBUG = False
ATTN = 'flash_attn'

def __from_env():
    """Apply SPARSE_BACKEND / SPARSE_DEBUG / SPARSE_ATTN_BACKEND overrides."""
    import os

    global BACKEND
    global DEBUG
    global ATTN

    backend_env = os.environ.get('SPARSE_BACKEND')
    debug_env = os.environ.get('SPARSE_DEBUG')
    attn_env = os.environ.get('SPARSE_ATTN_BACKEND')
    # Fall back to the generic variable when the sparse-specific one is unset.
    if attn_env is None:
        attn_env = os.environ.get('ATTN_BACKEND')

    # Unknown values are silently ignored (membership check excludes None).
    if backend_env in ('spconv', 'torchsparse'):
        BACKEND = backend_env
    if debug_env is not None:
        DEBUG = debug_env == '1'
    if attn_env in ('xformers', 'flash_attn'):
        ATTN = attn_env

    print(f"[SPARSE] Backend: {BACKEND}, Attention: {ATTN}")


__from_env()
31
+
32
+
33
def set_backend(backend: Literal['spconv', 'torchsparse']) -> None:
    """Override the sparse-convolution backend at runtime (no validation)."""
    global BACKEND
    BACKEND = backend

def set_debug(debug: bool) -> None:
    """Enable or disable the extra runtime consistency checks (DEBUG flag)."""
    global DEBUG
    DEBUG = debug

def set_attn(attn: Literal['xformers', 'flash_attn']) -> None:
    """Override the attention implementation used by the sparse attention ops."""
    global ATTN
    ATTN = attn
44
+
45
+
46
import importlib

# Maps public attribute names to the submodule that defines them; used by
# __getattr__ below to import lazily on first access (PEP 562).
__attributes = {
    'SparseTensor': 'basic',
    'sparse_batch_broadcast': 'basic',
    'sparse_batch_op': 'basic',
    'sparse_cat': 'basic',
    'sparse_unbind': 'basic',
    'SparseGroupNorm': 'norm',
    'SparseLayerNorm': 'norm',
    'SparseGroupNorm32': 'norm',
    'SparseLayerNorm32': 'norm',
    'SparseReLU': 'nonlinearity',
    'SparseSiLU': 'nonlinearity',
    'SparseGELU': 'nonlinearity',
    'SparseActivation': 'nonlinearity',
    'SparseLinear': 'linear',
    'sparse_scaled_dot_product_attention': 'attention',
    'SerializeMode': 'attention',
    'sparse_serialized_scaled_dot_product_self_attention': 'attention',
    'sparse_windowed_scaled_dot_product_self_attention': 'attention',
    'SparseMultiHeadAttention': 'attention',
    'SparseConv3d': 'conv',
    'SparseInverseConv3d': 'conv',
    'SparseDownsample': 'spatial',
    'SparseUpsample': 'spatial',
    'SparseSubdivide' : 'spatial'
}

# Submodules exposed lazily as attributes of this package.
__submodules = ['transformer']

__all__ = list(__attributes.keys()) + __submodules

def __getattr__(name):
    """Lazily import and cache attributes/submodules on first access."""
    if name not in globals():
        if name in __attributes:
            module_name = __attributes[name]
            module = importlib.import_module(f".{module_name}", __name__)
            # Cache in globals() so subsequent lookups skip __getattr__.
            globals()[name] = getattr(module, name)
        elif name in __submodules:
            module = importlib.import_module(f".{name}", __name__)
            globals()[name] = module
        else:
            raise AttributeError(f"module {__name__} has no attribute {name}")
    return globals()[name]


# For Pylance: eager imports so static analysis sees the lazy names.
# This branch never runs when the package is imported normally.
if __name__ == '__main__':
    from .basic import *
    from .norm import *
    from .nonlinearity import *
    from .linear import *
    from .attention import *
    from .conv import *
    from .spatial import *
    # NOTE(review): bare `import transformer` differs from the relative
    # imports above — presumably `from . import transformer` was intended;
    # harmless in practice since this guard is analysis-only dead code.
    import transformer
threeDFixer/modules/sparse/attention/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from .full_attn import *
7
+ from .serialized_attn import *
8
+ from .windowed_attn import *
9
+ from .modules import *
threeDFixer/modules/sparse/attention/full_attn.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from typing import *
7
+ import torch
8
+ from .. import SparseTensor
9
+ from .. import DEBUG, ATTN
10
+
11
+ if ATTN == 'xformers':
12
+ import xformers.ops as xops
13
+ elif ATTN == 'flash_attn':
14
+ import flash_attn
15
+ else:
16
+ raise ValueError(f"Unknown attention module: {ATTN}")
17
+
18
+
19
__all__ = [
    'sparse_scaled_dot_product_attention',
]


@overload
def sparse_scaled_dot_product_attention(qkv: SparseTensor) -> SparseTensor:
    """
    Apply scaled dot product attention to a sparse tensor.

    Args:
        qkv (SparseTensor): A [N, *, 3, H, C] sparse tensor containing Qs, Ks, and Vs.
    """
    ...

@overload
def sparse_scaled_dot_product_attention(q: SparseTensor, kv: Union[SparseTensor, torch.Tensor]) -> SparseTensor:
    """
    Apply scaled dot product attention to a sparse tensor.

    Args:
        q (SparseTensor): A [N, *, H, C] sparse tensor containing Qs.
        kv (SparseTensor or torch.Tensor): A [N, *, 2, H, C] sparse tensor or a [N, L, 2, H, C] dense tensor containing Ks and Vs.
    """
    ...

@overload
def sparse_scaled_dot_product_attention(q: torch.Tensor, kv: SparseTensor) -> torch.Tensor:
    """
    Apply scaled dot product attention to a sparse tensor.

    Args:
        q (SparseTensor): A [N, L, H, C] dense tensor containing Qs.
        kv (SparseTensor or torch.Tensor): A [N, *, 2, H, C] sparse tensor containing Ks and Vs.
    """
    ...

@overload
def sparse_scaled_dot_product_attention(q: SparseTensor, k: SparseTensor, v: SparseTensor) -> SparseTensor:
    """
    Apply scaled dot product attention to a sparse tensor.

    Args:
        q (SparseTensor): A [N, *, H, Ci] sparse tensor containing Qs.
        k (SparseTensor): A [N, *, H, Ci] sparse tensor containing Ks.
        v (SparseTensor): A [N, *, H, Co] sparse tensor containing Vs.

    Note:
        k and v are assumed to have the same coordinate map.
    """
    ...

@overload
def sparse_scaled_dot_product_attention(q: SparseTensor, k: torch.Tensor, v: torch.Tensor) -> SparseTensor:
    """
    Apply scaled dot product attention to a sparse tensor.

    Args:
        q (SparseTensor): A [N, *, H, Ci] sparse tensor containing Qs.
        k (torch.Tensor): A [N, L, H, Ci] dense tensor containing Ks.
        v (torch.Tensor): A [N, L, H, Co] dense tensor containing Vs.
    """
    ...

@overload
def sparse_scaled_dot_product_attention(q: torch.Tensor, k: SparseTensor, v: SparseTensor) -> torch.Tensor:
    """
    Apply scaled dot product attention to a sparse tensor.

    Args:
        q (torch.Tensor): A [N, L, H, Ci] dense tensor containing Qs.
        k (SparseTensor): A [N, *, H, Ci] sparse tensor containing Ks.
        v (SparseTensor): A [N, *, H, Co] sparse tensor containing Vs.
    """
    ...

def sparse_scaled_dot_product_attention(*args, **kwargs):
    """Dispatching implementation for the overloads above.

    Flattens sparse/dense q, k, v (or packed qkv / kv) into variable-length
    sequences and runs the configured backend (xformers block-diagonal mask
    or flash-attn varlen kernels). Returns a SparseTensor when q was sparse
    (via ``s.replace``), otherwise a dense [N, L, H, C] tensor.
    """
    arg_names_dict = {
        1: ['qkv'],
        2: ['q', 'kv'],
        3: ['q', 'k', 'v']
    }
    num_all_args = len(args) + len(kwargs)
    assert num_all_args in arg_names_dict, f"Invalid number of arguments, got {num_all_args}, expected 1, 2, or 3"
    for key in arg_names_dict[num_all_args][len(args):]:
        assert key in kwargs, f"Missing argument {key}"

    if num_all_args == 1:
        # Packed self-attention: one sparse tensor holding Q, K and V.
        qkv = args[0] if len(args) > 0 else kwargs['qkv']
        assert isinstance(qkv, SparseTensor), f"qkv must be a SparseTensor, got {type(qkv)}"
        assert len(qkv.shape) == 4 and qkv.shape[1] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, *, 3, H, C]"
        device = qkv.device

        s = qkv
        # Per-batch token counts derived from the sparse layout slices.
        q_seqlen = [qkv.layout[i].stop - qkv.layout[i].start for i in range(qkv.shape[0])]
        kv_seqlen = q_seqlen
        qkv = qkv.feats     # [T, 3, H, C]

    elif num_all_args == 2:
        # Cross-attention with packed KV; either side may be dense.
        q = args[0] if len(args) > 0 else kwargs['q']
        kv = args[1] if len(args) > 1 else kwargs['kv']
        assert isinstance(q, SparseTensor) and isinstance(kv, (SparseTensor, torch.Tensor)) or \
               isinstance(q, torch.Tensor) and isinstance(kv, SparseTensor), \
               f"Invalid types, got {type(q)} and {type(kv)}"
        assert q.shape[0] == kv.shape[0], f"Batch size mismatch, got {q.shape[0]} and {kv.shape[0]}"
        device = q.device

        if isinstance(q, SparseTensor):
            assert len(q.shape) == 3, f"Invalid shape for q, got {q.shape}, expected [N, *, H, C]"
            s = q
            q_seqlen = [q.layout[i].stop - q.layout[i].start for i in range(q.shape[0])]
            q = q.feats     # [T_Q, H, C]
        else:
            assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, C]"
            s = None
            N, L, H, C = q.shape
            q_seqlen = [L] * N
            q = q.reshape(N * L, H, C)   # [T_Q, H, C]

        if isinstance(kv, SparseTensor):
            assert len(kv.shape) == 4 and kv.shape[1] == 2, f"Invalid shape for kv, got {kv.shape}, expected [N, *, 2, H, C]"
            kv_seqlen = [kv.layout[i].stop - kv.layout[i].start for i in range(kv.shape[0])]
            kv = kv.feats     # [T_KV, 2, H, C]
        else:
            assert len(kv.shape) == 5, f"Invalid shape for kv, got {kv.shape}, expected [N, L, 2, H, C]"
            N, L, _, H, C = kv.shape
            kv_seqlen = [L] * N
            kv = kv.reshape(N * L, 2, H, C)   # [T_KV, 2, H, C]

    elif num_all_args == 3:
        # Fully unpacked q, k, v; k and v must be of the same kind.
        q = args[0] if len(args) > 0 else kwargs['q']
        k = args[1] if len(args) > 1 else kwargs['k']
        v = args[2] if len(args) > 2 else kwargs['v']
        assert isinstance(q, SparseTensor) and isinstance(k, (SparseTensor, torch.Tensor)) and type(k) == type(v) or \
               isinstance(q, torch.Tensor) and isinstance(k, SparseTensor) and isinstance(v, SparseTensor), \
               f"Invalid types, got {type(q)}, {type(k)}, and {type(v)}"
        assert q.shape[0] == k.shape[0] == v.shape[0], f"Batch size mismatch, got {q.shape[0]}, {k.shape[0]}, and {v.shape[0]}"
        device = q.device

        if isinstance(q, SparseTensor):
            assert len(q.shape) == 3, f"Invalid shape for q, got {q.shape}, expected [N, *, H, Ci]"
            s = q
            q_seqlen = [q.layout[i].stop - q.layout[i].start for i in range(q.shape[0])]
            q = q.feats     # [T_Q, H, Ci]
        else:
            assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, Ci]"
            s = None
            N, L, H, CI = q.shape
            q_seqlen = [L] * N
            q = q.reshape(N * L, H, CI)  # [T_Q, H, Ci]

        if isinstance(k, SparseTensor):
            assert len(k.shape) == 3, f"Invalid shape for k, got {k.shape}, expected [N, *, H, Ci]"
            assert len(v.shape) == 3, f"Invalid shape for v, got {v.shape}, expected [N, *, H, Co]"
            kv_seqlen = [k.layout[i].stop - k.layout[i].start for i in range(k.shape[0])]
            k = k.feats     # [T_KV, H, Ci]
            v = v.feats     # [T_KV, H, Co]
        else:
            assert len(k.shape) == 4, f"Invalid shape for k, got {k.shape}, expected [N, L, H, Ci]"
            assert len(v.shape) == 4, f"Invalid shape for v, got {v.shape}, expected [N, L, H, Co]"
            N, L, H, CI, CO = *k.shape, v.shape[-1]
            kv_seqlen = [L] * N
            k = k.reshape(N * L, H, CI)     # [T_KV, H, Ci]
            v = v.reshape(N * L, H, CO)     # [T_KV, H, Co]

    if DEBUG:
        if s is not None:
            for i in range(s.shape[0]):
                assert (s.coords[s.layout[i]] == i).all(), f"SparseScaledDotProductSelfAttention: batch index mismatch"
        # NOTE(review): the two shape asserts below compare a torch.Size
        # (tuple) against a list, which is always False in Python, and the
        # tensors were flattened to [T, H, C] above — these checks look
        # broken/dead; confirm intended semantics before relying on DEBUG.
        if num_all_args in [2, 3]:
            assert q.shape[:2] == [1, sum(q_seqlen)], f"SparseScaledDotProductSelfAttention: q shape mismatch"
        if num_all_args == 3:
            assert k.shape[:2] == [1, sum(kv_seqlen)], f"SparseScaledDotProductSelfAttention: k shape mismatch"
            assert v.shape[:2] == [1, sum(kv_seqlen)], f"SparseScaledDotProductSelfAttention: v shape mismatch"

    if ATTN == 'xformers':
        if num_all_args == 1:
            q, k, v = qkv.unbind(dim=1)
        elif num_all_args == 2:
            k, v = kv.unbind(dim=1)
        # xformers expects a batch dim; the block-diagonal mask encodes the
        # per-sample sequence boundaries within the single flattened batch.
        q = q.unsqueeze(0)
        k = k.unsqueeze(0)
        v = v.unsqueeze(0)
        mask = xops.fmha.BlockDiagonalMask.from_seqlens(q_seqlen, kv_seqlen)
        out = xops.memory_efficient_attention(q, k, v, mask)[0]
    elif ATTN == 'flash_attn':
        # flash-attn varlen kernels take cumulative sequence-length offsets.
        cu_seqlens_q = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(q_seqlen), dim=0)]).int().to(device)
        if num_all_args in [2, 3]:
            cu_seqlens_kv = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(kv_seqlen), dim=0)]).int().to(device)
        if num_all_args == 1:
            out = flash_attn.flash_attn_varlen_qkvpacked_func(qkv, cu_seqlens_q, max(q_seqlen))
        elif num_all_args == 2:
            out = flash_attn.flash_attn_varlen_kvpacked_func(q, kv, cu_seqlens_q, cu_seqlens_kv, max(q_seqlen), max(kv_seqlen))
        elif num_all_args == 3:
            out = flash_attn.flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max(q_seqlen), max(kv_seqlen))
    else:
        raise ValueError(f"Unknown attention module: {ATTN}")

    if s is not None:
        # Sparse query: wrap the flat output back into the sparse structure.
        return s.replace(out)
    else:
        # Dense query: restore [N, L, H, C].
        return out.reshape(N, L, H, -1)
threeDFixer/modules/sparse/attention/modules.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from typing import *
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from .. import SparseTensor
11
+ from .full_attn import sparse_scaled_dot_product_attention
12
+ from .serialized_attn import SerializeMode, sparse_serialized_scaled_dot_product_self_attention
13
+ from .windowed_attn import sparse_windowed_scaled_dot_product_self_attention
14
+ from ...attention import RotaryPositionEmbedder
15
+
16
+
17
class SparseMultiHeadRMSNorm(nn.Module):
    """Per-head RMS normalization that accepts sparse or dense input.

    L2-normalizes the last dimension (the sparse tensor's feature channels,
    or the trailing axis of a dense tensor), then rescales by sqrt(dim) and
    a learned per-head gamma. Runs in float32 and casts back to the input
    dtype.
    """

    def __init__(self, dim: int, heads: int):
        super().__init__()
        self.scale = dim ** 0.5
        self.gamma = nn.Parameter(torch.ones(heads, dim))

    def forward(self, x: Union[SparseTensor, torch.Tensor]) -> Union[SparseTensor, torch.Tensor]:
        input_dtype = x.dtype
        x = x.float()
        if isinstance(x, SparseTensor):
            # Normalize the underlying feature tensor, keep the coords.
            normed = x.replace(F.normalize(x.feats, dim=-1))
        else:
            normed = F.normalize(x, dim=-1)
        return (normed * self.gamma * self.scale).to(input_dtype)
31
+
32
+
33
class SparseMultiHeadAttention(nn.Module):
    """Multi-head attention over SparseTensor (and mixed sparse/dense) input.

    Supports self-attention in "full", "serialized", or "windowed" modes and
    cross-attention in "full" mode only. Optional RoPE (self-attn only,
    using voxel coordinates) and QK RMS-normalization.
    """

    def __init__(
        self,
        channels: int,
        num_heads: int,
        ctx_channels: Optional[int] = None,
        type: Literal["self", "cross"] = "self",
        attn_mode: Literal["full", "serialized", "windowed"] = "full",
        window_size: Optional[int] = None,
        shift_sequence: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        serialize_mode: Optional[SerializeMode] = None,
        qkv_bias: bool = True,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
    ):
        super().__init__()
        assert channels % num_heads == 0
        assert type in ["self", "cross"], f"Invalid attention type: {type}"
        assert attn_mode in ["full", "serialized", "windowed"], f"Invalid attention mode: {attn_mode}"
        assert type == "self" or attn_mode == "full", "Cross-attention only supports full attention"
        assert type == "self" or use_rope is False, "Rotary position embeddings only supported for self-attention"
        self.channels = channels
        self.ctx_channels = ctx_channels if ctx_channels is not None else channels
        self.num_heads = num_heads
        self._type = type
        self.attn_mode = attn_mode
        self.window_size = window_size
        self.shift_sequence = shift_sequence
        self.shift_window = shift_window
        self.serialize_mode = serialize_mode
        self.use_rope = use_rope
        self.qk_rms_norm = qk_rms_norm

        if self._type == "self":
            self.to_qkv = nn.Linear(channels, channels * 3, bias=qkv_bias)
        else:
            self.to_q = nn.Linear(channels, channels, bias=qkv_bias)
            self.to_kv = nn.Linear(self.ctx_channels, channels * 2, bias=qkv_bias)

        if self.qk_rms_norm:
            self.q_rms_norm = SparseMultiHeadRMSNorm(channels // num_heads, num_heads)
            self.k_rms_norm = SparseMultiHeadRMSNorm(channels // num_heads, num_heads)

        self.to_out = nn.Linear(channels, channels)

        if use_rope:
            self.rope = RotaryPositionEmbedder(channels)

    @staticmethod
    def _linear(module: nn.Linear, x: Union[SparseTensor, torch.Tensor]) -> Union[SparseTensor, torch.Tensor]:
        # Apply a Linear to either a sparse tensor's features or a dense tensor.
        if isinstance(x, SparseTensor):
            return x.replace(module(x.feats))
        else:
            return module(x)

    @staticmethod
    def _reshape_chs(x: Union[SparseTensor, torch.Tensor], shape: Tuple[int, ...]) -> Union[SparseTensor, torch.Tensor]:
        # Reshape only the channel dims, preserving [N, L] for dense input.
        if isinstance(x, SparseTensor):
            return x.reshape(*shape)
        else:
            return x.reshape(*x.shape[:2], *shape)

    def _fused_pre(self, x: Union[SparseTensor, torch.Tensor], num_fused: int) -> Union[SparseTensor, torch.Tensor]:
        # Split a fused projection into [..., num_fused, H, C_head].
        if isinstance(x, SparseTensor):
            x_feats = x.feats.unsqueeze(0)
        else:
            x_feats = x
        x_feats = x_feats.reshape(*x_feats.shape[:2], num_fused, self.num_heads, -1)
        return x.replace(x_feats.squeeze(0)) if isinstance(x, SparseTensor) else x_feats

    def _rope(self, qkv: SparseTensor) -> SparseTensor:
        # Rotate q and k by the voxel coordinates (coords[:, 1:] drops the
        # batch index column); v passes through unchanged.
        q, k, v = qkv.feats.unbind(dim=1)   # [T, H, C]
        q, k = self.rope(q, k, qkv.coords[:, 1:])
        qkv = qkv.replace(torch.stack([q, k, v], dim=1))
        return qkv

    def forward(self, x: Union[SparseTensor, torch.Tensor], context: Optional[Union[SparseTensor, torch.Tensor]] = None) -> Union[SparseTensor, torch.Tensor]:
        """Run attention; `context` is only used for cross-attention."""
        if self._type == "self":
            qkv = self._linear(self.to_qkv, x)
            qkv = self._fused_pre(qkv, num_fused=3)
            if self.use_rope:
                qkv = self._rope(qkv)
            if self.qk_rms_norm:
                q, k, v = qkv.unbind(dim=1)
                q = self.q_rms_norm(q)
                k = self.k_rms_norm(k)
                qkv = qkv.replace(torch.stack([q.feats, k.feats, v.feats], dim=1))
            if self.attn_mode == "full":
                h = sparse_scaled_dot_product_attention(qkv)
            elif self.attn_mode == "serialized":
                h = sparse_serialized_scaled_dot_product_self_attention(
                    qkv, self.window_size, serialize_mode=self.serialize_mode, shift_sequence=self.shift_sequence, shift_window=self.shift_window
                )
            elif self.attn_mode == "windowed":
                h = sparse_windowed_scaled_dot_product_self_attention(
                    qkv, self.window_size, shift_window=self.shift_window
                )
        else:
            q = self._linear(self.to_q, x)
            q = self._reshape_chs(q, (self.num_heads, -1))
            kv = self._linear(self.to_kv, context)
            kv = self._fused_pre(kv, num_fused=2)
            if self.qk_rms_norm:
                q = self.q_rms_norm(q)
                k, v = kv.unbind(dim=1)
                k = self.k_rms_norm(k)
                kv = kv.replace(torch.stack([k.feats, v.feats], dim=1))
            h = sparse_scaled_dot_product_attention(q, kv)
        # Merge heads back into a flat channel dim and project out.
        h = self._reshape_chs(h, (-1,))
        h = self._linear(self.to_out, h)
        return h
threeDFixer/modules/sparse/attention/serialized_attn.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from typing import *
7
+ from enum import Enum
8
+ import torch
9
+ import math
10
+ from .. import SparseTensor
11
+ from .. import DEBUG, ATTN
12
+
13
+ if ATTN == 'xformers':
14
+ import xformers.ops as xops
15
+ elif ATTN == 'flash_attn':
16
+ import flash_attn
17
+ else:
18
+ raise ValueError(f"Unknown attention module: {ATTN}")
19
+
20
+
21
__all__ = [
    'sparse_serialized_scaled_dot_product_self_attention',
]


class SerializeMode(Enum):
    """Space-filling-curve orderings used to serialize voxel coordinates."""
    Z_ORDER = 0
    Z_ORDER_TRANSPOSED = 1
    HILBERT = 2
    HILBERT_TRANSPOSED = 3


# Convenience list of all serialization modes, in enum-value order.
SerializeModes = [
    SerializeMode.Z_ORDER,
    SerializeMode.Z_ORDER_TRANSPOSED,
    SerializeMode.HILBERT,
    SerializeMode.HILBERT_TRANSPOSED
]
39
+
40
+
41
def calc_serialization(
    tensor: SparseTensor,
    window_size: int,
    serialize_mode: SerializeMode = SerializeMode.Z_ORDER,
    shift_sequence: int = 0,
    shift_window: Tuple[int, int, int] = (0, 0, 0)
) -> Tuple[torch.Tensor, torch.Tensor, List[int], List[int]]:
    """
    Calculate serialization and partitioning for a set of coordinates.

    Args:
        tensor (SparseTensor): The input tensor.
        window_size (int): The window size to use.
        serialize_mode (SerializeMode): The serialization mode to use.
        shift_sequence (int): The shift of serialized sequence.
        shift_window (Tuple[int, int, int]): The shift of serialized coordinates.

    Returns:
        (torch.Tensor): Forwards indices (gather original -> serialized order).
        (torch.Tensor): Backwards indices (scatter serialized -> original order).
        (List[int]): Sequence lengths, one per window.
        (List[int]): Batch index of each window.
    """
    fwd_indices = []
    bwd_indices = []
    seq_lens = []
    seq_batch_indices = []
    # Running offset into the (possibly padded) serialized sequence.
    offsets = [0]

    # Lazy import so the dependency is only required when serialization runs.
    if 'vox2seq' not in globals():
        import vox2seq

    # Serialize the input: encode each (shifted) voxel coordinate to a
    # 1-D code along the chosen space-filling curve.
    serialize_coords = tensor.coords[:, 1:].clone()
    serialize_coords += torch.tensor(shift_window, dtype=torch.int32, device=tensor.device).reshape(1, 3)
    if serialize_mode == SerializeMode.Z_ORDER:
        code = vox2seq.encode(serialize_coords, mode='z_order', permute=[0, 1, 2])
    elif serialize_mode == SerializeMode.Z_ORDER_TRANSPOSED:
        code = vox2seq.encode(serialize_coords, mode='z_order', permute=[1, 0, 2])
    elif serialize_mode == SerializeMode.HILBERT:
        code = vox2seq.encode(serialize_coords, mode='hilbert', permute=[0, 1, 2])
    elif serialize_mode == SerializeMode.HILBERT_TRANSPOSED:
        code = vox2seq.encode(serialize_coords, mode='hilbert', permute=[1, 0, 2])
    else:
        raise ValueError(f"Unknown serialize mode: {serialize_mode}")

    for bi, s in enumerate(tensor.layout):
        num_points = s.stop - s.start
        num_windows = (num_points + window_size - 1) // window_size
        # Fractional per-window size so points split as evenly as possible.
        valid_window_size = num_points / num_windows
        to_ordered = torch.argsort(code[s.start:s.stop])
        if num_windows == 1:
            # Whole batch fits in one window: no padding needed.
            fwd_indices.append(to_ordered)
            bwd_indices.append(torch.zeros_like(to_ordered).scatter_(0, to_ordered, torch.arange(num_points, device=tensor.device)))
            fwd_indices[-1] += s.start
            bwd_indices[-1] += offsets[-1]
            seq_lens.append(num_points)
            seq_batch_indices.append(bi)
            offsets.append(offsets[-1] + seq_lens[-1])
        else:
            # Partition the input: each window is padded to exactly
            # window_size by wrapping around the ordered sequence; only the
            # [valid_start, valid_end) span contributes to bwd_index.
            offset = 0
            mids = [(i + 0.5) * valid_window_size + shift_sequence for i in range(num_windows)]
            split = [math.floor(i * valid_window_size + shift_sequence) for i in range(num_windows + 1)]
            bwd_index = torch.zeros((num_points,), dtype=torch.int64, device=tensor.device)
            for i in range(num_windows):
                mid = mids[i]
                valid_start = split[i]
                valid_end = split[i + 1]
                padded_start = math.floor(mid - 0.5 * window_size)
                padded_end = padded_start + window_size
                # Modulo wraps negative/overflowing padded indices into range.
                fwd_indices.append(to_ordered[torch.arange(padded_start, padded_end, device=tensor.device) % num_points])
                offset += valid_start - padded_start
                bwd_index.scatter_(0, fwd_indices[-1][valid_start-padded_start:valid_end-padded_start], torch.arange(offset, offset + valid_end - valid_start, device=tensor.device))
                offset += padded_end - valid_start
                fwd_indices[-1] += s.start
            seq_lens.extend([window_size] * num_windows)
            seq_batch_indices.extend([bi] * num_windows)
            bwd_indices.append(bwd_index + offsets[-1])
            offsets.append(offsets[-1] + num_windows * window_size)

    fwd_indices = torch.cat(fwd_indices)
    bwd_indices = torch.cat(bwd_indices)

    return fwd_indices, bwd_indices, seq_lens, seq_batch_indices
124
+
125
+ def sparse_serialized_scaled_dot_product_self_attention(
126
+ qkv: SparseTensor,
127
+ window_size: int,
128
+ serialize_mode: SerializeMode = SerializeMode.Z_ORDER,
129
+ shift_sequence: int = 0,
130
+ shift_window: Tuple[int, int, int] = (0, 0, 0)
131
+ ) -> SparseTensor:
132
+ """
133
+ Apply serialized scaled dot product self attention to a sparse tensor.
134
+
135
+ Args:
136
+ qkv (SparseTensor): [N, *, 3, H, C] sparse tensor containing Qs, Ks, and Vs.
137
+ window_size (int): The window size to use.
138
+ serialize_mode (SerializeMode): The serialization mode to use.
139
+ shift_sequence (int): The shift of serialized sequence.
140
+ shift_window (Tuple[int, int, int]): The shift of serialized coordinates.
141
+ shift (int): The shift to use.
142
+ """
143
+ assert len(qkv.shape) == 4 and qkv.shape[1] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, *, 3, H, C]"
144
+
145
+ serialization_spatial_cache_name = f'serialization_{serialize_mode}_{window_size}_{shift_sequence}_{shift_window}'
146
+ serialization_spatial_cache = qkv.get_spatial_cache(serialization_spatial_cache_name)
147
+ if serialization_spatial_cache is None:
148
+ fwd_indices, bwd_indices, seq_lens, seq_batch_indices = calc_serialization(qkv, window_size, serialize_mode, shift_sequence, shift_window)
149
+ qkv.register_spatial_cache(serialization_spatial_cache_name, (fwd_indices, bwd_indices, seq_lens, seq_batch_indices))
150
+ else:
151
+ fwd_indices, bwd_indices, seq_lens, seq_batch_indices = serialization_spatial_cache
152
+
153
+ M = fwd_indices.shape[0]
154
+ T = qkv.feats.shape[0]
155
+ H = qkv.feats.shape[2]
156
+ C = qkv.feats.shape[3]
157
+
158
+ qkv_feats = qkv.feats[fwd_indices] # [M, 3, H, C]
159
+
160
+ if DEBUG:
161
+ start = 0
162
+ qkv_coords = qkv.coords[fwd_indices]
163
+ for i in range(len(seq_lens)):
164
+ assert (qkv_coords[start:start+seq_lens[i], 0] == seq_batch_indices[i]).all(), f"SparseWindowedScaledDotProductSelfAttention: batch index mismatch"
165
+ start += seq_lens[i]
166
+
167
+ if all([seq_len == window_size for seq_len in seq_lens]):
168
+ B = len(seq_lens)
169
+ N = window_size
170
+ qkv_feats = qkv_feats.reshape(B, N, 3, H, C)
171
+ if ATTN == 'xformers':
172
+ q, k, v = qkv_feats.unbind(dim=2) # [B, N, H, C]
173
+ out = xops.memory_efficient_attention(q, k, v) # [B, N, H, C]
174
+ elif ATTN == 'flash_attn':
175
+ out = flash_attn.flash_attn_qkvpacked_func(qkv_feats) # [B, N, H, C]
176
+ else:
177
+ raise ValueError(f"Unknown attention module: {ATTN}")
178
+ out = out.reshape(B * N, H, C) # [M, H, C]
179
+ else:
180
+ if ATTN == 'xformers':
181
+ q, k, v = qkv_feats.unbind(dim=1) # [M, H, C]
182
+ q = q.unsqueeze(0) # [1, M, H, C]
183
+ k = k.unsqueeze(0) # [1, M, H, C]
184
+ v = v.unsqueeze(0) # [1, M, H, C]
185
+ mask = xops.fmha.BlockDiagonalMask.from_seqlens(seq_lens)
186
+ out = xops.memory_efficient_attention(q, k, v, mask)[0] # [M, H, C]
187
+ elif ATTN == 'flash_attn':
188
+ cu_seqlens = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(seq_lens), dim=0)], dim=0) \
189
+ .to(qkv.device).int()
190
+ out = flash_attn.flash_attn_varlen_qkvpacked_func(qkv_feats, cu_seqlens, max(seq_lens)) # [M, H, C]
191
+
192
+ out = out[bwd_indices] # [T, H, C]
193
+
194
+ if DEBUG:
195
+ qkv_coords = qkv_coords[bwd_indices]
196
+ assert torch.equal(qkv_coords, qkv.coords), "SparseWindowedScaledDotProductSelfAttention: coordinate mismatch"
197
+
198
+ return qkv.replace(out)
threeDFixer/modules/sparse/attention/windowed_attn.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from typing import *
7
+ import torch
8
+ import math
9
+ from .. import SparseTensor
10
+ from .. import DEBUG, ATTN
11
+
12
+ if ATTN == 'xformers':
13
+ import xformers.ops as xops
14
+ elif ATTN == 'flash_attn':
15
+ import flash_attn
16
+ else:
17
+ raise ValueError(f"Unknown attention module: {ATTN}")
18
+
19
+
20
+ __all__ = [
21
+ 'sparse_windowed_scaled_dot_product_self_attention',
22
+ ]
23
+
24
+
25
def calc_window_partition(
    tensor: SparseTensor,
    window_size: Union[int, Tuple[int, ...]],
    shift_window: Union[int, Tuple[int, ...]] = 0
) -> Tuple[torch.Tensor, torch.Tensor, List[int], List[int]]:
    """
    Calculate serialization and partitioning for a set of coordinates.

    Voxels are grouped into axis-aligned windows of `window_size` (optionally
    shifted by `shift_window`), and a permutation is computed that makes each
    window's voxels contiguous.

    Args:
        tensor (SparseTensor): The input tensor; coords are [batch, x, y, z, ...].
        window_size (int): The window size to use (scalar is broadcast to all spatial dims).
        shift_window (Tuple[int, ...]): The shift of serialized coordinates.

    Returns:
        (torch.Tensor): Forward indices (gather with these to serialize into windows).
        (torch.Tensor): Backward indices (gather with these to restore original order).
        (List[int]): Sequence lengths, one per non-empty window.
        (List[int]): Sequence batch indices, one per non-empty window.
    """
    # Number of spatial dimensions (first coord column is the batch index).
    DIM = tensor.coords.shape[1] - 1
    # Broadcast scalar window/shift to one value per spatial dimension.
    shift_window = (shift_window,) * DIM if isinstance(shift_window, int) else shift_window
    window_size = (window_size,) * DIM if isinstance(window_size, int) else window_size
    shifted_coords = tensor.coords.clone().detach()
    shifted_coords[:, 1:] += torch.tensor(shift_window, device=tensor.device, dtype=torch.int32).unsqueeze(0)

    # Per-axis window counts, then mixed-radix offsets so that
    # (batch, wx, wy, wz) maps to a unique linear window id.
    MAX_COORDS = shifted_coords[:, 1:].max(dim=0).values.tolist()
    NUM_WINDOWS = [math.ceil((mc + 1) / ws) for mc, ws in zip(MAX_COORDS, window_size)]
    OFFSET = torch.cumprod(torch.tensor([1] + NUM_WINDOWS[::-1]), dim=0).tolist()[::-1]

    # Quantize spatial coords to window indices, then linearize (batch included
    # via OFFSET[0], so windows never straddle batches).
    shifted_coords[:, 1:] //= torch.tensor(window_size, device=tensor.device, dtype=torch.int32).unsqueeze(0)
    shifted_indices = (shifted_coords * torch.tensor(OFFSET, device=tensor.device, dtype=torch.int32).unsqueeze(0)).sum(dim=1)
    # Stable grouping: sorting by window id makes each window contiguous.
    fwd_indices = torch.argsort(shifted_indices)
    bwd_indices = torch.empty_like(fwd_indices)
    bwd_indices[fwd_indices] = torch.arange(fwd_indices.shape[0], device=tensor.device)
    # Window populations; empty windows are dropped below.
    seq_lens = torch.bincount(shifted_indices)
    # Recover the batch index of each window id (ids are batch-major).
    seq_batch_indices = torch.arange(seq_lens.shape[0], device=tensor.device, dtype=torch.int32) // OFFSET[0]
    mask = seq_lens != 0
    seq_lens = seq_lens[mask].tolist()
    seq_batch_indices = seq_batch_indices[mask].tolist()

    return fwd_indices, bwd_indices, seq_lens, seq_batch_indices
66
+
67
+
68
def sparse_windowed_scaled_dot_product_self_attention(
    qkv: SparseTensor,
    window_size: int,
    shift_window: Tuple[int, int, int] = (0, 0, 0)
) -> SparseTensor:
    """
    Apply windowed scaled dot product self attention to a sparse tensor.

    Args:
        qkv (SparseTensor): [N, *, 3, H, C] sparse tensor containing Qs, Ks, and Vs.
        window_size (int): The window size to use.
        shift_window (Tuple[int, int, int]): The shift of serialized coordinates.

    Returns:
        SparseTensor: Attention output with the same coordinates and order as `qkv`.
    """
    assert len(qkv.shape) == 4 and qkv.shape[1] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, *, 3, H, C]"

    # The window partition depends only on coordinates, so cache it on the
    # tensor and reuse it for every layer sharing this window configuration.
    serialization_spatial_cache_name = f'window_partition_{window_size}_{shift_window}'
    serialization_spatial_cache = qkv.get_spatial_cache(serialization_spatial_cache_name)
    if serialization_spatial_cache is None:
        fwd_indices, bwd_indices, seq_lens, seq_batch_indices = calc_window_partition(qkv, window_size, shift_window)
        qkv.register_spatial_cache(serialization_spatial_cache_name, (fwd_indices, bwd_indices, seq_lens, seq_batch_indices))
    else:
        fwd_indices, bwd_indices, seq_lens, seq_batch_indices = serialization_spatial_cache

    M = fwd_indices.shape[0]    # total voxels after window serialization
    T = qkv.feats.shape[0]      # total voxels in original order
    H = qkv.feats.shape[2]      # number of attention heads
    C = qkv.feats.shape[3]      # channels per head

    qkv_feats = qkv.feats[fwd_indices]  # [M, 3, H, C]

    if DEBUG:
        start = 0
        qkv_coords = qkv.coords[fwd_indices]
        for i in range(len(seq_lens)):
            seq_coords = qkv_coords[start:start+seq_lens[i]]
            assert (seq_coords[:, 0] == seq_batch_indices[i]).all(), f"SparseWindowedScaledDotProductSelfAttention: batch index mismatch"
            assert (seq_coords[:, 1:].max(dim=0).values - seq_coords[:, 1:].min(dim=0).values < window_size).all(), \
                f"SparseWindowedScaledDotProductSelfAttention: window size exceeded"
            start += seq_lens[i]

    if all([seq_len == window_size for seq_len in seq_lens]):
        # Fast path: every window is full, so run ordinary batched attention.
        B = len(seq_lens)
        N = window_size
        qkv_feats = qkv_feats.reshape(B, N, 3, H, C)
        if ATTN == 'xformers':
            q, k, v = qkv_feats.unbind(dim=2)                       # [B, N, H, C]
            out = xops.memory_efficient_attention(q, k, v)          # [B, N, H, C]
        elif ATTN == 'flash_attn':
            out = flash_attn.flash_attn_qkvpacked_func(qkv_feats)   # [B, N, H, C]
        else:
            raise ValueError(f"Unknown attention module: {ATTN}")
        out = out.reshape(B * N, H, C)                              # [M, H, C]
    else:
        # Ragged path: windows have different populations; use varlen kernels.
        if ATTN == 'xformers':
            q, k, v = qkv_feats.unbind(dim=1)   # [M, H, C]
            q = q.unsqueeze(0)                  # [1, M, H, C]
            k = k.unsqueeze(0)                  # [1, M, H, C]
            v = v.unsqueeze(0)                  # [1, M, H, C]
            mask = xops.fmha.BlockDiagonalMask.from_seqlens(seq_lens)
            out = xops.memory_efficient_attention(q, k, v, mask)[0] # [M, H, C]
        elif ATTN == 'flash_attn':
            cu_seqlens = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(seq_lens), dim=0)], dim=0) \
                .to(qkv.device).int()
            out = flash_attn.flash_attn_varlen_qkvpacked_func(qkv_feats, cu_seqlens, max(seq_lens)) # [M, H, C]
        else:
            # Fix: mirror the fast path's else so an unknown backend fails
            # loudly here instead of raising NameError on `out` below.
            raise ValueError(f"Unknown attention module: {ATTN}")

    out = out[bwd_indices]  # [T, H, C] -- scatter back to original voxel order

    if DEBUG:
        qkv_coords = qkv_coords[bwd_indices]
        assert torch.equal(qkv_coords, qkv.coords), "SparseWindowedScaledDotProductSelfAttention: coordinate mismatch"

    return qkv.replace(out)
threeDFixer/modules/sparse/basic.py ADDED
@@ -0,0 +1,464 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from typing import *
7
+ import torch
8
+ import torch.nn as nn
9
+ from . import BACKEND, DEBUG
10
+ SparseTensorData = None # Lazy import
11
+
12
+
13
+ __all__ = [
14
+ 'SparseTensor',
15
+ 'sparse_batch_broadcast',
16
+ 'sparse_batch_op',
17
+ 'sparse_cat',
18
+ 'sparse_unbind',
19
+ ]
20
+
21
+
22
class SparseTensor:
    """
    Sparse tensor with support for both torchsparse and spconv backends.

    Parameters:
    - feats (torch.Tensor): Features of the sparse tensor, one row per voxel.
    - coords (torch.Tensor): Integer coordinates of the sparse tensor; column 0
      is the batch index, the remaining columns are spatial coordinates.
    - shape (torch.Size): Shape of the sparse tensor.
    - layout (List[slice]): Layout of the sparse tensor for each batch.
    - data (SparseTensorData): Backend sparse tensor data used for convolution.

    NOTE:
        - Data corresponding to a same batch should be contiguous.
        - Coords should be in [0, 1023]
    """
    @overload
    def __init__(self, feats: torch.Tensor, coords: torch.Tensor, shape: Optional[torch.Size] = None, layout: Optional[List[slice]] = None, **kwargs): ...

    @overload
    def __init__(self, data, shape: Optional[torch.Size] = None, layout: Optional[List[slice]] = None, **kwargs): ...

    def __init__(self, *args, **kwargs):
        """Construct from (feats, coords) or wrap an existing backend tensor."""
        # Lazy import of sparse tensor backend: defer torchsparse/spconv import
        # until the first SparseTensor is actually created.
        global SparseTensorData
        if SparseTensorData is None:
            import importlib
            if BACKEND == 'torchsparse':
                SparseTensorData = importlib.import_module('torchsparse').SparseTensor
            elif BACKEND == 'spconv':
                SparseTensorData = importlib.import_module('spconv.pytorch').SparseConvTensor

        # Overload dispatch: method 0 = (feats, coords, ...), method 1 = (data, ...).
        # A tensor as the first positional argument selects method 0.
        method_id = 0
        if len(args) != 0:
            method_id = 0 if isinstance(args[0], torch.Tensor) else 1
        else:
            method_id = 1 if 'data' in kwargs else 0

        if method_id == 0:
            # Fill positional slots with None, then let keyword args override.
            feats, coords, shape, layout = args + (None,) * (4 - len(args))
            if 'feats' in kwargs:
                feats = kwargs['feats']
                del kwargs['feats']
            if 'coords' in kwargs:
                coords = kwargs['coords']
                del kwargs['coords']
            if 'shape' in kwargs:
                shape = kwargs['shape']
                del kwargs['shape']
            if 'layout' in kwargs:
                layout = kwargs['layout']
                del kwargs['layout']

            # Derive shape/layout from the data when the caller omits them.
            if shape is None:
                shape = self.__cal_shape(feats, coords)
            if layout is None:
                layout = self.__cal_layout(coords, shape[0])
            if BACKEND == 'torchsparse':
                self.data = SparseTensorData(feats, coords, **kwargs)
            elif BACKEND == 'spconv':
                # spconv needs an explicit spatial extent; use the tight bound.
                spatial_shape = list(coords.max(0)[0] + 1)[1:]
                self.data = SparseTensorData(feats.reshape(feats.shape[0], -1), coords, spatial_shape, shape[0], **kwargs)
                # Keep the original (possibly multi-dim) feature view; spconv
                # itself only stores the flattened copy passed above.
                self.data._features = feats
        elif method_id == 1:
            data, shape, layout = args + (None,) * (3 - len(args))
            if 'data' in kwargs:
                data = kwargs['data']
                del kwargs['data']
            if 'shape' in kwargs:
                shape = kwargs['shape']
                del kwargs['shape']
            if 'layout' in kwargs:
                layout = kwargs['layout']
                del kwargs['layout']

            self.data = data
            if shape is None:
                shape = self.__cal_shape(self.feats, self.coords)
            if layout is None:
                layout = self.__cal_layout(self.coords, shape[0])

        self._shape = shape
        self._layout = layout
        # _scale tracks downsampling applied by strided convolutions.
        self._scale = kwargs.get('scale', (1, 1, 1))
        # _spatial_cache holds per-scale auxiliary data (window partitions, etc.).
        self._spatial_cache = kwargs.get('spatial_cache', {})

        if DEBUG:
            # Sanity-check the invariants documented in the class docstring.
            try:
                assert self.feats.shape[0] == self.coords.shape[0], f"Invalid feats shape: {self.feats.shape}, coords shape: {self.coords.shape}"
                assert self.shape == self.__cal_shape(self.feats, self.coords), f"Invalid shape: {self.shape}"
                assert self.layout == self.__cal_layout(self.coords, self.shape[0]), f"Invalid layout: {self.layout}"
                for i in range(self.shape[0]):
                    assert torch.all(self.coords[self.layout[i], 0] == i), f"The data of batch {i} is not contiguous"
            except Exception as e:
                print('Debugging information:')
                print(f"- Shape: {self.shape}")
                print(f"- Layout: {self.layout}")
                print(f"- Scale: {self._scale}")
                print(f"- Coords: {self.coords}")
                raise e

    def __cal_shape(self, feats, coords):
        # Shape is [batch_size, *feature_dims]; batch size comes from the
        # largest batch index present in coords.
        shape = []
        shape.append(coords[:, 0].max().item() + 1)
        shape.extend([*feats.shape[1:]])
        return torch.Size(shape)

    def __cal_layout(self, coords, batch_size):
        # One contiguous row slice per batch (relies on rows being grouped
        # by batch index, as required by the class invariant).
        seq_len = torch.bincount(coords[:, 0], minlength=batch_size)
        offset = torch.cumsum(seq_len, dim=0)
        layout = [slice((offset[i] - seq_len[i]).item(), offset[i].item()) for i in range(batch_size)]
        return layout

    @property
    def shape(self) -> torch.Size:
        return self._shape

    def dim(self) -> int:
        return len(self.shape)

    @property
    def layout(self) -> List[slice]:
        return self._layout

    @property
    def feats(self) -> torch.Tensor:
        # Backend-agnostic access to the feature rows.
        if BACKEND == 'torchsparse':
            return self.data.F
        elif BACKEND == 'spconv':
            return self.data.features

    @feats.setter
    def feats(self, value: torch.Tensor):
        if BACKEND == 'torchsparse':
            self.data.F = value
        elif BACKEND == 'spconv':
            self.data.features = value

    @property
    def coords(self) -> torch.Tensor:
        # Backend-agnostic access to the [batch, x, y, z] coordinate rows.
        if BACKEND == 'torchsparse':
            return self.data.C
        elif BACKEND == 'spconv':
            return self.data.indices

    @coords.setter
    def coords(self, value: torch.Tensor):
        if BACKEND == 'torchsparse':
            self.data.C = value
        elif BACKEND == 'spconv':
            self.data.indices = value

    @property
    def dtype(self):
        return self.feats.dtype

    @property
    def device(self):
        return self.feats.device

    @overload
    def to(self, dtype: torch.dtype) -> 'SparseTensor': ...

    @overload
    def to(self, device: Optional[Union[str, torch.device]] = None, dtype: Optional[torch.dtype] = None) -> 'SparseTensor': ...

    def to(self, *args, **kwargs) -> 'SparseTensor':
        """Move/cast like torch.Tensor.to; dtype only applies to feats."""
        device = None
        dtype = None
        if len(args) == 2:
            device, dtype = args
        elif len(args) == 1:
            # Single positional arg may be either a dtype or a device.
            if isinstance(args[0], torch.dtype):
                dtype = args[0]
            else:
                device = args[0]
        if 'dtype' in kwargs:
            assert dtype is None, "to() received multiple values for argument 'dtype'"
            dtype = kwargs['dtype']
        if 'device' in kwargs:
            assert device is None, "to() received multiple values for argument 'device'"
            device = kwargs['device']

        new_feats = self.feats.to(device=device, dtype=dtype)
        # Coords stay integer-typed; only their device follows.
        new_coords = self.coords.to(device=device)
        return self.replace(new_feats, new_coords)

    def type(self, dtype):
        # Cast features only; coordinates keep their integer dtype.
        new_feats = self.feats.type(dtype)
        return self.replace(new_feats)

    def cpu(self) -> 'SparseTensor':
        new_feats = self.feats.cpu()
        new_coords = self.coords.cpu()
        return self.replace(new_feats, new_coords)

    def cuda(self) -> 'SparseTensor':
        new_feats = self.feats.cuda()
        new_coords = self.coords.cuda()
        return self.replace(new_feats, new_coords)

    def half(self) -> 'SparseTensor':
        new_feats = self.feats.half()
        return self.replace(new_feats)

    def float(self) -> 'SparseTensor':
        new_feats = self.feats.float()
        return self.replace(new_feats)

    def detach(self) -> 'SparseTensor':
        new_coords = self.coords.detach()
        new_feats = self.feats.detach()
        return self.replace(new_feats, new_coords)

    def dense(self) -> torch.Tensor:
        # Both backends expose a dense() conversion on their data object.
        if BACKEND == 'torchsparse':
            return self.data.dense()
        elif BACKEND == 'spconv':
            return self.data.dense()

    def reshape(self, *shape) -> 'SparseTensor':
        # Reshape the per-voxel feature dims; the voxel dimension is fixed.
        new_feats = self.feats.reshape(self.feats.shape[0], *shape)
        return self.replace(new_feats)

    def unbind(self, dim: int) -> List['SparseTensor']:
        return sparse_unbind(self, dim)

    def replace(self, feats: torch.Tensor, coords: Optional[torch.Tensor] = None) -> 'SparseTensor':
        """Return a new SparseTensor sharing this one's structure but with new
        feats (and optionally coords); layout, scale and caches are carried over."""
        new_shape = [self.shape[0]]
        new_shape.extend(feats.shape[1:])
        if BACKEND == 'torchsparse':
            new_data = SparseTensorData(
                feats=feats,
                coords=self.data.coords if coords is None else coords,
                stride=self.data.stride,
                spatial_range=self.data.spatial_range,
            )
            # Share the kernel-map caches so convs don't recompute them.
            new_data._caches = self.data._caches
        elif BACKEND == 'spconv':
            new_data = SparseTensorData(
                self.data.features.reshape(self.data.features.shape[0], -1),
                self.data.indices,
                self.data.spatial_shape,
                self.data.batch_size,
                self.data.grid,
                self.data.voxel_num,
                self.data.indice_dict
            )
            new_data._features = feats
            # Carry over spconv bookkeeping/config state.
            new_data.benchmark = self.data.benchmark
            new_data.benchmark_record = self.data.benchmark_record
            new_data.thrust_allocator = self.data.thrust_allocator
            new_data._timer = self.data._timer
            new_data.force_algo = self.data.force_algo
            new_data.int8_scale = self.data.int8_scale
            if coords is not None:
                new_data.indices = coords
        new_tensor = SparseTensor(new_data, shape=torch.Size(new_shape), layout=self.layout, scale=self._scale, spatial_cache=self._spatial_cache)
        return new_tensor

    @staticmethod
    def full(aabb, dim, value, dtype=torch.float32, device=None) -> 'SparseTensor':
        """Build a fully-populated SparseTensor: every voxel in the inclusive
        box aabb = [x0, y0, z0, x1, y1, z1], replicated for N batches, with a
        constant C-channel feature `value`. dim is the pair (N, C)."""
        N, C = dim
        x = torch.arange(aabb[0], aabb[3] + 1)
        y = torch.arange(aabb[1], aabb[4] + 1)
        z = torch.arange(aabb[2], aabb[5] + 1)
        coords = torch.stack(torch.meshgrid(x, y, z, indexing='ij'), dim=-1).reshape(-1, 3)
        # Prepend the batch column and tile the grid once per batch.
        coords = torch.cat([
            torch.arange(N).view(-1, 1).repeat(1, coords.shape[0]).view(-1, 1),
            coords.repeat(N, 1),
        ], dim=1).to(dtype=torch.int32, device=device)
        feats = torch.full((coords.shape[0], C), value, dtype=dtype, device=device)
        return SparseTensor(feats=feats, coords=coords)

    def __merge_sparse_cache(self, other: 'SparseTensor') -> dict:
        # Union of both operands' spatial caches; per-scale dicts are merged,
        # with `other` winning on key collisions inside a scale.
        new_cache = {}
        for k in set(list(self._spatial_cache.keys()) + list(other._spatial_cache.keys())):
            if k in self._spatial_cache:
                new_cache[k] = self._spatial_cache[k]
            if k in other._spatial_cache:
                if k not in new_cache:
                    new_cache[k] = other._spatial_cache[k]
                else:
                    new_cache[k].update(other._spatial_cache[k])
        return new_cache

    def __neg__(self) -> 'SparseTensor':
        return self.replace(-self.feats)

    def __elemwise__(self, other: Union[torch.Tensor, 'SparseTensor'], op: callable) -> 'SparseTensor':
        """Apply a binary op elementwise on feats, batch-broadcasting a dense
        operand when possible and merging caches when both are sparse."""
        if isinstance(other, torch.Tensor):
            try:
                other = torch.broadcast_to(other, self.shape)
                other = sparse_batch_broadcast(self, other)
            except:
                # Best-effort: if batch broadcasting fails, fall through and
                # let the op's own broadcasting rules apply (or raise).
                pass
        if isinstance(other, SparseTensor):
            other = other.feats
        new_feats = op(self.feats, other)
        new_tensor = self.replace(new_feats)
        if isinstance(other, SparseTensor):
            # NOTE(review): `other` was rebound to a plain tensor above, so this
            # branch appears unreachable -- confirm whether cache merging on
            # sparse-sparse ops is intended to run.
            new_tensor._spatial_cache = self.__merge_sparse_cache(other)
        return new_tensor

    def __add__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
        return self.__elemwise__(other, torch.add)

    def __radd__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
        return self.__elemwise__(other, torch.add)

    def __sub__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
        return self.__elemwise__(other, torch.sub)

    def __rsub__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
        return self.__elemwise__(other, lambda x, y: torch.sub(y, x))

    def __mul__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
        return self.__elemwise__(other, torch.mul)

    def __rmul__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
        return self.__elemwise__(other, torch.mul)

    def __truediv__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
        return self.__elemwise__(other, torch.div)

    def __rtruediv__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
        return self.__elemwise__(other, lambda x, y: torch.div(y, x))

    def __getitem__(self, idx):
        """Select batches (int, slice, bool mask, or index tensor) and return
        a new SparseTensor with batch indices renumbered from 0."""
        if isinstance(idx, int):
            idx = [idx]
        elif isinstance(idx, slice):
            idx = range(*idx.indices(self.shape[0]))
        elif isinstance(idx, torch.Tensor):
            if idx.dtype == torch.bool:
                assert idx.shape == (self.shape[0],), f"Invalid index shape: {idx.shape}"
                idx = idx.nonzero().squeeze(1)
            elif idx.dtype in [torch.int32, torch.int64]:
                assert len(idx.shape) == 1, f"Invalid index shape: {idx.shape}"
            else:
                raise ValueError(f"Unknown index type: {idx.dtype}")
        else:
            raise ValueError(f"Unknown index type: {type(idx)}")

        coords = []
        feats = []
        for new_idx, old_idx in enumerate(idx):
            coords.append(self.coords[self.layout[old_idx]].clone())
            # Renumber the batch column so the result is densely indexed.
            coords[-1][:, 0] = new_idx
            feats.append(self.feats[self.layout[old_idx]])
        coords = torch.cat(coords, dim=0).contiguous()
        feats = torch.cat(feats, dim=0).contiguous()
        return SparseTensor(feats=feats, coords=coords)

    def register_spatial_cache(self, key, value) -> None:
        """
        Register a spatial cache.
        The spatial cache can be anything you want to cache.
        The registration and retrieval of the cache is based on current scale.
        """
        scale_key = str(self._scale)
        if scale_key not in self._spatial_cache:
            self._spatial_cache[scale_key] = {}
        self._spatial_cache[scale_key][key] = value

    def get_spatial_cache(self, key=None):
        """
        Get a spatial cache (or the whole per-scale dict when key is None).
        """
        scale_key = str(self._scale)
        cur_scale_cache = self._spatial_cache.get(scale_key, {})
        if key is None:
            return cur_scale_cache
        return cur_scale_cache.get(key, None)
395
+
396
+
397
def sparse_batch_broadcast(input: SparseTensor, other: torch.Tensor) -> torch.Tensor:
    """
    Broadcast a per-batch tensor across the rows of a sparse tensor.

    Args:
        input (SparseTensor): Sparse tensor whose batch layout drives the broadcast.
        other (torch.Tensor): Tensor indexed by batch; entry ``other[k]`` is copied
            to every row belonging to batch k.

    Returns:
        torch.Tensor: Tensor shaped like ``input.feats`` where each row holds its
        batch's entry of ``other``.
    """
    # (Fix: dropped the unused `coords` unpack and corrected the docstring,
    # which previously described sparse_batch_op's signature.)
    broadcasted = torch.zeros_like(input.feats)
    for k in range(input.shape[0]):
        # layout[k] is the contiguous row slice belonging to batch k.
        broadcasted[input.layout[k]] = other[k]
    return broadcasted
411
+
412
+
413
def sparse_batch_op(input: SparseTensor, other: torch.Tensor, op: callable = torch.add) -> SparseTensor:
    """
    Broadcast a per-batch tensor over a sparse tensor's rows, then combine
    the two with ``op`` (default: addition).

    Args:
        input (SparseTensor): Sparse tensor supplying feats and batch layout.
        other (torch.Tensor): Per-batch tensor to broadcast.
        op (callable): Binary op applied as ``op(input.feats, broadcasted)``.
    """
    per_row = sparse_batch_broadcast(input, other)
    combined = op(input.feats, per_row)
    return input.replace(combined)
423
+
424
+
425
def sparse_cat(inputs: List[SparseTensor], dim: int = 0) -> SparseTensor:
    """
    Concatenate a list of sparse tensors.

    Args:
        inputs (List[SparseTensor]): List of sparse tensors to concatenate.
        dim (int): 0 concatenates along the batch dimension (batch indices are
            re-offset); any other value concatenates feature dimensions.
    """
    if dim != 0:
        # Feature-dim concat: coordinates are shared, only feats grow.
        stacked_feats = torch.cat([t.feats for t in inputs], dim=dim)
        return inputs[0].replace(stacked_feats)

    # Batch-dim concat: shift each tensor's batch column before stacking.
    shifted_coords = []
    batch_offset = 0
    for t in inputs:
        c = t.coords.clone()
        c[:, 0] += batch_offset
        shifted_coords.append(c)
        batch_offset += t.shape[0]
    all_coords = torch.cat(shifted_coords, dim=0)
    all_feats = torch.cat([t.feats for t in inputs], dim=0)
    return SparseTensor(
        coords=all_coords,
        feats=all_feats,
    )
450
+
451
+
452
def sparse_unbind(input: SparseTensor, dim: int) -> List[SparseTensor]:
    """
    Unbind a sparse tensor along a dimension.

    Args:
        input (SparseTensor): Sparse tensor to unbind.
        dim (int): 0 splits by batch; any other value unbinds a feature dim.
    """
    if dim == 0:
        # Per-batch split: __getitem__ renumbers batch indices from 0.
        return [input[b] for b in range(input.shape[0])]
    per_slice_feats = input.feats.unbind(dim)
    return [input.replace(chunk) for chunk in per_slice_feats]
threeDFixer/modules/sparse/conv/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from .. import BACKEND
7
+
8
+
9
+ SPCONV_ALGO = 'auto' # 'auto', 'implicit_gemm', 'native'
10
+
11
def __from_env():
    """
    Override SPCONV_ALGO from the ``SPCONV_ALGO`` environment variable.

    Only the recognized values 'auto', 'implicit_gemm' and 'native' are
    accepted; an unset or unrecognized variable keeps the default.
    """
    import os

    global SPCONV_ALGO
    env_spconv_algo = os.environ.get('SPCONV_ALGO')
    # Membership test alone suffices: None is never in the whitelist,
    # so the previous `is not None` guard was redundant.
    if env_spconv_algo in ('auto', 'implicit_gemm', 'native'):
        SPCONV_ALGO = env_spconv_algo
    print(f"[SPARSE][CONV] spconv algo: {SPCONV_ALGO}")
19
+
20
+
21
+ __from_env()
22
+
23
+ if BACKEND == 'torchsparse':
24
+ from .conv_torchsparse import *
25
+ elif BACKEND == 'spconv':
26
+ from .conv_spconv import *
threeDFixer/modules/sparse/conv/conv_spconv.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from .. import SparseTensor
9
+ from .. import DEBUG
10
+ from . import SPCONV_ALGO
11
+
12
class SparseConv3d(nn.Module):
    """
    Sparse 3D convolution on a SparseTensor (spconv backend).

    Uses a submanifold convolution (SubMConv3d) when stride is 1 and no
    padding is given -- output sites equal input sites -- and a regular
    SparseConv3d otherwise.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, padding=None, bias=True, indice_key=None):
        super(SparseConv3d, self).__init__()
        # Lazy import, matching the rest of the sparse package.
        if 'spconv' not in globals():
            import spconv.pytorch as spconv
        # Map the package-level algorithm choice onto spconv's enum.
        algo = None
        if SPCONV_ALGO == 'native':
            algo = spconv.ConvAlgo.Native
        elif SPCONV_ALGO == 'implicit_gemm':
            algo = spconv.ConvAlgo.MaskImplicitGemm
        if stride == 1 and (padding is None):
            self.conv = spconv.SubMConv3d(in_channels, out_channels, kernel_size, dilation=dilation, bias=bias, indice_key=indice_key, algo=algo)
        else:
            self.conv = spconv.SparseConv3d(in_channels, out_channels, kernel_size, stride=stride, dilation=dilation, padding=padding, bias=bias, indice_key=indice_key, algo=algo)
        self.stride = tuple(stride) if isinstance(stride, (list, tuple)) else (stride, stride, stride)
        self.padding = padding

    def forward(self, x: SparseTensor) -> SparseTensor:
        # Fix: the re-sort branch below constructs spconv.SparseConvTensor, but
        # `spconv` was previously only bound *locally* inside __init__, so that
        # branch could raise NameError. Bind it here the same lazy way.
        if 'spconv' not in globals():
            import spconv.pytorch as spconv
        spatial_changed = any(s != 1 for s in self.stride) or (self.padding is not None)
        new_data = self.conv(x.data)
        new_shape = [x.shape[0], self.conv.out_channels]
        # A spatially-changing conv invalidates the cached per-batch layout.
        new_layout = None if spatial_changed else x.layout

        if spatial_changed and (x.shape[0] != 1):
            # spconv with non-1 stride breaks the batch-contiguity of the
            # output tensor; restore it by sorting rows on the batch column.
            fwd = new_data.indices[:, 0].argsort()
            bwd = torch.zeros_like(fwd).scatter_(0, fwd, torch.arange(fwd.shape[0], device=fwd.device))
            sorted_feats = new_data.features[fwd]
            sorted_coords = new_data.indices[fwd]
            unsorted_data = new_data
            new_data = spconv.SparseConvTensor(sorted_feats, sorted_coords, unsorted_data.spatial_shape, unsorted_data.batch_size)  # type: ignore

        out = SparseTensor(
            new_data, shape=torch.Size(new_shape), layout=new_layout,
            scale=tuple([s * stride for s, stride in zip(x._scale, self.stride)]),
            spatial_cache=x._spatial_cache,
        )

        if spatial_changed and (x.shape[0] != 1):
            # Cache the pre-sort data and inverse permutation so a matching
            # SparseInverseConv3d can restore spconv's original ordering.
            out.register_spatial_cache(f'conv_{self.stride}_unsorted_data', unsorted_data)
            out.register_spatial_cache(f'conv_{self.stride}_sort_bwd', bwd)

        return out
55
+
56
+
57
class SparseInverseConv3d(nn.Module):
    """
    Inverse (transposed) sparse 3D convolution on a SparseTensor (spconv
    backend). Undoes a strided SparseConv3d that used the same indice_key,
    restoring the row order the forward conv cached.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, bias=True, indice_key=None):
        super(SparseInverseConv3d, self).__init__()
        # Lazy import, matching the rest of the sparse package.
        if 'spconv' not in globals():
            import spconv.pytorch as spconv
        self.conv = spconv.SparseInverseConv3d(in_channels, out_channels, kernel_size, bias=bias, indice_key=indice_key)
        self.stride = tuple(stride) if isinstance(stride, (list, tuple)) else (stride, stride, stride)

    def forward(self, x: SparseTensor) -> SparseTensor:
        spatial_changed = any(s != 1 for s in self.stride)
        if spatial_changed:
            # Recover the original spconv order: the paired SparseConv3d sorted
            # its output by batch and cached the unsorted data plus the inverse
            # permutation under these keys (see SparseConv3d.forward).
            data = x.get_spatial_cache(f'conv_{self.stride}_unsorted_data')
            bwd = x.get_spatial_cache(f'conv_{self.stride}_sort_bwd')
            data = data.replace_feature(x.feats[bwd])
            if DEBUG:
                assert torch.equal(data.indices, x.coords[bwd]), 'Recover the original order failed'
        else:
            data = x.data

        new_data = self.conv(data)
        new_shape = [x.shape[0], self.conv.out_channels]
        # Upsampling invalidates the cached per-batch layout.
        new_layout = None if spatial_changed else x.layout
        out = SparseTensor(
            new_data, shape=torch.Size(new_shape), layout=new_layout,
            scale=tuple([s // stride for s, stride in zip(x._scale, self.stride)]),
            spatial_cache=x._spatial_cache,
        )
        return out
threeDFixer/modules/sparse/conv/conv_torchsparse.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from .. import SparseTensor
9
+
10
+
11
class SparseConv3d(nn.Module):
    """
    Sparse 3D convolution on a SparseTensor (torchsparse backend).

    Note: the `indice_key` parameter is accepted for signature parity with the
    spconv backend but is not used by torchsparse.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, bias=True, indice_key=None):
        super(SparseConv3d, self).__init__()
        # Lazy import, matching the rest of the sparse package.
        if 'torchsparse' not in globals():
            import torchsparse
        self.conv = torchsparse.nn.Conv3d(in_channels, out_channels, kernel_size, stride, 0, dilation, bias)

    def forward(self, x: SparseTensor) -> SparseTensor:
        out = self.conv(x.data)
        new_shape = [x.shape[0], self.conv.out_channels]
        # A strided conv changes voxel positions, so the cached per-batch
        # layout is only reusable when every stride component is 1.
        out = SparseTensor(out, shape=torch.Size(new_shape), layout=x.layout if all(s == 1 for s in self.conv.stride) else None)
        out._spatial_cache = x._spatial_cache
        # Track cumulative downsampling for spatial-cache lookups.
        out._scale = tuple([s * stride for s, stride in zip(x._scale, self.conv.stride)])
        return out
25
+
26
+
27
class SparseInverseConv3d(nn.Module):
    """
    Inverse (transposed) sparse 3D convolution on a SparseTensor
    (torchsparse backend). `indice_key` is accepted for signature parity
    with the spconv backend but is not used by torchsparse.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, bias=True, indice_key=None):
        super(SparseInverseConv3d, self).__init__()
        # Lazy import, matching the rest of the sparse package.
        if 'torchsparse' not in globals():
            import torchsparse
        self.conv = torchsparse.nn.Conv3d(in_channels, out_channels, kernel_size, stride, 0, dilation, bias, transposed=True)

    def forward(self, x: SparseTensor) -> SparseTensor:
        out = self.conv(x.data)
        new_shape = [x.shape[0], self.conv.out_channels]
        # Transposed convs with stride change voxel positions, so the cached
        # layout is only reusable when every stride component is 1.
        out = SparseTensor(out, shape=torch.Size(new_shape), layout=x.layout if all(s == 1 for s in self.conv.stride) else None)
        out._spatial_cache = x._spatial_cache
        # Upsampling divides the cumulative scale back down.
        out._scale = tuple([s // stride for s, stride in zip(x._scale, self.conv.stride)])
        return out
41
+
42
+
43
+
threeDFixer/modules/sparse/linear.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from . import SparseTensor
9
+
10
+ __all__ = [
11
+ 'SparseLinear'
12
+ ]
13
+
14
+
15
class SparseLinear(nn.Linear):
    """Linear layer applied to the features of a SparseTensor; coordinates
    and layout are preserved via ``replace``."""
    def __init__(self, in_features, out_features, bias=True):
        super(SparseLinear, self).__init__(in_features, out_features, bias)

    def forward(self, input: SparseTensor) -> SparseTensor:
        # Run the dense nn.Linear on the [N, C] feature rows and rewrap.
        return input.replace(super().forward(input.feats))
threeDFixer/modules/sparse/nonlinearity.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from . import SparseTensor
9
+
10
+ __all__ = [
11
+ 'SparseReLU',
12
+ 'SparseSiLU',
13
+ 'SparseGELU',
14
+ 'SparseActivation'
15
+ ]
16
+
17
+
18
class SparseReLU(nn.ReLU):
    """ReLU applied element-wise to the features of a SparseTensor."""

    def forward(self, input: SparseTensor) -> SparseTensor:
        activated = super().forward(input.feats)
        return input.replace(activated)
21
+
22
+
23
class SparseSiLU(nn.SiLU):
    """SiLU applied element-wise to the features of a SparseTensor."""

    def forward(self, input: SparseTensor) -> SparseTensor:
        activated = super().forward(input.feats)
        return input.replace(activated)
26
+
27
+
28
class SparseGELU(nn.GELU):
    """GELU applied element-wise to the features of a SparseTensor."""

    def forward(self, input: SparseTensor) -> SparseTensor:
        activated = super().forward(input.feats)
        return input.replace(activated)
31
+
32
+
33
class SparseActivation(nn.Module):
    """Adapter that lets an arbitrary dense activation act on SparseTensor features."""

    def __init__(self, activation: nn.Module):
        super().__init__()
        self.activation = activation

    def forward(self, input: SparseTensor) -> SparseTensor:
        activated = self.activation(input.feats)
        return input.replace(activated)
40
+
threeDFixer/modules/sparse/norm.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from . import SparseTensor
9
+ from . import DEBUG
10
+
11
+ __all__ = [
12
+ 'SparseGroupNorm',
13
+ 'SparseLayerNorm',
14
+ 'SparseGroupNorm32',
15
+ 'SparseLayerNorm32',
16
+ ]
17
+
18
+
19
class SparseGroupNorm(nn.GroupNorm):
    """
    GroupNorm over SparseTensor features, normalizing each batch item
    independently (GroupNorm statistics must not mix batch elements, so the
    variable-length rows of each item are normalized one item at a time).
    """
    def __init__(self, num_groups, num_channels, eps=1e-5, affine=True):
        super(SparseGroupNorm, self).__init__(num_groups, num_channels, eps, affine)

    def forward(self, input: SparseTensor) -> SparseTensor:
        nfeats = torch.zeros_like(input.feats)
        # Iterate over batch items; input.layout[k] slices item k's rows.
        for k in range(input.shape[0]):
            if DEBUG:
                assert (input.coords[input.layout[k], 0] == k).all(), f"SparseGroupNorm: batch index mismatch"
            bfeats = input.feats[input.layout[k]]
            # (L, C) -> (1, C, L): GroupNorm expects (N, C, *spatial).
            bfeats = bfeats.permute(1, 0).reshape(1, input.shape[1], -1)
            bfeats = super().forward(bfeats)
            # (1, C, L) -> (L, C): restore the per-voxel feature layout.
            bfeats = bfeats.reshape(input.shape[1], -1).permute(1, 0)
            nfeats[input.layout[k]] = bfeats
        return input.replace(nfeats)
34
+
35
+
36
class SparseLayerNorm(nn.LayerNorm):
    """
    LayerNorm over SparseTensor features, applied per batch item.

    NOTE(review): after the permute/reshape each item is (1, C, L) and
    nn.LayerNorm(normalized_shape) normalizes the *last* dimension(s) — for
    normalized_shape == C this only type-checks when L == C. Kept as-is
    (mirrors the upstream TRELLIS implementation); confirm intended usage
    before relying on this class directly.
    """
    def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
        super(SparseLayerNorm, self).__init__(normalized_shape, eps, elementwise_affine)

    def forward(self, input: SparseTensor) -> SparseTensor:
        nfeats = torch.zeros_like(input.feats)
        # Normalize each batch item's rows separately (layout slices item k).
        for k in range(input.shape[0]):
            bfeats = input.feats[input.layout[k]]
            # (L, C) -> (1, C, L) before handing to nn.LayerNorm.
            bfeats = bfeats.permute(1, 0).reshape(1, input.shape[1], -1)
            bfeats = super().forward(bfeats)
            # (1, C, L) -> (L, C): restore the per-voxel feature layout.
            bfeats = bfeats.reshape(input.shape[1], -1).permute(1, 0)
            nfeats[input.layout[k]] = bfeats
        return input.replace(nfeats)
49
+
50
+
51
class SparseGroupNorm32(SparseGroupNorm):
    """
    A GroupNorm layer that converts to float32 before the forward pass.
    """
    def forward(self, x: SparseTensor) -> SparseTensor:
        # Normalize in fp32 for numerical stability, then restore the dtype.
        normed = super().forward(x.float())
        return normed.type(x.dtype)
57
+
58
class SparseLayerNorm32(SparseLayerNorm):
    """
    A LayerNorm layer that converts to float32 before the forward pass.
    """
    def forward(self, x: SparseTensor) -> SparseTensor:
        # Normalize in fp32 for numerical stability, then restore the dtype.
        normed = super().forward(x.float())
        return normed.type(x.dtype)
threeDFixer/modules/sparse/spatial.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from typing import *
7
+ import torch
8
+ import torch.nn as nn
9
+ from . import SparseTensor
10
+
11
+ __all__ = [
12
+ 'SparseDownsample',
13
+ 'SparseUpsample',
14
+ 'SparseSubdivide'
15
+ ]
16
+
17
+
18
class SparseDownsample(nn.Module):
    """
    Downsample a sparse tensor by a factor of `factor`.
    Implemented as average pooling.

    Voxels that collapse to the same coarse coordinate are pooled; the fine
    coordinates/layout/index map are cached on the output so a paired
    SparseUpsample can invert the operation.
    """
    def __init__(self, factor: Union[int, Tuple[int, ...], List[int]]):
        super(SparseDownsample, self).__init__()
        self.factor = tuple(factor) if isinstance(factor, (list, tuple)) else factor

    def forward(self, input: SparseTensor) -> SparseTensor:
        DIM = input.coords.shape[-1] - 1
        factor = self.factor if isinstance(self.factor, tuple) else (self.factor,) * DIM
        assert DIM == len(factor), 'Input coordinates must have the same dimension as the downsample factor.'

        # Integer-divide spatial coords (coord[0] is the batch index).
        coord = list(input.coords.unbind(dim=-1))
        for i, f in enumerate(factor):
            coord[i+1] = coord[i+1] // f

        # Pack (batch, x, y, z) into a single scalar code so `unique` can
        # merge voxels that landed on the same coarse cell.
        MAX = [coord[i+1].max().item() + 1 for i in range(DIM)]
        OFFSET = torch.cumprod(torch.tensor(MAX[::-1]), 0).tolist()[::-1] + [1]
        code = sum([c * o for c, o in zip(coord, OFFSET)])
        code, idx = code.unique(return_inverse=True)

        # Average the features of merged voxels.
        # NOTE(review): the functional scatter_reduce defaults to
        # include_self=True, so the zero-initialized target participates in
        # the mean (sum / (n + 1) rather than sum / n). Kept as-is since it
        # mirrors upstream TRELLIS and trained weights depend on it — confirm
        # before "fixing".
        new_feats = torch.scatter_reduce(
            torch.zeros(code.shape[0], input.feats.shape[1], device=input.feats.device, dtype=input.feats.dtype),
            dim=0,
            index=idx.unsqueeze(1).expand(-1, input.feats.shape[1]),
            src=input.feats,
            reduce='mean'
        )
        # Unpack the scalar codes back into (batch, x, y, z) coordinates.
        new_coords = torch.stack(
            [code // OFFSET[0]] +
            [(code // OFFSET[i+1]) % MAX[i] for i in range(DIM)],
            dim=-1
        )
        out = SparseTensor(new_feats, new_coords, input.shape,)
        out._scale = tuple([s // f for s, f in zip(input._scale, factor)])
        out._spatial_cache = input._spatial_cache

        # Cache the fine-resolution structure so SparseUpsample can restore it.
        out.register_spatial_cache(f'upsample_{factor}_coords', input.coords)
        out.register_spatial_cache(f'upsample_{factor}_layout', input.layout)
        out.register_spatial_cache(f'upsample_{factor}_idx', idx)

        return out
62
+
63
+
64
class SparseUpsample(nn.Module):
    """
    Upsample a sparse tensor by a factor of `factor`.
    Implemented as nearest neighbor interpolation, using the coordinate /
    layout / index cache registered by a preceding SparseDownsample.
    """
    def __init__(self, factor: Union[int, Tuple[int, int, int], List[int]]):
        super(SparseUpsample, self).__init__()
        self.factor = tuple(factor) if isinstance(factor, (list, tuple)) else factor

    def forward(self, input: SparseTensor) -> SparseTensor:
        ndim = input.coords.shape[-1] - 1
        factor = self.factor if isinstance(self.factor, tuple) else (self.factor,) * ndim
        assert ndim == len(factor), 'Input coordinates must have the same dimension as the upsample factor.'

        # Fetch the fine-resolution structure cached by SparseDownsample.
        cached = [
            input.get_spatial_cache(f'upsample_{factor}_{key}')
            for key in ('coords', 'layout', 'idx')
        ]
        if any(item is None for item in cached):
            raise ValueError('Upsample cache not found. SparseUpsample must be paired with SparseDownsample.')
        new_coords, new_layout, idx = cached

        # Nearest-neighbor: every fine voxel copies its coarse parent's features.
        out = SparseTensor(input.feats[idx], new_coords, input.shape, new_layout)
        out._scale = tuple(s * f for s, f in zip(input._scale, factor))
        out._spatial_cache = input._spatial_cache
        return out
88
+
89
class SparseSubdivide(nn.Module):
    """
    Subdivide each active voxel of a sparse tensor into its 2^DIM children.

    Doubles the spatial resolution: every coordinate is multiplied by 2 and
    replicated at all 2^DIM corner offsets; each child inherits its parent's
    features unchanged.
    """
    def __init__(self):
        super(SparseSubdivide, self).__init__()

    def forward(self, input: SparseTensor) -> SparseTensor:
        DIM = input.coords.shape[-1] - 1
        # Enumerate the 2^DIM corner offsets of a unit cube, with a zero
        # prepended for the batch-index column.
        n_cube = torch.ones([2] * DIM, device=input.device, dtype=torch.int)
        n_coords = torch.nonzero(n_cube)
        n_coords = torch.cat([torch.zeros_like(n_coords[:, :1]), n_coords], dim=-1)
        factor = n_coords.shape[0]
        assert factor == 2 ** DIM
        # Scale up the spatial coordinates, then replicate every voxel at
        # each corner offset.
        new_coords = input.coords.clone()
        new_coords[:, 1:] *= 2
        new_coords = new_coords.unsqueeze(1) + n_coords.unsqueeze(0).to(new_coords.dtype)

        # Copy each parent's features to all of its children.
        new_feats = input.feats.unsqueeze(1).expand(input.feats.shape[0], factor, *input.feats.shape[1:])
        out = SparseTensor(new_feats.flatten(0, 1), new_coords.flatten(0, 1), input.shape)
        # Bug fix: `input._scale * 2` on a tuple *repeats* the tuple
        # ((1, 1, 1) -> (1, 1, 1, 1, 1, 1)) instead of doubling each
        # component; scale per-component, matching the bookkeeping in
        # SparseDownsample / SparseUpsample.
        out._scale = tuple(s * 2 for s in input._scale)
        out._spatial_cache = input._spatial_cache
        return out
115
+
threeDFixer/modules/sparse/transformer/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from .blocks import *
7
+ from .modulated import *
threeDFixer/modules/sparse/transformer/blocks.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from typing import *
7
+ import torch
8
+ import torch.nn as nn
9
+ from ..basic import SparseTensor
10
+ from ..linear import SparseLinear
11
+ from ..nonlinearity import SparseGELU
12
+ from ..attention import SparseMultiHeadAttention, SerializeMode
13
+ from ...norm import LayerNorm32
14
+
15
+
16
class SparseFeedForwardNet(nn.Module):
    """Two-layer MLP over SparseTensor features: expand by `mlp_ratio`, tanh-GELU, project back."""

    def __init__(self, channels: int, mlp_ratio: float = 4.0):
        super().__init__()
        hidden = int(channels * mlp_ratio)
        self.mlp = nn.Sequential(
            SparseLinear(channels, hidden),
            SparseGELU(approximate="tanh"),
            SparseLinear(hidden, channels),
        )

    def forward(self, x: SparseTensor) -> SparseTensor:
        return self.mlp(x)
+
28
+
29
class SparseTransformerBlock(nn.Module):
    """
    Sparse Transformer block (MSA + FFN).

    Pre-norm residual layout: x -> LN -> self-attn -> +x -> LN -> MLP -> +x.
    The attention neighbourhood (full / windowed / serialized variants) is
    selected by `attn_mode` and its companion options, which are forwarded
    verbatim to SparseMultiHeadAttention.
    """
    def __init__(
        self,
        channels: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
        window_size: Optional[int] = None,
        shift_sequence: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        serialize_mode: Optional[SerializeMode] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qkv_bias: bool = True,
        ln_affine: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.norm1 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.norm2 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.attn = SparseMultiHeadAttention(
            channels,
            num_heads=num_heads,
            attn_mode=attn_mode,
            window_size=window_size,
            shift_sequence=shift_sequence,
            shift_window=shift_window,
            serialize_mode=serialize_mode,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        self.mlp = SparseFeedForwardNet(
            channels,
            mlp_ratio=mlp_ratio,
        )

    def _forward(self, x: SparseTensor) -> SparseTensor:
        # Pre-norm self-attention with residual connection.
        h = x.replace(self.norm1(x.feats))
        h = self.attn(h)
        x = x + h
        # Pre-norm feed-forward with residual connection.
        h = x.replace(self.norm2(x.feats))
        h = self.mlp(h)
        x = x + h
        return x

    def forward(self, x: SparseTensor) -> SparseTensor:
        # Optionally trade compute for memory via activation checkpointing.
        if self.use_checkpoint:
            return torch.utils.checkpoint.checkpoint(self._forward, x, use_reentrant=False)
        else:
            return self._forward(x)
+
85
+
86
class SparseTransformerCrossBlock(nn.Module):
    """
    Sparse Transformer cross-attention block (MSA + MCA + FFN).

    Pre-norm residual layout: self-attention over the sparse tokens, then
    cross-attention against a dense `context` sequence, then an MLP — each
    with its own LayerNorm and residual connection.
    """
    def __init__(
        self,
        channels: int,
        ctx_channels: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
        window_size: Optional[int] = None,
        shift_sequence: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        serialize_mode: Optional[SerializeMode] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
        qkv_bias: bool = True,
        ln_affine: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.norm1 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.norm2 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.norm3 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.self_attn = SparseMultiHeadAttention(
            channels,
            num_heads=num_heads,
            type="self",
            attn_mode=attn_mode,
            window_size=window_size,
            shift_sequence=shift_sequence,
            shift_window=shift_window,
            serialize_mode=serialize_mode,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        # Cross-attention always attends over the full context sequence.
        self.cross_attn = SparseMultiHeadAttention(
            channels,
            ctx_channels=ctx_channels,
            num_heads=num_heads,
            type="cross",
            attn_mode="full",
            qkv_bias=qkv_bias,
            qk_rms_norm=qk_rms_norm_cross,
        )
        self.mlp = SparseFeedForwardNet(
            channels,
            mlp_ratio=mlp_ratio,
        )

    # Bug fix: the signature previously declared a stray `mod` parameter
    # (leftover from the modulated variant), while `forward` and the
    # checkpoint call pass only (x, context) — causing a TypeError on the
    # first call. It matches the upstream TRELLIS signature now.
    def _forward(self, x: SparseTensor, context: torch.Tensor):
        # Pre-norm self-attention with residual connection.
        h = x.replace(self.norm1(x.feats))
        h = self.self_attn(h)
        x = x + h
        # Pre-norm cross-attention against the conditioning context.
        h = x.replace(self.norm2(x.feats))
        h = self.cross_attn(h, context)
        x = x + h
        # Pre-norm feed-forward with residual connection.
        h = x.replace(self.norm3(x.feats))
        h = self.mlp(h)
        x = x + h
        return x

    def forward(self, x: SparseTensor, context: torch.Tensor):
        # Optionally trade compute for memory via activation checkpointing.
        if self.use_checkpoint:
            return torch.utils.checkpoint.checkpoint(self._forward, x, context, use_reentrant=False)
        else:
            return self._forward(x, context)
threeDFixer/modules/sparse/transformer/modulated.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is modified from TRELLIS:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+ # Modifications Copyright (c) 2026 Ze-Xin Yin, Robot labs of Horizon Robotics, and D-Robotics.
6
+
7
+ from typing import *
8
+ import torch
9
+ import torch.nn as nn
10
+ from ..basic import SparseTensor
11
+ from ..attention import SparseMultiHeadAttention, SerializeMode
12
+ from ...norm import LayerNorm32
13
+ from .blocks import SparseFeedForwardNet
14
+
15
+
16
class ModulatedSparseTransformerBlock(nn.Module):
    """
    Sparse Transformer block (MSA + FFN) with adaptive layer norm conditioning.

    DiT-style adaLN: a conditioning vector `mod` produces six per-channel
    vectors (shift/scale/gate for the attention branch and for the MLP
    branch). With `share_mod=True` the six chunks are computed once outside
    the block and passed in pre-chunked; otherwise each block owns its own
    `adaLN_modulation` head.
    """
    def __init__(
        self,
        channels: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
        window_size: Optional[int] = None,
        shift_sequence: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        serialize_mode: Optional[SerializeMode] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qkv_bias: bool = True,
        share_mod: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.share_mod = share_mod
        # Affine-free norms: scale/shift come from the modulation instead.
        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.norm2 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.attn = SparseMultiHeadAttention(
            channels,
            num_heads=num_heads,
            attn_mode=attn_mode,
            window_size=window_size,
            shift_sequence=shift_sequence,
            shift_window=shift_window,
            serialize_mode=serialize_mode,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        self.mlp = SparseFeedForwardNet(
            channels,
            mlp_ratio=mlp_ratio,
        )
        if not share_mod:
            self.adaLN_modulation = nn.Sequential(
                nn.SiLU(),
                nn.Linear(channels, 6 * channels, bias=True)
            )

    def _forward(self, x: SparseTensor, mod: torch.Tensor) -> SparseTensor:
        # Split the modulation into (shift, scale, gate) for each branch.
        if self.share_mod:
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mod.chunk(6, dim=1)
        else:
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(mod).chunk(6, dim=1)
        # Attention branch: modulated pre-norm, then gated residual add.
        h = x.replace(self.norm1(x.feats))
        h = h * (1 + scale_msa) + shift_msa
        h = self.attn(h)
        h = h * gate_msa
        x = x + h
        # MLP branch: same modulate -> transform -> gate -> residual pattern.
        h = x.replace(self.norm2(x.feats))
        h = h * (1 + scale_mlp) + shift_mlp
        h = self.mlp(h)
        h = h * gate_mlp
        x = x + h
        return x

    def forward(self, x: SparseTensor, mod: torch.Tensor) -> SparseTensor:
        # Optionally trade compute for memory via activation checkpointing.
        if self.use_checkpoint:
            return torch.utils.checkpoint.checkpoint(self._forward, x, mod, use_reentrant=False)
        else:
            return self._forward(x, mod)
85
+
86
+
87
class ModulatedSparseTransformerCrossBlock(nn.Module):
    """
    Sparse Transformer cross-attention block (MSA + MCA + FFN) with adaptive
    layer norm conditioning.

    The self-attention and MLP branches are modulated (DiT-style
    shift/scale/gate from `mod`); the cross-attention branch is a plain
    pre-norm residual — note norm2 keeps an affine LayerNorm since it gets
    no modulation.
    """
    def __init__(
        self,
        channels: int,
        ctx_channels: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
        window_size: Optional[int] = None,
        shift_sequence: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        serialize_mode: Optional[SerializeMode] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
        qkv_bias: bool = True,
        share_mod: bool = False,

    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.share_mod = share_mod
        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        # Affine norm for the unmodulated cross-attention branch.
        self.norm2 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
        self.norm3 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.self_attn = SparseMultiHeadAttention(
            channels,
            num_heads=num_heads,
            type="self",
            attn_mode=attn_mode,
            window_size=window_size,
            shift_sequence=shift_sequence,
            shift_window=shift_window,
            serialize_mode=serialize_mode,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        # Cross-attention always attends over the full context sequence.
        self.cross_attn = SparseMultiHeadAttention(
            channels,
            ctx_channels=ctx_channels,
            num_heads=num_heads,
            type="cross",
            attn_mode="full",
            qkv_bias=qkv_bias,
            qk_rms_norm=qk_rms_norm_cross,
        )
        self.mlp = SparseFeedForwardNet(
            channels,
            mlp_ratio=mlp_ratio,
        )
        if not share_mod:
            self.adaLN_modulation = nn.Sequential(
                nn.SiLU(),
                nn.Linear(channels, 6 * channels, bias=True)
            )

    def _forward(self, x: SparseTensor, mod: torch.Tensor, context: torch.Tensor) -> SparseTensor:
        # Split the modulation into (shift, scale, gate) for MSA and MLP.
        if self.share_mod:
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mod.chunk(6, dim=1)
        else:
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(mod).chunk(6, dim=1)
        # Modulated self-attention branch with gated residual.
        h = x.replace(self.norm1(x.feats))
        h = h * (1 + scale_msa) + shift_msa
        h = self.self_attn(h)
        h = h * gate_msa
        x = x + h
        # Plain pre-norm cross-attention against the conditioning context.
        h = x.replace(self.norm2(x.feats))
        h = self.cross_attn(h, context)
        x = x + h
        # Modulated MLP branch with gated residual.
        h = x.replace(self.norm3(x.feats))
        h = h * (1 + scale_mlp) + shift_mlp
        h = self.mlp(h)
        h = h * gate_mlp
        x = x + h
        return x

    def forward(self, x: SparseTensor, mod: torch.Tensor, context: torch.Tensor) -> SparseTensor:
        # Optionally trade compute for memory via activation checkpointing.
        if self.use_checkpoint:
            return torch.utils.checkpoint.checkpoint(self._forward, x, mod, context, use_reentrant=False)
        else:
            return self._forward(x, mod, context)
173
+
174
+
175
class ModulatedSceneSparseTransformerCrossBlock(nn.Module):
    """
    Scene-level variant of the modulated sparse cross block.

    Extends ModulatedSparseTransformerCrossBlock with two extra branches:
    a second modulated self-attention (`self_attn_vis_ratio`, conditioned on
    a separate `vis_mod` vector — presumably a visibility-ratio embedding;
    confirm against the caller) and a second plain cross-attention
    (`cross_attn_extra`) over an additional context sequence.
    Branch order: MSA -> MCA -> vis-MSA -> extra-MCA -> MLP.
    """
    def __init__(
        self,
        channels: int,
        ctx_channels: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
        window_size: Optional[int] = None,
        shift_sequence: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        serialize_mode: Optional[SerializeMode] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
        qkv_bias: bool = True,
        share_mod: bool = False,

    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.share_mod = share_mod
        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        # Affine norms (norm2 / norm5) feed the unmodulated cross branches.
        self.norm2 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
        self.norm3 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.norm4 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.norm5 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
        self.self_attn = SparseMultiHeadAttention(
            channels,
            num_heads=num_heads,
            type="self",
            attn_mode=attn_mode,
            window_size=window_size,
            shift_sequence=shift_sequence,
            shift_window=shift_window,
            serialize_mode=serialize_mode,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        self.cross_attn = SparseMultiHeadAttention(
            channels,
            ctx_channels=ctx_channels,
            num_heads=num_heads,
            type="cross",
            attn_mode="full",
            qkv_bias=qkv_bias,
            qk_rms_norm=qk_rms_norm_cross,
        )
        # Second self-attention, modulated by the visibility conditioning.
        self.self_attn_vis_ratio = SparseMultiHeadAttention(
            channels,
            num_heads=num_heads,
            type="self",
            attn_mode=attn_mode,
            window_size=window_size,
            shift_sequence=shift_sequence,
            shift_window=shift_window,
            serialize_mode=serialize_mode,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        # Second cross-attention over the extra context sequence.
        self.cross_attn_extra = SparseMultiHeadAttention(
            channels,
            ctx_channels=ctx_channels,
            num_heads=num_heads,
            type="cross",
            attn_mode="full",
            qkv_bias=qkv_bias,
            qk_rms_norm=qk_rms_norm_cross,
        )
        self.mlp = SparseFeedForwardNet(
            channels,
            mlp_ratio=mlp_ratio,
        )
        if not share_mod:
            self.adaLN_modulation = nn.Sequential(
                nn.SiLU(),
                nn.Linear(channels, 6 * channels, bias=True)
            )
            # NOTE(review): placed inside the share_mod guard to mirror
            # adaLN_modulation and the share_mod branch of _forward (the diff
            # rendering lost the original indentation) — confirm against the
            # released checkpoints' state_dict.
            self.adaLN_modulation_vis = nn.Sequential(
                nn.SiLU(),
                nn.Linear(channels, 3 * channels, bias=True)
            )

    def _forward(self, x: SparseTensor, mod: torch.Tensor, vis_mod: torch.Tensor, context: torch.Tensor, context_extra: torch.Tensor) -> SparseTensor:
        # Split the main modulation (6 chunks) and visibility modulation (3).
        if self.share_mod:
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mod.chunk(6, dim=1)
            vis_shift_msa, vis_scale_msa, vis_gate_msa = vis_mod.chunk(3, dim=1)
        else:
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(mod).chunk(6, dim=1)
            vis_shift_msa, vis_scale_msa, vis_gate_msa = self.adaLN_modulation_vis(vis_mod).chunk(3, dim=1)

        # Modulated self-attention branch with gated residual.
        h = x.replace(self.norm1(x.feats))
        h = h * (1 + scale_msa) + shift_msa
        h = self.self_attn(h)
        h = h * gate_msa
        x = x + h
        # Plain pre-norm cross-attention against the main context.
        h = x.replace(self.norm2(x.feats))
        h = self.cross_attn(h, context)
        x = x + h

        ####### self attn to integrate vis ratio
        h = x.replace(self.norm4(x.feats))
        h = h * (1 + vis_scale_msa) + vis_shift_msa
        h = self.self_attn_vis_ratio(h)
        h = h * vis_gate_msa
        x = x + h
        # cross attn for integrate extra info
        h = x.replace(self.norm5(x.feats))
        h = self.cross_attn_extra(h, context_extra)
        x = x + h
        #######

        # Modulated MLP branch with gated residual.
        h = x.replace(self.norm3(x.feats))
        h = h * (1 + scale_mlp) + shift_mlp
        h = self.mlp(h)
        h = h * gate_mlp
        x = x + h
        return x

    def forward(self, x: SparseTensor, mod: torch.Tensor, vis_mod: torch.Tensor, context: torch.Tensor, context_extra: torch.Tensor) -> SparseTensor:
        # Optionally trade compute for memory via activation checkpointing.
        if self.use_checkpoint:
            return torch.utils.checkpoint.checkpoint(self._forward, x, mod, vis_mod, context, context_extra, use_reentrant=False)
        else:
            return self._forward(x, mod, vis_mod, context, context_extra)
threeDFixer/modules/spatial.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ import torch
7
+
8
+
9
def pixel_shuffle_3d(x: torch.Tensor, scale_factor: int) -> torch.Tensor:
    """
    3D pixel shuffle: move a factor of `scale_factor**3` from the channel
    dimension onto the three spatial dimensions,
    (B, C, H, W, D) -> (B, C // s^3, H*s, W*s, D*s).
    """
    batch, channels, height, width, depth = x.shape
    s = scale_factor
    out_channels = channels // s ** 3
    # Split channels into the output channels plus one shuffle axis per
    # spatial dimension, then interleave each spatial axis with its shuffle axis.
    x = x.reshape(batch, out_channels, s, s, s, height, width, depth)
    x = x.permute(0, 1, 5, 2, 6, 3, 7, 4)
    return x.reshape(batch, out_channels, height * s, width * s, depth * s)
19
+
20
+
21
def patchify(x: torch.Tensor, patch_size: int):
    """
    Patchify a tensor: fold `patch_size`-sized blocks of every spatial axis
    into the channel dimension.

    Args:
        x (torch.Tensor): (N, C, *spatial) tensor
        patch_size (int): Patch size

    Returns:
        (N, C * patch_size**DIM, *spatial // patch_size) tensor.
    """
    DIM = x.dim() - 2
    for d in range(2, DIM + 2):
        assert x.shape[d] % patch_size == 0, f"Dimension {d} of input tensor must be divisible by patch size, got {x.shape[d]} and {patch_size}"

    # Split every spatial axis into (size // p, p) pairs.
    split_shape = []
    for d in range(2, DIM + 2):
        split_shape.extend([x.shape[d] // patch_size, patch_size])
    x = x.reshape(*x.shape[:2], *split_shape)
    # Bring the per-patch axes next to the channels; coarse axes stay last.
    patch_axes = [2 * i + 3 for i in range(DIM)]
    coarse_axes = [2 * i + 2 for i in range(DIM)]
    x = x.permute(0, 1, *(patch_axes + coarse_axes))
    return x.reshape(x.shape[0], x.shape[1] * (patch_size ** DIM), *(x.shape[-DIM:]))
37
+
38
+
39
def unpatchify(x: torch.Tensor, patch_size: int):
    """
    Unpatchify a tensor: inverse of `patchify`, unfolding
    `patch_size**DIM` channel groups back onto the spatial axes.

    Args:
        x (torch.Tensor): (N, C, *spatial) tensor
        patch_size (int): Patch size
    """
    DIM = x.dim() - 2
    assert x.shape[1] % (patch_size ** DIM) == 0, f"Second dimension of input tensor must be divisible by patch size to unpatchify, got {x.shape[1]} and {patch_size ** DIM}"

    # Split channels into the base channels plus one patch axis per spatial dim.
    x = x.reshape(x.shape[0], x.shape[1] // (patch_size ** DIM), *([patch_size] * DIM), *(x.shape[-DIM:]))
    # Interleave (coarse, patch) axis pairs so each spatial dim reassembles.
    interleaved = []
    for i in range(DIM):
        interleaved.extend([2 + DIM + i, 2 + i])
    x = x.permute(0, 1, *interleaved)
    out_sizes = [x.shape[2 + 2 * i] * patch_size for i in range(DIM)]
    return x.reshape(x.shape[0], x.shape[1], *out_sizes)
threeDFixer/modules/transformer/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .blocks import *
2
+ from .modulated import *
threeDFixer/modules/transformer/blocks.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ from typing import *
7
+ import torch
8
+ import torch.nn as nn
9
+ from ..attention import MultiHeadAttention
10
+ from ..norm import LayerNorm32
11
+
12
+
13
class AbsolutePositionEmbedder(nn.Module):
    """
    Embeds spatial positions into vector representations using fixed
    sinusoidal frequencies (one sin/cos bank per input coordinate).
    """
    def __init__(self, channels: int, in_channels: int = 3):
        super().__init__()
        self.channels = channels
        self.in_channels = in_channels
        # Number of frequencies per coordinate (each yields a sin and a cos).
        self.freq_dim = channels // in_channels // 2
        bands = torch.arange(self.freq_dim, dtype=torch.float32) / self.freq_dim
        self.freqs = 1.0 / (10000 ** bands)

    def _sin_cos_embedding(self, x: torch.Tensor) -> torch.Tensor:
        """
        Create sinusoidal position embeddings.

        Args:
            x: a 1-D Tensor of N indices

        Returns:
            an (N, D) Tensor of positional embeddings.
        """
        # Lazily move the frequency table onto the input's device.
        self.freqs = self.freqs.to(x.device)
        phases = torch.outer(x, self.freqs)
        return torch.cat([torch.sin(phases), torch.cos(phases)], dim=-1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): (N, D) tensor of spatial positions
        """
        N, D = x.shape
        assert D == self.in_channels, "Input dimension must match number of input channels"
        embed = self._sin_cos_embedding(x.reshape(-1)).reshape(N, -1)
        # Zero-pad when channels is not an exact multiple of
        # 2 * in_channels * freq_dim.
        if embed.shape[1] < self.channels:
            padding = torch.zeros(N, self.channels - embed.shape[1], device=embed.device)
            embed = torch.cat([embed, padding], dim=-1)
        return embed
+
53
+
54
+ class FeedForwardNet(nn.Module):
55
+ def __init__(self, channels: int, mlp_ratio: float = 4.0):
56
+ super().__init__()
57
+ self.mlp = nn.Sequential(
58
+ nn.Linear(channels, int(channels * mlp_ratio)),
59
+ nn.GELU(approximate="tanh"),
60
+ nn.Linear(int(channels * mlp_ratio), channels),
61
+ )
62
+
63
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
64
+ return self.mlp(x)
65
+
66
+
67
class TransformerBlock(nn.Module):
    """
    Transformer block (MSA + FFN).

    Pre-norm residual layout over dense tensors:
    x -> LN -> self-attn -> +x -> LN -> MLP -> +x.
    """
    def __init__(
        self,
        channels: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "windowed"] = "full",
        window_size: Optional[int] = None,
        shift_window: Optional[int] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qkv_bias: bool = True,
        ln_affine: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.norm1 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.norm2 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.attn = MultiHeadAttention(
            channels,
            num_heads=num_heads,
            attn_mode=attn_mode,
            window_size=window_size,
            shift_window=shift_window,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        self.mlp = FeedForwardNet(
            channels,
            mlp_ratio=mlp_ratio,
        )

    def _forward(self, x: torch.Tensor) -> torch.Tensor:
        # Pre-norm self-attention with residual connection.
        h = self.norm1(x)
        h = self.attn(h)
        x = x + h
        # Pre-norm feed-forward with residual connection.
        h = self.norm2(x)
        h = self.mlp(h)
        x = x + h
        return x

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Optionally trade compute for memory via activation checkpointing.
        if self.use_checkpoint:
            return torch.utils.checkpoint.checkpoint(self._forward, x, use_reentrant=False)
        else:
            return self._forward(x)
+ return self._forward(x)
118
+
119
+
120
class TransformerCrossBlock(nn.Module):
    """
    Transformer cross-attention block (MSA + MCA + FFN).

    Pre-norm residual layout: self-attention over the input sequence, then
    cross-attention against a `context` sequence, then an MLP — each with
    its own LayerNorm and residual connection.
    """
    def __init__(
        self,
        channels: int,
        ctx_channels: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "windowed"] = "full",
        window_size: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
        qkv_bias: bool = True,
        ln_affine: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.norm1 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.norm2 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.norm3 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.self_attn = MultiHeadAttention(
            channels,
            num_heads=num_heads,
            type="self",
            attn_mode=attn_mode,
            window_size=window_size,
            shift_window=shift_window,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        # Cross-attention always attends over the full context sequence.
        self.cross_attn = MultiHeadAttention(
            channels,
            ctx_channels=ctx_channels,
            num_heads=num_heads,
            type="cross",
            attn_mode="full",
            qkv_bias=qkv_bias,
            qk_rms_norm=qk_rms_norm_cross,
        )
        self.mlp = FeedForwardNet(
            channels,
            mlp_ratio=mlp_ratio,
        )

    def _forward(self, x: torch.Tensor, context: torch.Tensor):
        # Pre-norm self-attention with residual connection.
        h = self.norm1(x)
        h = self.self_attn(h)
        x = x + h
        # Pre-norm cross-attention against the conditioning context.
        h = self.norm2(x)
        h = self.cross_attn(h, context)
        x = x + h
        # Pre-norm feed-forward with residual connection.
        h = self.norm3(x)
        h = self.mlp(h)
        x = x + h
        return x

    def forward(self, x: torch.Tensor, context: torch.Tensor):
        # Optionally trade compute for memory via activation checkpointing.
        if self.use_checkpoint:
            return torch.utils.checkpoint.checkpoint(self._forward, x, context, use_reentrant=False)
        else:
            return self._forward(x, context)
187
+
threeDFixer/modules/transformer/modulated.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is modified from TRELLIS:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+ # Modifications Copyright (c) 2026 Ze-Xin Yin, Robot labs of Horizon Robotics, and D-Robotics.
6
+
7
+ from typing import *
8
+ import torch
9
+ import torch.nn as nn
10
+ from ..attention import MultiHeadAttention
11
+ from ..norm import LayerNorm32
12
+ from .blocks import FeedForwardNet
13
+
14
+
15
class ModulatedTransformerBlock(nn.Module):
    """
    Transformer block (MSA + FFN) with adaptive layer norm conditioning.

    When ``share_mod`` is True the six adaLN terms are expected to be supplied
    pre-computed in ``mod``; otherwise a per-block SiLU+Linear head produces
    them from ``mod``.
    """
    def __init__(
        self,
        channels: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "windowed"] = "full",
        window_size: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qkv_bias: bool = True,
        share_mod: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.share_mod = share_mod
        # Affine-free norms: scale/shift come from the adaLN modulation instead.
        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.norm2 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.attn = MultiHeadAttention(
            channels,
            num_heads=num_heads,
            attn_mode=attn_mode,
            window_size=window_size,
            shift_window=shift_window,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        self.mlp = FeedForwardNet(channels, mlp_ratio=mlp_ratio)
        if not share_mod:
            # Per-block head emitting (shift, scale, gate) for both MSA and MLP.
            self.adaLN_modulation = nn.Sequential(
                nn.SiLU(),
                nn.Linear(channels, 6 * channels, bias=True)
            )

    def _forward(self, x: torch.Tensor, mod: torch.Tensor) -> torch.Tensor:
        # Resolve the six modulation tensors; mod is pre-chunked when shared.
        mod_params = mod if self.share_mod else self.adaLN_modulation(mod)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mod_params.chunk(6, dim=1)
        # Attention branch: modulated pre-norm, gated residual.
        h = self.norm1(x) * (1 + scale_msa.unsqueeze(1)) + shift_msa.unsqueeze(1)
        x = x + self.attn(h) * gate_msa.unsqueeze(1)
        # Feed-forward branch with the same modulation pattern.
        h = self.norm2(x) * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
        x = x + self.mlp(h) * gate_mlp.unsqueeze(1)
        return x

    def forward(self, x: torch.Tensor, mod: torch.Tensor) -> torch.Tensor:
        if not self.use_checkpoint:
            return self._forward(x, mod)
        # Recompute activations during backward to save memory.
        return torch.utils.checkpoint.checkpoint(self._forward, x, mod, use_reentrant=False)
80
+
81
+
82
class ModulatedTransformerCrossBlock(nn.Module):
    """
    Transformer cross-attention block (MSA + MCA + FFN) with adaptive layer norm conditioning.

    Self-attention and the MLP are adaLN-modulated; the cross-attention
    sub-layer is left un-modulated and uses an affine LayerNorm instead.
    """
    def __init__(
        self,
        channels: int,
        ctx_channels: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "windowed"] = "full",
        window_size: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
        qkv_bias: bool = True,
        share_mod: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.share_mod = share_mod
        # norm1/norm3 are affine-free (adaLN supplies scale/shift); norm2 feeds
        # the un-modulated cross-attention and keeps its own affine parameters.
        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.norm2 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
        self.norm3 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.self_attn = MultiHeadAttention(
            channels,
            num_heads=num_heads,
            type="self",
            attn_mode=attn_mode,
            window_size=window_size,
            shift_window=shift_window,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        # Cross-attention always runs in full attention mode.
        self.cross_attn = MultiHeadAttention(
            channels,
            ctx_channels=ctx_channels,
            num_heads=num_heads,
            type="cross",
            attn_mode="full",
            qkv_bias=qkv_bias,
            qk_rms_norm=qk_rms_norm_cross,
        )
        self.mlp = FeedForwardNet(channels, mlp_ratio=mlp_ratio)
        if not share_mod:
            # Per-block head emitting (shift, scale, gate) for both MSA and MLP.
            self.adaLN_modulation = nn.Sequential(
                nn.SiLU(),
                nn.Linear(channels, 6 * channels, bias=True)
            )

    def _forward(self, x: torch.Tensor, mod: torch.Tensor, context: torch.Tensor):
        # Resolve the six modulation tensors; mod is pre-chunked when shared.
        mod_params = mod if self.share_mod else self.adaLN_modulation(mod)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mod_params.chunk(6, dim=1)
        # Modulated self-attention with gated residual.
        h = self.norm1(x) * (1 + scale_msa.unsqueeze(1)) + shift_msa.unsqueeze(1)
        x = x + self.self_attn(h) * gate_msa.unsqueeze(1)
        # Plain (un-modulated) cross-attention residual.
        x = x + self.cross_attn(self.norm2(x), context)
        # Modulated feed-forward with gated residual.
        h = self.norm3(x) * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
        x = x + self.mlp(h) * gate_mlp.unsqueeze(1)
        return x

    def forward(self, x: torch.Tensor, mod: torch.Tensor, context: torch.Tensor):
        if not self.use_checkpoint:
            return self._forward(x, mod, context)
        # Recompute activations during backward to save memory.
        return torch.utils.checkpoint.checkpoint(self._forward, x, mod, context, use_reentrant=False)
163
+
164
+
165
class SceneModulatedTransformerCrossBlock(nn.Module):
    """
    Transformer cross-attention block (MSA + MCA + FFN) with adaptive layer norm conditioning.

    Compared to ModulatedTransformerCrossBlock, two extra sub-layers are
    inserted between the main cross-attention and the MLP:
      * a second self-attention (``self_attn_dpt_ratio``) driven by its own
        3-term adaLN modulation ``dpt_mod``, and
      * a second cross-attention (``cross_attn_extra``) over ``context_extra``.
    ("dpt" presumably refers to a depth-ratio conditioning signal — confirm
    with the calling model.)

    When ``share_mod`` is True, ``mod`` and ``dpt_mod`` are expected to already
    contain the chunked modulation terms (6*channels and 3*channels wide,
    respectively); otherwise per-block SiLU+Linear heads produce them.
    """
    def __init__(
        self,
        channels: int,
        ctx_channels: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "windowed"] = "full",
        window_size: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
        qkv_bias: bool = True,
        share_mod: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.share_mod = share_mod
        # norm1/norm3/norm4 are affine-free (adaLN supplies scale/shift);
        # norm2/norm5 feed the two un-modulated cross-attentions and keep
        # their own affine parameters.
        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.norm2 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
        self.norm3 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.norm4 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.norm5 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
        self.self_attn = MultiHeadAttention(
            channels,
            num_heads=num_heads,
            type="self",
            attn_mode=attn_mode,
            window_size=window_size,
            shift_window=shift_window,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        # Main cross-attention; always full attention mode.
        self.cross_attn = MultiHeadAttention(
            channels,
            ctx_channels=ctx_channels,
            num_heads=num_heads,
            type="cross",
            attn_mode="full",
            qkv_bias=qkv_bias,
            qk_rms_norm=qk_rms_norm_cross,
        )
        # Second self-attention, modulated by dpt_mod in _forward.
        self.self_attn_dpt_ratio = MultiHeadAttention(
            channels,
            num_heads=num_heads,
            type="self",
            attn_mode=attn_mode,
            window_size=window_size,
            shift_window=shift_window,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        # Second cross-attention over context_extra; always full attention mode.
        self.cross_attn_extra = MultiHeadAttention(
            channels,
            ctx_channels=ctx_channels,
            num_heads=num_heads,
            type="cross",
            attn_mode="full",
            qkv_bias=qkv_bias,
            qk_rms_norm=qk_rms_norm_cross,
        )
        self.mlp = FeedForwardNet(
            channels,
            mlp_ratio=mlp_ratio,
        )
        if not share_mod:
            # Main adaLN head: (shift, scale, gate) for both MSA and MLP.
            self.adaLN_modulation = nn.Sequential(
                nn.SiLU(),
                nn.Linear(channels, 6 * channels, bias=True)
            )
            # Separate adaLN head for the dpt self-attention sub-layer.
            self.adaLN_modulation_dpt = nn.Sequential(
                nn.SiLU(),
                nn.Linear(channels, 3 * channels, bias=True)
            )

    def _forward(self, x: torch.Tensor, mod: torch.Tensor, dpt_mod: torch.Tensor, context: torch.Tensor, context_extra: torch.Tensor):
        # Resolve the modulation terms; mod/dpt_mod are pre-chunked when shared.
        if self.share_mod:
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mod.chunk(6, dim=1)
            dpt_shift_msa, dpt_scale_msa, dpt_gate_msa = dpt_mod.chunk(3, dim=1)
        else:
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(mod).chunk(6, dim=1)
            dpt_shift_msa, dpt_scale_msa, dpt_gate_msa = self.adaLN_modulation_dpt(dpt_mod).chunk(3, dim=1)


        # Modulated self-attention with gated residual.
        h = self.norm1(x)
        h = h * (1 + scale_msa.unsqueeze(1)) + shift_msa.unsqueeze(1)
        h = self.self_attn(h)
        h = h * gate_msa.unsqueeze(1)
        x = x + h
        # Un-modulated main cross-attention residual.
        h = self.norm2(x)
        h = self.cross_attn(h, context)
        x = x + h

        ####### self attn to integrate dpt ratio (modulated by dpt_mod)
        h = self.norm4(x)
        h = h * (1 + dpt_scale_msa.unsqueeze(1)) + dpt_shift_msa.unsqueeze(1)
        h = self.self_attn_dpt_ratio(h)
        h = h * dpt_gate_msa.unsqueeze(1)
        x = x + h
        # cross attn to integrate the extra context (un-modulated)
        h = self.norm5(x)
        h = self.cross_attn_extra(h, context_extra)
        x = x + h
        #######

        # Modulated feed-forward with gated residual.
        h = self.norm3(x)
        h = h * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
        h = self.mlp(h)
        h = h * gate_mlp.unsqueeze(1)
        x = x + h
        return x

    def forward(self, x: torch.Tensor, mod: torch.Tensor, dpt_mod: torch.Tensor, context: torch.Tensor, context_extra: torch.Tensor):
        # Optionally recompute activations during backward to save memory.
        if self.use_checkpoint:
            return torch.utils.checkpoint.checkpoint(self._forward, x, mod, dpt_mod, context, context_extra, use_reentrant=False)
        else:
            return self._forward(x, mod, dpt_mod, context, context_extra)
289
+
threeDFixer/modules/utils.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the TRELLIS project:
2
+ # https://github.com/microsoft/TRELLIS
3
+ # Original license: MIT
4
+ # Copyright (c) the TRELLIS authors
5
+
6
+ import torch.nn as nn
7
+ from ..modules import sparse as sp
8
+
9
# Module types whose parameters are converted by convert_module_to_f16/_f32:
# all dense and sparse conv/linear layers. Normalization layers are not listed,
# so the converters leave them untouched.
FP16_MODULES = (
    nn.Conv1d,
    nn.Conv2d,
    nn.Conv3d,
    nn.ConvTranspose1d,
    nn.ConvTranspose2d,
    nn.ConvTranspose3d,
    nn.Linear,
    sp.SparseConv3d,
    sp.SparseInverseConv3d,
    sp.SparseLinear,
)
21
+
22
def convert_module_to_f16(l):
    """
    Convert primitive modules to float16.

    No-op for module types outside FP16_MODULES; conversion is in place.
    """
    if not isinstance(l, FP16_MODULES):
        return
    for param in l.parameters():
        param.data = param.data.half()
29
+
30
+
31
def convert_module_to_f32(l):
    """
    Convert primitive modules to float32, undoing convert_module_to_f16().

    No-op for module types outside FP16_MODULES; conversion is in place.
    """
    if not isinstance(l, FP16_MODULES):
        return
    for param in l.parameters():
        param.data = param.data.float()
38
+
39
+
40
def zero_module(module):
    """
    Zero out the parameters of a module, in place, and return the module.
    """
    for weight in module.parameters():
        # detach() so the in-place write is not tracked by autograd.
        weight.detach().zero_()
    return module
47
+
48
+
49
def scale_module(module, scale):
    """
    Scale the parameters of a module by `scale`, in place, and return the module.
    """
    for weight in module.parameters():
        # detach() so the in-place write is not tracked by autograd.
        weight.detach().mul_(scale)
    return module
56
+
57
+
58
def modulate(x, shift, scale):
    """Apply adaLN modulation: broadcast per-sample shift/scale over the token axis."""
    scale = scale.unsqueeze(1)
    shift = shift.unsqueeze(1)
    return x * (1 + scale) + shift
threeDFixer/moge/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Copied from the MoGe project:
2
+ # https://github.com/microsoft/MoGe
3
+ # Original license: MIT
4
+ # Copyright (c) the MoGe authors
5
+