Spaces:

TencentARC
/

Pixal3D

Running

App Files Community

Yang2001 committed 4 days ago

Commit

551545a

1 Parent(s): 53ad659

Fix device mismatch, use remote RMBG client, improve progress tracking, translate comments to English

Browse files

Files changed (8)

app.py +13 -2
autotune_cache.json +30 -0
trellis2/datasets/components.py +4 -4
trellis2/datasets/sparse_structure_latent.py +1 -1
trellis2/datasets/structured_latent_shape.py +1 -1
trellis2/pipelines/pixal3d_image_to_3d.py +7 -7
trellis2/pipelines/trellis2_image_to_3d.py +1 -1
trellis2/trainers/basic.py +20 -20

app.py CHANGED Viewed

@@ -140,9 +140,15 @@ def init_models():
         pipeline.image_cond_model_shape_1024 = build_image_cond_model(IMAGE_COND_CONFIGS["shape_1024"])
         pipeline.image_cond_model_tex_1024 = build_image_cond_model(IMAGE_COND_CONFIGS["tex_1024"])
-        pipeline.cuda()
         pipeline.rembg_model = None  # Use remote BRIA-RMBG-2.0 instead
         pipeline.low_vram = False
         print("[NAF] Pre-loading NAF upsampler model...")
         for attr in ['image_cond_model_ss', 'image_cond_model_shape_512', 'image_cond_model_shape_1024', 'image_cond_model_tex_1024']:
@@ -328,6 +334,10 @@ class _TqdmProgressInterceptor(_original_tqdm):
         self._stage_desc = kwargs.get('desc', 'Processing')
         super().__init__(*args, **kwargs)
     def update(self, n=1):
         super().update(n)
         _update_progress(self._stage_desc, self.n, self.total or 0)
@@ -339,6 +349,8 @@ import trellis2.pipelines.samplers.flow_euler as _fe_module
 _fe_module.tqdm = _TqdmProgressInterceptor
 import trellis2.utils.render_utils as _ru_module
 _ru_module.tqdm = _TqdmProgressInterceptor
 # ============================================================================
 # API Implementation
@@ -494,7 +506,6 @@ def extract_glb_api(state_path: str, decimation_target: int, texture_size: int,
     mesh = pipeline.decode_latent(shape_slat, tex_slat, res)[0]
     _update_progress("Decoding latent", 1, 1)
-    _update_progress("Extracting GLB mesh", 0, 1)
     glb = o_voxel.postprocess.to_glb(
         vertices=mesh.vertices, faces=mesh.faces, attr_volume=mesh.attrs,
         coords=mesh.coords, attr_layout=pipeline.pbr_attr_layout,

         pipeline.image_cond_model_shape_1024 = build_image_cond_model(IMAGE_COND_CONFIGS["shape_1024"])
         pipeline.image_cond_model_tex_1024 = build_image_cond_model(IMAGE_COND_CONFIGS["tex_1024"])
         pipeline.rembg_model = None  # Use remote BRIA-RMBG-2.0 instead
         pipeline.low_vram = False
+        pipeline.cuda()
+        # Ensure image_cond_models are on GPU
+        pipeline.image_cond_model_ss.cuda()
+        pipeline.image_cond_model_shape_512.cuda()
+        pipeline.image_cond_model_shape_1024.cuda()
+        pipeline.image_cond_model_tex_1024.cuda()
         print("[NAF] Pre-loading NAF upsampler model...")
         for attr in ['image_cond_model_ss', 'image_cond_model_shape_512', 'image_cond_model_shape_1024', 'image_cond_model_tex_1024']:
         self._stage_desc = kwargs.get('desc', 'Processing')
         super().__init__(*args, **kwargs)
+    def set_description(self, desc=None, refresh=True):
+        self._stage_desc = desc or 'Processing'
+        super().set_description(desc, refresh)
     def update(self, n=1):
         super().update(n)
         _update_progress(self._stage_desc, self.n, self.total or 0)
 _fe_module.tqdm = _TqdmProgressInterceptor
 import trellis2.utils.render_utils as _ru_module
 _ru_module.tqdm = _TqdmProgressInterceptor
+import o_voxel.postprocess as _ovp_module
+_ovp_module.tqdm = _TqdmProgressInterceptor
 # ============================================================================
 # API Implementation
     mesh = pipeline.decode_latent(shape_slat, tex_slat, res)[0]
     _update_progress("Decoding latent", 1, 1)
     glb = o_voxel.postprocess.to_glb(
         vertices=mesh.vertices, faces=mesh.faces, attr_volume=mesh.attrs,
         coords=mesh.coords, attr_layout=pipeline.pbr_attr_layout,

autotune_cache.json CHANGED Viewed

@@ -24944,6 +24944,36 @@
                 "reg_inc_consumer": 0,
                 "maxnreg": null,
                 "pre_hook": null
             }
         },
         "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm.sparse_submanifold_conv_bwd_input_implicit_gemm_kernel": {

                 "reg_inc_consumer": 0,
                 "maxnreg": null,
                 "pre_hook": null
+            },
+            "(23, 7552645, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": {
+                "kwargs": {
+                    "BM": 16,
+                    "BK": 8
+                },
+                "num_warps": 2,
+                "num_ctas": 1,
+                "num_stages": 2,
+                "num_buffers_warp_spec": 0,
+                "num_consumer_groups": 0,
+                "reg_dec_producer": 0,
+                "reg_inc_consumer": 0,
+                "maxnreg": null,
+                "pre_hook": null
+            },
+            "(22, 7813095, 6, 8, 'torch.float32', 'torch.uint32', 'torch.float32', 'torch.float32')": {
+                "kwargs": {
+                    "BM": 16,
+                    "BK": 8
+                },
+                "num_warps": 2,
+                "num_ctas": 1,
+                "num_stages": 2,
+                "num_buffers_warp_spec": 0,
+                "num_consumer_groups": 0,
+                "reg_dec_producer": 0,
+                "reg_inc_consumer": 0,
+                "maxnreg": null,
+                "pre_hook": null
             }
         },
         "flex_gemm.kernels.triton.spconv.sparse_submanifold_conv_bwd_implicit_gemm.sparse_submanifold_conv_bwd_input_implicit_gemm_kernel": {

trellis2/datasets/components.py CHANGED Viewed

@@ -57,16 +57,16 @@ class StandardDatasetBase(Dataset):
                 self._stats[key] = {}
                 metadata = pd.DataFrame(columns=['sha256']).set_index('sha256')
-                # 只从 ss_latent 和 render_cond 合并关键字段
-                # 不包含 base，因为 base/metadata.csv 中的 cond_rendered=False 会错误覆盖真实值
                 for sub_key, r in root.items():
                     if sub_key == 'base':
-                        continue  # 跳过 base 目录
                     metadata_file = os.path.join(r, 'metadata.csv')
                     if os.path.exists(metadata_file):
                         metadata = metadata.combine_first(pd.read_csv(metadata_file).set_index('sha256'))
-                # 从 base 单独读取 aesthetic_score（不读取其他可能冲突的列）
                 if 'base' in root:
                     base_metadata_file = os.path.join(root['base'], 'metadata.csv')
                     if os.path.exists(base_metadata_file):

                 self._stats[key] = {}
                 metadata = pd.DataFrame(columns=['sha256']).set_index('sha256')
+                # Only merge key fields from ss_latent and render_cond
+                # Exclude base, because cond_rendered=False in base/metadata.csv would incorrectly overwrite real values
                 for sub_key, r in root.items():
                     if sub_key == 'base':
+                        continue  # Skip base directory
                     metadata_file = os.path.join(r, 'metadata.csv')
                     if os.path.exists(metadata_file):
                         metadata = metadata.combine_first(pd.read_csv(metadata_file).set_index('sha256'))
+                # Read aesthetic_score separately from base (avoid reading other potentially conflicting columns)
                 if 'base' in root:
                     base_metadata_file = os.path.join(root['base'], 'metadata.csv')
                     if os.path.exists(base_metadata_file):

trellis2/datasets/sparse_structure_latent.py CHANGED Viewed

@@ -349,7 +349,7 @@ class SparseStructureLatentView(SparseStructureLatentVisMixin, StandardDatasetBa
         if existing_view_cols:
             # Filter rows where all required views are encoded
-            # 注意：NaN 需要被视为 False，所以用 == True 显式比较
             has_all_views = (metadata[existing_view_cols] == True).all(axis=1)
             metadata = metadata[has_all_views]
             stats[f'With {self.num_views} view latents'] = len(metadata)

         if existing_view_cols:
             # Filter rows where all required views are encoded
+            # Note: NaN should be treated as False, so use == True for explicit comparison
             has_all_views = (metadata[existing_view_cols] == True).all(axis=1)
             metadata = metadata[has_all_views]
             stats[f'With {self.num_views} view latents'] = len(metadata)

trellis2/datasets/structured_latent_shape.py CHANGED Viewed

@@ -293,7 +293,7 @@ class SLatShapeView(SLatShapeVisMixin, SLat):
         if existing_view_cols:
             # Filter rows where all required views are encoded
-            # 注意：NaN 需要被视为 False，所以用 == True 显式比较
             has_all_views = (metadata[existing_view_cols] == True).all(axis=1)
             metadata = metadata[has_all_views]
             stats[f'With {self.num_views} view latents'] = len(metadata)

         if existing_view_cols:
             # Filter rows where all required views are encoded
+            # Note: NaN should be treated as False, so use == True for explicit comparison
             has_all_views = (metadata[existing_view_cols] == True).all(axis=1)
             metadata = metadata[has_all_views]
             stats[f'With {self.num_views} view latents'] = len(metadata)

trellis2/pipelines/pixal3d_image_to_3d.py CHANGED Viewed

@@ -14,9 +14,9 @@ class Pixal3DImageTo3DPipeline(Pipeline):
     """
     Pipeline for inferring Pixal3D (proj mode) image-to-3D models.
-    基于 Trellis2 pipeline，使用 proj 模式进行推理。
-    每个 stage (SS, Shape 512, Shape 1024, Tex 1024) 有独立的 image_cond_model (DinoV3ProjFeatureExtractor)。
-    条件构建使用 camera-aware projection（需要 camera_angle_x, distance, mesh_scale 参数）。
     Args:
         models (dict[str, nn.Module]): The models to use in the pipeline.
@@ -114,13 +114,13 @@ class Pixal3DImageTo3DPipeline(Pipeline):
         pipeline.shape_slat_normalization = args['shape_slat_normalization']
         pipeline.tex_slat_normalization = args['tex_slat_normalization']
-        # Proj mode: image_cond_models 需要外部加载后设置，这里先置为 None
         pipeline.image_cond_model_ss = None
         pipeline.image_cond_model_shape_512 = None
         pipeline.image_cond_model_shape_1024 = None
         pipeline.image_cond_model_tex_1024 = None
-        pipeline.rembg_model = getattr(rembg, args['rembg_model']['name'])(**args['rembg_model']['args'])
         pipeline.low_vram = args.get('low_vram', True)
         pipeline.default_pipeline_type = args.get('default_pipeline_type', '1024_cascade')
@@ -186,7 +186,7 @@ class Pixal3DImageTo3DPipeline(Pipeline):
         return output
     # =========================================================================
-    # Proj 模式条件构建
     # =========================================================================
     @torch.no_grad()
@@ -295,7 +295,7 @@ class Pixal3DImageTo3DPipeline(Pipeline):
         }
     # =========================================================================
-    # Sampling methods (保持与 Trellis2 一致)
     # =========================================================================
     def sample_sparse_structure(

     """
     Pipeline for inferring Pixal3D (proj mode) image-to-3D models.
+    Based on Trellis2 pipeline, using proj mode for inference.
+    Each stage (SS, Shape 512, Shape 1024, Tex 1024) has its own image_cond_model (DinoV3ProjFeatureExtractor).
+    Condition building uses camera-aware projection (requires camera_angle_x, distance, mesh_scale parameters).
     Args:
         models (dict[str, nn.Module]): The models to use in the pipeline.
         pipeline.shape_slat_normalization = args['shape_slat_normalization']
         pipeline.tex_slat_normalization = args['tex_slat_normalization']
+        # Proj mode: image_cond_models need to be loaded externally, set to None here
         pipeline.image_cond_model_ss = None
         pipeline.image_cond_model_shape_512 = None
         pipeline.image_cond_model_shape_1024 = None
         pipeline.image_cond_model_tex_1024 = None
+        pipeline.rembg_model = None  # Skip local RMBG loading; use remote client instead
         pipeline.low_vram = args.get('low_vram', True)
         pipeline.default_pipeline_type = args.get('default_pipeline_type', '1024_cascade')
         return output
     # =========================================================================
+    # Proj mode condition building
     # =========================================================================
     @torch.no_grad()
         }
     # =========================================================================
+    # Sampling methods (consistent with Trellis2)
     # =========================================================================
     def sample_sparse_structure(

trellis2/pipelines/trellis2_image_to_3d.py CHANGED Viewed

@@ -101,7 +101,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
         pipeline.shape_slat_normalization = args['shape_slat_normalization']
         pipeline.tex_slat_normalization = args['tex_slat_normalization']
-        # HACK: 替换 dinov3 模型源为 camenduru 镜像
         image_cond_args = args['image_cond_model']['args'].copy()
         if image_cond_args.get('model_name') == 'facebook/dinov3-vitl16-pretrain-lvd1689m':
             image_cond_args['model_name'] = 'camenduru/dinov3-vitl16-pretrain-lvd1689m'

         pipeline.shape_slat_normalization = args['shape_slat_normalization']
         pipeline.tex_slat_normalization = args['tex_slat_normalization']
+        # HACK: Replace dinov3 model source with camenduru mirror
         image_cond_args = args['image_cond_model']['args'].copy()
         if image_cond_args.get('model_name') == 'facebook/dinov3-vitl16-pretrain-lvd1689m':
             image_cond_args['model_name'] = 'camenduru/dinov3-vitl16-pretrain-lvd1689m'

trellis2/trainers/basic.py CHANGED Viewed

@@ -491,7 +491,7 @@ class BasicTrainer:
         Finetune from a checkpoint.
         Should be called by all processes.
         """
-        # 允许缺失的 keys（如 register_buffer 的参数）
         ALLOWED_MISSING_KEYS = {'rope_phases'}
         if self.is_master:
@@ -508,7 +508,7 @@ class BasicTrainer:
                 # Remap checkpoint keys to handle structural changes (e.g., ProjectAttention wrapper)
                 model_ckpt = self._remap_checkpoint_keys(model_ckpt, model_state_dict)
-                # 检查多余的 keys（在 ckpt 中但不在 model 中）
                 for k, v in model_ckpt.items():
                     if k not in model_state_dict:
                         if self.is_master:
@@ -520,7 +520,7 @@ class BasicTrainer:
                         model_ckpt[k] = model_state_dict[k]
                 model_ckpt = {k: v for k, v in model_ckpt.items() if v is not None}
-                # 检查缺失的 keys（在 model 中但不在 ckpt 中）
                 missing_keys = set(model_state_dict.keys()) - set(model_ckpt.keys())
                 unexpected_missing = missing_keys - ALLOWED_MISSING_KEYS
                 if unexpected_missing and self.is_master:
@@ -529,7 +529,7 @@ class BasicTrainer:
                 if missing_keys & ALLOWED_MISSING_KEYS and self.is_master:
                     print(f'Info: Using model initialized values for: {missing_keys & ALLOWED_MISSING_KEYS}')
-                # 补充缺失的 keys（使用模型初始化值）
                 for k in missing_keys:
                     model_ckpt[k] = model_state_dict[k]
@@ -903,16 +903,16 @@ class BasicTrainer:
     def _verify_gradient_sync(self):
         """
-        验证 DDP 梯度同步是否真正生效。
-        DDP 的 backward 会自动对梯度进行 all_reduce，同步后所有卡的梯度应该完全相同。
-        验证方法：
-        1. 计算所有参数的总梯度 norm
-        2. 收集各卡的梯度 norm
-        3. 如果 DDP 同步正常，所有卡的梯度 norm 应该完全相同
-        4. 如果没有同步，各卡梯度 norm 会不同（因为各卡处理的数据不同）
         """
-        # 计算本卡所有参数的总梯度 norm
         total_grad_norm_sq = 0.0
         grad_count = 0
         for p in self.model_params:
@@ -925,16 +925,16 @@ class BasicTrainer:
         local_grad_norm = total_grad_norm_sq ** 0.5
-        # 确保所有进程到达同一点
         dist.barrier()
-        # 收集所有卡的梯度 norm
         grad_norm_tensor = torch.tensor([local_grad_norm], dtype=torch.float64, device=self.device)
         all_grad_norms = [torch.zeros(1, dtype=torch.float64, device=self.device) for _ in range(self.world_size)]
         dist.all_gather(all_grad_norms, grad_norm_tensor)
         all_grad_norms = [g.item() for g in all_grad_norms]
-        # 验证所有卡的梯度 norm 是否相同（使用相对误差，容忍 0.1%）
         ref_norm = all_grad_norms[0]
         if ref_norm > 0:
             is_synced = all(abs(g - ref_norm) / ref_norm < 1e-3 for g in all_grad_norms)
@@ -1010,7 +1010,7 @@ class BasicTrainer:
                     loss, status = self.training_losses(**mb_data)
                     l = loss['loss'] / len(data_list)
-                    # DEBUG: 打印每个 rank 的 loss
                     if self.debug:
                         print(f'[Rank {self.rank}/{self.world_size}] Step {self.step} batch {i}: loss={loss["loss"].item():.6f}')
@@ -1029,10 +1029,10 @@ class BasicTrainer:
                 elastic_controller_logs.append(self.elastic_controller.log())
         # ============================================================
-        # DEBUG: 验证 DDP 梯度同步
-        # 检查 backward 后各卡梯度是否一致
-        # DDP 在最后一个 batch_split 的 backward 时会自动 all_reduce 梯度
-        # 同步后所有卡的梯度应该完全相同
         # ============================================================
         if self.debug and self.world_size > 1:
             self._verify_gradient_sync()

         Finetune from a checkpoint.
         Should be called by all processes.
         """
+        # Allow missing keys (e.g., register_buffer parameters)
         ALLOWED_MISSING_KEYS = {'rope_phases'}
         if self.is_master:
                 # Remap checkpoint keys to handle structural changes (e.g., ProjectAttention wrapper)
                 model_ckpt = self._remap_checkpoint_keys(model_ckpt, model_state_dict)
+                # Check extra keys (in ckpt but not in model)
                 for k, v in model_ckpt.items():
                     if k not in model_state_dict:
                         if self.is_master:
                         model_ckpt[k] = model_state_dict[k]
                 model_ckpt = {k: v for k, v in model_ckpt.items() if v is not None}
+                # Check missing keys (in model but not in ckpt)
                 missing_keys = set(model_state_dict.keys()) - set(model_ckpt.keys())
                 unexpected_missing = missing_keys - ALLOWED_MISSING_KEYS
                 if unexpected_missing and self.is_master:
                 if missing_keys & ALLOWED_MISSING_KEYS and self.is_master:
                     print(f'Info: Using model initialized values for: {missing_keys & ALLOWED_MISSING_KEYS}')
+                # Fill in missing keys (using model initialized values)
                 for k in missing_keys:
                     model_ckpt[k] = model_state_dict[k]
     def _verify_gradient_sync(self):
         """
+        Verify that DDP gradient synchronization is working correctly.
+        DDP's backward automatically performs all_reduce on gradients; after sync all ranks should have identical gradients.
+        Verification method:
+        1. Compute total gradient norm across all parameters
+        2. Gather gradient norms from all ranks
+        3. If DDP sync is working, all ranks should have identical gradient norms
+        4. If not synced, gradient norms will differ (since each rank processes different data)
         """
+        # Compute total gradient norm on this rank
         total_grad_norm_sq = 0.0
         grad_count = 0
         for p in self.model_params:
         local_grad_norm = total_grad_norm_sq ** 0.5
+        # Ensure all processes reach the same point
         dist.barrier()
+        # Gather gradient norms from all ranks
         grad_norm_tensor = torch.tensor([local_grad_norm], dtype=torch.float64, device=self.device)
         all_grad_norms = [torch.zeros(1, dtype=torch.float64, device=self.device) for _ in range(self.world_size)]
         dist.all_gather(all_grad_norms, grad_norm_tensor)
         all_grad_norms = [g.item() for g in all_grad_norms]
+        # Verify all ranks have the same gradient norm (relative error tolerance: 0.1%)
         ref_norm = all_grad_norms[0]
         if ref_norm > 0:
             is_synced = all(abs(g - ref_norm) / ref_norm < 1e-3 for g in all_grad_norms)
                     loss, status = self.training_losses(**mb_data)
                     l = loss['loss'] / len(data_list)
+                    # DEBUG: Print loss for each rank
                     if self.debug:
                         print(f'[Rank {self.rank}/{self.world_size}] Step {self.step} batch {i}: loss={loss["loss"].item():.6f}')
                 elastic_controller_logs.append(self.elastic_controller.log())
         # ============================================================
+        # DEBUG: Verify DDP gradient synchronization
+        # Check if gradients are consistent across ranks after backward
+        # DDP automatically all_reduces gradients during the last batch_split's backward
+        # After sync, all ranks should have identical gradients
         # ============================================================
         if self.debug and self.world_size > 1:
             self._verify_gradient_sync()