# somatosmpl/tests/test_device.py
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import os
import sys
import unittest
from pathlib import Path
import torch
from soma import SomaLayer
# Directory (relative to the CWD the tests run from) holding model assets,
# e.g. SOMA_neutral.npz; tests skip themselves when it is missing.
ASSETS_DIR = Path("assets")
class TestNovaDevice(unittest.TestCase):
    """Device-placement tests for SomaLayer.

    Covers: CPU initialization, direct CUDA initialization (cuda:0 / cuda:1),
    moving an initialized model CPU -> GPU (and GPU -> GPU), and a GPU -> CPU
    round-trip simulating DDP teardown.
    """

    def setUp(self):
        """Skip when assets are missing; build dummy pose/translation inputs."""
        self.data_root = "assets"
        if not ASSETS_DIR.is_dir():
            self.skipTest("Assets not found, skipping device test")
        if not (ASSETS_DIR / "SOMA_neutral.npz").is_file():
            self.skipTest("Core asset SOMA_neutral.npz not found")
        # Dummy inputs (identity/scale shapes are set in _make_inputs after layer creation)
        self.batch_size = 1
        self.num_joints = 77  # Nova skeleton
        self.pose = torch.zeros(self.batch_size, self.num_joints, 3)
        self.transl = torch.zeros(self.batch_size, 3)

    def _make_inputs(self, layer):
        """Build identity_coeffs and scale_params from the layer's identity model.

        Returns (identity_coeffs, scale_params); scale_params is None when the
        identity model declares no scale parameters.
        """
        im = layer.identity_model
        identity_coeffs = torch.zeros(self.batch_size, im.num_identity_coeffs)
        scale_params = None
        if im.num_scale_params is not None:
            scale_params = torch.zeros(self.batch_size, im.num_scale_params)
        return identity_coeffs, scale_params

    @staticmethod
    def _to(tensor, device):
        """Move `tensor` to `device`, passing None through unchanged.

        _make_inputs may return scale_params=None; calling .to() on it would
        raise AttributeError, so optional tensors must go through this helper.
        """
        return tensor.to(device) if tensor is not None else None

    def test_cpu_initialization(self):
        """Test initializing on CPU."""
        device = "cpu"
        model = SomaLayer(data_root=self.data_root, device=device, identity_model_type="mhr")
        identity_coeffs, scale_params = self._make_inputs(model)
        # Check if parameters are on CPU
        for param in model.parameters():
            self.assertEqual(param.device.type, "cpu")
        # Check if buffers are on CPU
        for buffer in model.buffers():
            self.assertEqual(buffer.device.type, "cpu")
        # Run forward pass
        out = model(
            self.pose.to(device),
            identity_coeffs.to(device),
            self._to(scale_params, device),
            self.transl.to(device),
        )
        self.assertTrue("vertices" in out)

    def test_cuda0_only(self):
        """Initialize directly on cuda:0 and run a forward pass."""
        # Fix: previously this test had no CUDA guard and failed on CPU-only hosts.
        if not torch.cuda.is_available():
            self.skipTest("CUDA not available")
        target_device = "cuda:0"
        print(f"Initializing on {target_device}...")
        model = SomaLayer(data_root=self.data_root, device=target_device, identity_model_type="mhr")
        identity_coeffs, scale_params = self._make_inputs(model)
        out = model(
            self.pose.to(target_device),
            identity_coeffs.to(target_device),
            self._to(scale_params, target_device),
            self.transl.to(target_device),
        )
        self.assertTrue("vertices" in out)

    def test_cuda1_only(self):
        """Initialize directly on cuda:1 (second GPU) and run a forward pass."""
        # device_count() is 0 without CUDA, so this also skips on CPU-only hosts.
        if torch.cuda.device_count() < 2:
            self.skipTest("Need 2 GPUs")
        target_device = "cuda:1"
        print(f"Initializing on {target_device}...")
        model = SomaLayer(data_root=self.data_root, device=target_device, identity_model_type="mhr")
        identity_coeffs, scale_params = self._make_inputs(model)
        out = model(
            self.pose.to(target_device),
            identity_coeffs.to(target_device),
            self._to(scale_params, target_device),
            self.transl.to(target_device),
        )
        self.assertTrue("vertices" in out)

    def test_move_to_gpu(self):
        """Test moving model from CPU to GPU."""
        if not torch.cuda.is_available():
            self.skipTest("CUDA not available")
        device = "cpu"
        target_device = "cuda:0"
        target_device2 = "cuda:1" if torch.cuda.device_count() > 1 else None
        print(f"\nInitializing on {device}...")
        model = SomaLayer(data_root=self.data_root, device=device, identity_model_type="mhr")
        identity_coeffs, scale_params = self._make_inputs(model)
        print(f"Moving to {target_device}...")
        model.to(target_device)
        # Check if parameters moved
        print("Checking parameters...")
        for name, param in model.named_parameters():
            self.assertEqual(param.device.type, "cuda", f"Parameter {name} is not on CUDA")
        # Check if buffers moved
        print("Checking buffers...")
        for name, buffer in model.named_buffers():
            self.assertEqual(buffer.device.type, "cuda", f"Buffer {name} is not on CUDA")
        # CRITICAL: Check if the internal device attribute matches
        # This is where we expect bad design to fail or be inconsistent
        print(f"Model internal device attribute: {model.device}")
        # Run forward pass on GPU
        print("Running forward pass on GPU...")
        try:
            out = model(
                self.pose.to(target_device),
                identity_coeffs.to(target_device),
                self._to(scale_params, target_device),
                self.transl.to(target_device),
            )
            self.assertTrue("vertices" in out)
            self.assertEqual(out["vertices"].device.type, "cuda")
        except RuntimeError as e:
            self.fail(f"Forward pass failed after moving to GPU: {e}")
        if target_device2 is not None:
            print(f"Moving to {target_device2}...")
            model.to(target_device2)
            out = model(
                self.pose.to(target_device2),
                identity_coeffs.to(target_device2),
                self._to(scale_params, target_device2),
                self.transl.to(target_device2),
            )

    def test_gpu_to_cpu_roundtrip(self):
        """Simulate DDP teardown: model moved from GPU back to CPU."""
        if not torch.cuda.is_available():
            self.skipTest("CUDA not available")
        model = SomaLayer(data_root=self.data_root, device="cpu", identity_model_type="mhr")
        identity_coeffs, scale_params = self._make_inputs(model)
        model.to("cuda:0")
        try:
            model.cpu()
        except RuntimeError as e:
            self.fail(f"Moving SomaLayer from GPU to CPU failed (DDP teardown): {e}")
        # Verify forward pass on CPU still works
        out = model(self.pose, identity_coeffs, scale_params, self.transl)
        self.assertTrue("vertices" in out)
class TestSkeletonTransferDevice(unittest.TestCase):
    """Unit tests for SkeletonTransfer device transfer — no assets required."""

    def _make_skeleton_transfer(self, device="cpu"):
        """Construct a tiny synthetic SkeletonTransfer on the given device."""
        from soma.geometry.skeleton_transfer import SkeletonTransfer

        num_joints, num_verts = 5, 20
        # Pass joint_parent_ids as a tensor to exercise the device-mismatch bug.
        parent_ids = torch.tensor([0, 0, 1, 2, 3])
        bind_transforms = torch.eye(4).unsqueeze(0).repeat(num_joints, 1, 1)
        rest_shape = torch.randn(num_verts, 3)
        weights = torch.rand(num_verts, num_joints)
        weights /= weights.sum(dim=1, keepdim=True)
        return SkeletonTransfer(
            parent_ids.to(device),
            bind_transforms.to(device),
            rest_shape.to(device),
            weights.to(device),
            use_warp_for_rotations=False,
            use_sparse_rbf_matrix=False,
        )

    def test_init_with_tensor_joint_parent_ids(self):
        """joint_parent_ids passed as a CPU tensor must not cause device errors."""
        transfer = self._make_skeleton_transfer("cpu")
        self.assertIsNotNone(transfer.regressor_mask)

    def test_gpu_to_cpu_roundtrip(self):
        """Simulates DDP teardown: SkeletonTransfer on GPU moved back to CPU."""
        if not torch.cuda.is_available():
            self.skipTest("CUDA not available")
        transfer = self._make_skeleton_transfer("cpu")
        transfer.to("cuda")
        try:
            transfer.cpu()
        except RuntimeError as e:
            self.fail(f"Moving SkeletonTransfer from GPU to CPU failed: {e}")

    def test_cpu_to_gpu_roundtrip(self):
        """Moving from CPU to GPU and back must leave all buffers on CPU."""
        if not torch.cuda.is_available():
            self.skipTest("CUDA not available")
        transfer = self._make_skeleton_transfer("cpu")
        transfer.cuda()
        transfer.cpu()
        for name, buf in transfer.named_buffers():
            if buf is None:
                continue
            self.assertEqual(
                buf.device.type, "cpu", f"Buffer {name} not on CPU after round-trip"
            )
def _ddp_worker(rank, world_size, data_root, broadcast_buffers):
    """Worker run in each DDP subprocess via mp.spawn.

    Args:
        rank: This process's rank; also used as its CUDA device index.
        world_size: Total number of spawned DDP processes.
        data_root: Asset directory forwarded to SomaLayer.
        broadcast_buffers: Forwarded to DistributedDataParallel.
    """
    # Must be set before CUDA / Warp initialize in the child process
    os.environ["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"
    import warp as wp

    wp.config.enable_mempools_at_init = False
    import torch
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP

    from soma import SomaLayer

    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29501")
    # Fix: bind this rank to its GPU before creating the NCCL process group;
    # NCCL expects a current device per rank, otherwise collectives can
    # silently target cuda:0 in every process.
    torch.cuda.set_device(rank)
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    try:
        device = torch.device(f"cuda:{rank}")
        soma = SomaLayer(data_root=data_root, device=device, identity_model_type="mhr")

        # Wrap in a model that has learnable params (more realistic DDP use-case)
        class _HeadModel(torch.nn.Module):
            def __init__(self, soma_layer):
                super().__init__()
                self.soma = soma_layer
                # Tiny learnable head so DDP has gradients to synchronize
                self.head = torch.nn.Linear(3, 1)

            def forward(self, pose, id_coeffs, scale_params, transl):
                out = self.soma(pose, id_coeffs, scale_params=scale_params, transl=transl)
                return self.head(out["vertices"].mean(dim=1))  # (B, 1)

        model = _HeadModel(soma).to(device)
        ddp_model = DDP(model, device_ids=[rank], broadcast_buffers=broadcast_buffers)

        batch_size = 1
        num_joints = 77  # Nova skeleton
        im = soma.identity_model
        pose = torch.zeros(batch_size, num_joints, 3, device=device)
        id_coeffs = torch.zeros(batch_size, im.num_identity_coeffs, device=device)
        # Fix: mirror TestNovaDevice._make_inputs — identity models with
        # num_scale_params=None previously crashed torch.zeros with a TypeError.
        scale_params = None
        if im.num_scale_params is not None:
            scale_params = torch.zeros(batch_size, im.num_scale_params, device=device)
        transl = torch.zeros(batch_size, 3, device=device)

        out = ddp_model(pose, id_coeffs, scale_params, transl)
        loss = out.sum()
        loss.backward()  # syncs head gradients via DDP
    finally:
        dist.destroy_process_group()
class TestDDPCompatibility(unittest.TestCase):
    """Verify SomaLayer wrapped in DDP does not crash due to sparse tensor buffers."""

    @classmethod
    def setUpClass(cls):
        """Gate the whole class on Linux + assets + CUDA; pick a world size."""
        if sys.platform != "linux":
            raise unittest.SkipTest("DDP/NCCL tests require Linux")
        if not ASSETS_DIR.is_dir():
            raise unittest.SkipTest("Assets not found")
        if not torch.cuda.is_available():
            raise unittest.SkipTest("CUDA not available")
        cls.data_root = "assets"
        # Use at most two ranks, capped by the GPUs actually present.
        cls.world_size = min(2, torch.cuda.device_count())

    def _run_ddp(self, broadcast_buffers):
        """Spawn one _ddp_worker per rank and join them."""
        import torch.multiprocessing as mp

        mp.spawn(
            _ddp_worker,
            args=(self.world_size, self.data_root, broadcast_buffers),
            nprocs=self.world_size,
            join=True,
        )

    def test_ddp_broadcast_buffers_false(self):
        """DDP with broadcast_buffers=False: sparse tensor buffers not broadcast → should pass."""
        self._run_ddp(broadcast_buffers=False)

    def test_ddp_broadcast_buffers_true(self):
        """DDP with broadcast_buffers=True (default): sparse buffers are plain attrs → should pass."""
        self._run_ddp(broadcast_buffers=True)
# Allow running this module directly (e.g. `python tests/test_device.py`).
if __name__ == "__main__":
    unittest.main()