Spaces: Running on Zero
Kimodo Bot committed on
Commit · 6d5047c
Parent(s): d6cb863
Add core kimodo package modules required by native demo
This view is limited to 50 files because it contains too many changes. See raw diff
- kimodo/__init__.py +11 -0
- kimodo/assets.py +19 -0
- kimodo/constraints.py +625 -0
- kimodo/exports/__init__.py +65 -0
- kimodo/exports/bvh.py +282 -0
- kimodo/exports/motion_convert_lib.py +155 -0
- kimodo/exports/motion_formats.py +78 -0
- kimodo/exports/motion_io.py +443 -0
- kimodo/exports/mujoco.py +588 -0
- kimodo/exports/smplx.py +251 -0
- kimodo/geometry.py +216 -0
- kimodo/meta.py +80 -0
- kimodo/metrics/__init__.py +39 -0
- kimodo/metrics/base.py +66 -0
- kimodo/metrics/constraints.py +87 -0
- kimodo/metrics/foot_skate.py +232 -0
- kimodo/metrics/tmr.py +530 -0
- kimodo/model/__init__.py +31 -0
- kimodo/model/backbone.py +312 -0
- kimodo/model/cfg.py +133 -0
- kimodo/model/common.py +48 -0
- kimodo/model/diffusion.py +133 -0
- kimodo/model/kimodo_model.py +605 -0
- kimodo/model/llm2vec/README.md +1 -0
- kimodo/model/llm2vec/__init__.py +11 -0
- kimodo/model/llm2vec/llm2vec.py +477 -0
- kimodo/model/llm2vec/llm2vec_wrapper.py +73 -0
- kimodo/model/llm2vec/models/__init__.py +4 -0
- kimodo/model/llm2vec/models/attn_mask_utils.py +181 -0
- kimodo/model/llm2vec/models/bidirectional_llama.py +224 -0
- kimodo/model/llm2vec/models/utils.py +32 -0
- kimodo/model/load_model.py +194 -0
- kimodo/model/loading.py +81 -0
- kimodo/model/registry.py +473 -0
- kimodo/model/text_encoder_api.py +74 -0
- kimodo/model/tmr.py +382 -0
- kimodo/model/twostage_denoiser.py +153 -0
- kimodo/motion_rep/__init__.py +11 -0
- kimodo/motion_rep/conditioning.py +28 -0
- kimodo/motion_rep/feature_utils.py +212 -0
- kimodo/motion_rep/feet.py +60 -0
- kimodo/motion_rep/reps/__init__.py +13 -0
- kimodo/motion_rep/reps/base.py +300 -0
- kimodo/motion_rep/reps/kimodo_motionrep.py +301 -0
- kimodo/motion_rep/reps/tmr_motionrep.py +222 -0
- kimodo/motion_rep/smooth_root.py +234 -0
- kimodo/motion_rep/stats.py +123 -0
- kimodo/pipeline/__init__.py +28 -0
- kimodo/pipeline/blend_quality.py +116 -0
- kimodo/pipeline/scheduler_runtime.py +139 -0
kimodo/__init__.py
ADDED
@@ -0,0 +1,11 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Kimodo: text-driven and constrained motion generation model."""
+
+from .model.load_model import AVAILABLE_MODELS, DEFAULT_MODEL, load_model
+
+__all__ = [
+    "AVAILABLE_MODELS",
+    "DEFAULT_MODEL",
+    "load_model",
+]
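The package `__init__` keeps the public surface to three names. A minimal usage sketch follows; the exact signature of `load_model` lives in kimodo/model/load_model.py, whose diff is not shown in this view, so the call below is an assumption:

import kimodo

print(kimodo.AVAILABLE_MODELS)  # registered model names (assumed to be an iterable of str)
# Assumed: load_model takes a model name and returns a ready-to-use model object.
model = kimodo.load_model(kimodo.DEFAULT_MODEL)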
kimodo/assets.py
ADDED
@@ -0,0 +1,19 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from pathlib import Path
+
+PACKAGE_ROOT = Path(__file__).resolve().parent
+ASSETS_ROOT = PACKAGE_ROOT / "assets"
+DEMO_ASSETS_ROOT = ASSETS_ROOT / "demo"
+DEMO_EXAMPLES_ROOT = DEMO_ASSETS_ROOT / "examples"
+SKELETONS_ROOT = ASSETS_ROOT / "skeletons"
+SOMA_ASSETS_ROOT = ASSETS_ROOT / "SOMA"
+
+
+def skeleton_asset_path(*parts: str) -> Path:
+    return SKELETONS_ROOT.joinpath(*parts)
+
+
+def demo_asset_path(*parts: str) -> Path:
+    return DEMO_ASSETS_ROOT.joinpath(*parts)
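Because every root is derived from the installed package location via `__file__`, asset lookups work regardless of the current working directory. A small sketch (the file names are hypothetical placeholders):

from kimodo.assets import DEMO_EXAMPLES_ROOT, demo_asset_path, skeleton_asset_path

skel_file = skeleton_asset_path("soma77.json")      # hypothetical file name
example = demo_asset_path("examples", "walk.npz")   # hypothetical file name
assert example == DEMO_EXAMPLES_ROOT / "walk.npz"   # both resolve under assets/demo/examples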
kimodo/constraints.py
ADDED
@@ -0,0 +1,625 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Constraint sets for conditioning motion generation (root 2D, full body, end-effectors)."""
+
+from typing import Optional, Union
+
+import torch
+from torch import Tensor
+
+from kimodo.motion_rep.feature_utils import compute_heading_angle
+from kimodo.skeleton import SkeletonBase, SOMASkeleton30, SOMASkeleton77
+from kimodo.tools import ensure_batched, load_json, save_json
+
+from .geometry import axis_angle_to_matrix, matrix_to_axis_angle
+
+
+def _convert_constraint_local_rots_to_skeleton(local_rot_mats: Tensor, skeleton: SkeletonBase) -> Tensor:
+    """Convert loaded local rotation matrices to match the skeleton's joint count.
+
+    Handles SOMA 30↔77: constraint files may have been saved with 30 or 77 joints while the session
+    skeleton (e.g. from the SOMA30 model) uses SOMASkeleton77.
+    """
+    n_joints = local_rot_mats.shape[-3]
+    skeleton_joints = skeleton.nbjoints
+    if n_joints == skeleton_joints:
+        return local_rot_mats
+    if n_joints == 77 and skeleton_joints == 30 and isinstance(skeleton, SOMASkeleton30):
+        return skeleton.from_SOMASkeleton77(local_rot_mats)
+    if n_joints == 30 and skeleton_joints == 77 and isinstance(skeleton, SOMASkeleton77):
+        skel30 = SOMASkeleton30()
+        return skel30.to_SOMASkeleton77(local_rot_mats)
+    raise ValueError(
+        f"Constraint joint count ({n_joints}) does not match skeleton joint count "
+        f"({skeleton_joints}). Only SOMA 30↔77 conversion is supported."
+    )
+
+
+def create_pairs(tensor_A: Tensor, tensor_B: Tensor) -> Tensor:
+    """Form all (a, b) pairs from two 1D tensors; output shape (len(A)*len(B), 2)."""
+    pairs = torch.stack(
+        (
+            tensor_A[:, None].expand(-1, len(tensor_B)),
+            tensor_B.expand(len(tensor_A), -1),
+        ),
+        dim=-1,
+    ).reshape(-1, 2)
+    return pairs
+
+
+def compute_global_heading(global_joints_positions: Tensor, skeleton: SkeletonBase) -> Tensor:
+    """Compute global root heading (cos, sin) from global joint positions using the skeleton."""
+    root_heading_angle = compute_heading_angle(global_joints_positions, skeleton)
+    global_root_heading = torch.stack([torch.cos(root_heading_angle), torch.sin(root_heading_angle)], dim=-1)
+    return global_root_heading
+
+
+def _tensor_to(
+    t: Tensor,
+    device: Optional[Union[str, torch.device]] = None,
+    dtype: Optional[torch.dtype] = None,
+) -> Tensor:
+    """Move tensor to device and/or dtype.
+
+    Returns the same tensor if neither is given.
+    """
+    if device is not None and dtype is not None:
+        return t.to(device=device, dtype=dtype)
+    if device is not None:
+        return t.to(device=device)
+    if dtype is not None:
+        return t.to(dtype=dtype)
+    return t
+
+
+class Root2DConstraintSet:
+    """Constraint set fixing root (x, z) trajectory and optionally global heading on given
+    frames."""
+
+    name = "root2d"
+
+    def __init__(
+        self,
+        skeleton: SkeletonBase,
+        frame_indices: Tensor,
+        smooth_root_2d: Tensor,
+        to_crop: bool = False,
+        global_root_heading: Optional[Tensor] = None,
+    ) -> None:
+        self.skeleton = skeleton
+
+        # if we pass the full smooth root 3D as input
+        if smooth_root_2d.shape[-1] == 3:
+            smooth_root_2d = smooth_root_2d[..., [0, 1]]
+
+        if to_crop:
+            smooth_root_2d = smooth_root_2d[frame_indices]
+            if global_root_heading is not None:
+                global_root_heading = global_root_heading[frame_indices]
+        else:
+            assert len(smooth_root_2d) == len(
+                frame_indices
+            ), "The number of smooth root 2d should match the number of frames"
+            if global_root_heading is not None:
+                assert len(global_root_heading) == len(
+                    frame_indices
+                ), "The number of global root heading should match the number of frames"
+
+        self.smooth_root_2d = smooth_root_2d
+        self.global_root_heading = global_root_heading
+        self.frame_indices = frame_indices
+
+    def update_constraints(self, data_dict: dict, index_dict: dict) -> None:
+        """Append this constraint's smooth_root_2d (and optional global_root_heading) to data/index
+        dicts."""
+        data_dict["smooth_root_2d"].append(self.smooth_root_2d)
+        index_dict["smooth_root_2d"].append(self.frame_indices)
+
+        if self.global_root_heading is not None:
+            # constrain the global heading
+            data_dict["global_root_heading"].append(self.global_root_heading)
+            index_dict["global_root_heading"].append(self.frame_indices)
+
+    def crop_move(self, start: int, end: int) -> "Root2DConstraintSet":
+        """Return a new constraint set for the cropped frame range [start, end)."""
+        mask = (self.frame_indices >= start) & (self.frame_indices < end)
+
+        if self.global_root_heading is not None:
+            masked_global_root_heading = self.global_root_heading[mask]
+        else:
+            masked_global_root_heading = None
+
+        return Root2DConstraintSet(
+            self.skeleton,
+            self.frame_indices[mask] - start,
+            self.smooth_root_2d[mask],
+            global_root_heading=masked_global_root_heading,
+        )
+
+    def get_save_info(self) -> dict:
+        """Return a dict suitable for JSON serialization (frame_indices, smooth_root_2d, optional
+        global_root_heading)."""
+        out = {
+            "type": self.name,
+            "frame_indices": self.frame_indices,
+            "smooth_root_2d": self.smooth_root_2d,
+        }
+        if self.global_root_heading is not None:
+            out["global_root_heading"] = self.global_root_heading
+        return out
+
+    def to(
+        self,
+        device: Optional[Union[str, torch.device]] = None,
+        dtype: Optional[torch.dtype] = None,
+    ) -> "Root2DConstraintSet":
+        self.smooth_root_2d = _tensor_to(self.smooth_root_2d, device, dtype)
+        self.frame_indices = _tensor_to(self.frame_indices, device, dtype)
+        if self.global_root_heading is not None:
+            self.global_root_heading = _tensor_to(self.global_root_heading, device, dtype)
+        if device is not None and hasattr(self.skeleton, "to"):
+            self.skeleton = self.skeleton.to(device)
+        return self
+
+    @classmethod
+    def from_dict(cls, skeleton: SkeletonBase, dico: dict) -> "Root2DConstraintSet":
+        """Build a Root2DConstraintSet from a dict (e.g. loaded from JSON)."""
+        device = skeleton.device if hasattr(skeleton, "device") else "cpu"
+
+        if "global_root_heading" in dico:
+            global_root_heading = torch.tensor(dico["global_root_heading"], device=device)
+        else:
+            global_root_heading = None
+
+        return cls(
+            skeleton,
+            frame_indices=torch.tensor(dico["frame_indices"]),
+            smooth_root_2d=torch.tensor(dico["smooth_root_2d"], device=device),
+            global_root_heading=global_root_heading,
+        )
+
+
+class FullBodyConstraintSet:
+    """Constraint set fixing full-body global positions and rotations on given keyframes."""
+
+    name = "fullbody"
+
+    def __init__(
+        self,
+        skeleton: SkeletonBase,
+        frame_indices: Tensor,
+        global_joints_positions: Tensor,
+        global_joints_rots: Tensor,
+        smooth_root_2d: Optional[Tensor] = None,
+        to_crop: bool = False,
+    ):
+        self.skeleton = skeleton
+        self.frame_indices = frame_indices
+
+        # if we pass the full smooth root 3D as input
+        if smooth_root_2d is not None and smooth_root_2d.shape[-1] == 3:
+            smooth_root_2d = smooth_root_2d[..., [0, 1]]
+
+        if to_crop:
+            global_joints_positions = global_joints_positions[frame_indices]
+            global_joints_rots = global_joints_rots[frame_indices]
+            if smooth_root_2d is not None:
+                smooth_root_2d = smooth_root_2d[frame_indices]
+        else:
+            assert len(global_joints_positions) == len(
+                frame_indices
+            ), "The number of global positions should match the number of frames"
+            assert len(global_joints_rots) == len(
+                frame_indices
+            ), "The number of global joint rotations should match the number of frames"
+
+            if smooth_root_2d is not None:
+                assert len(smooth_root_2d) == len(
+                    frame_indices
+                ), "The number of smooth root 2d (if specified) should match the number of frames"
+
+        if smooth_root_2d is None:
+            # substitute the smooth root 2d with the real root
+            smooth_root_2d = global_joints_positions[:, skeleton.root_idx, [0, 2]]
+
+        # root y: from smooth or pelvis is the same
+        self.root_y_pos = global_joints_positions[:, skeleton.root_idx, 1]
+
+        self.global_joints_positions = global_joints_positions
+        self.global_joints_rots = global_joints_rots
+        self.global_root_heading = compute_global_heading(global_joints_positions, skeleton)
+        self.smooth_root_2d = smooth_root_2d
+
+    def update_constraints(self, data_dict: dict, index_dict: dict) -> None:
+        """Append global positions, smooth root 2D, root y, and global heading to data/index
+        dicts."""
+        nbjoints = self.skeleton.nbjoints
+        indices_lst = create_pairs(
+            self.frame_indices,
+            torch.arange(nbjoints, device=self.frame_indices.device),
+        )
+        data_dict["global_joints_positions"].append(
+            self.global_joints_positions.reshape(-1, 3)
+        )  # flatten the global positions
+        index_dict["global_joints_positions"].append(indices_lst)
+
+        # global rotations are not used here
+
+        # as we use smooth root, also constrain the smooth root to get the same full body
+        # maybe keep storing the hips offset, if we smooth it ourselves
+        data_dict["smooth_root_2d"].append(self.smooth_root_2d)
+        index_dict["smooth_root_2d"].append(self.frame_indices)
+
+        # constrain the y pos of the root
+        data_dict["root_y_pos"].append(self.root_y_pos)
+        index_dict["root_y_pos"].append(self.frame_indices)
+
+        # constrain the global heading
+        data_dict["global_root_heading"].append(self.global_root_heading)
+        index_dict["global_root_heading"].append(self.frame_indices)
+
+    def crop_move(self, start: int, end: int) -> "FullBodyConstraintSet":
+        """Return a new FullBodyConstraintSet for the cropped frame range [start, end)."""
+        mask = (self.frame_indices >= start) & (self.frame_indices < end)
+        return FullBodyConstraintSet(
+            self.skeleton,
+            self.frame_indices[mask] - start,
+            self.global_joints_positions[mask],
+            self.global_joints_rots[mask],
+            self.smooth_root_2d[mask],
+        )
+
+    def get_save_info(self) -> dict:
+        """Return a dict for JSON save: type, frame_indices, local_joints_rot, root_positions, smooth_root_2d."""
+        local_joints_rot = self.skeleton.global_rots_to_local_rots(self.global_joints_rots)
+        if isinstance(self.skeleton, SOMASkeleton30):
+            local_joints_rot = self.skeleton.to_SOMASkeleton77(local_joints_rot)
+        local_joints_rot = matrix_to_axis_angle(local_joints_rot)
+
+        root_positions = self.global_joints_positions[:, self.skeleton.root_idx]
+        return {
+            "type": self.name,
+            "frame_indices": self.frame_indices,
+            "local_joints_rot": local_joints_rot,
+            "root_positions": root_positions,
+            "smooth_root_2d": self.smooth_root_2d,
+        }
+
+    def to(
+        self,
+        device: Optional[Union[str, torch.device]] = None,
+        dtype: Optional[torch.dtype] = None,
+    ) -> "FullBodyConstraintSet":
+        self.frame_indices = _tensor_to(self.frame_indices, device, dtype)
+        self.global_joints_positions = _tensor_to(self.global_joints_positions, device, dtype)
+        self.global_joints_rots = _tensor_to(self.global_joints_rots, device, dtype)
+        self.root_y_pos = _tensor_to(self.root_y_pos, device, dtype)
+        self.global_root_heading = _tensor_to(self.global_root_heading, device, dtype)
+        self.smooth_root_2d = _tensor_to(self.smooth_root_2d, device, dtype)
+        if device is not None and hasattr(self.skeleton, "to"):
+            self.skeleton = self.skeleton.to(device)
+        return self
+
+    @classmethod
+    def from_dict(cls, skeleton: SkeletonBase, dico: dict) -> "FullBodyConstraintSet":
+        """Build a FullBodyConstraintSet from a dict (e.g. loaded from JSON)."""
+        frame_indices = torch.tensor(dico["frame_indices"])
+        device = skeleton.device if hasattr(skeleton, "device") else "cpu"
+        local_rot = torch.tensor(dico["local_joints_rot"], device=device)
+        local_rot_mats = axis_angle_to_matrix(local_rot)
+        local_rot_mats = _convert_constraint_local_rots_to_skeleton(local_rot_mats, skeleton)
+        global_joints_rots, global_joints_positions, _ = skeleton.fk(
+            local_rot_mats,
+            torch.tensor(dico["root_positions"], device=device),
+        )
+        smooth_root_2d = None
+        if "smooth_root_2d" in dico:
+            smooth_root_2d = torch.tensor(dico["smooth_root_2d"], device=device)
+
+        return cls(
+            skeleton,
+            frame_indices=frame_indices,
+            global_joints_positions=global_joints_positions,
+            global_joints_rots=global_joints_rots,
+            smooth_root_2d=smooth_root_2d,
+        )
+
+
+class EndEffectorConstraintSet:
+    """Constraint set fixing selected end-effector positions and rotations on given frames."""
+
+    name = "end-effector"
+
+    def __init__(
+        self,
+        skeleton: SkeletonBase,
+        frame_indices: Tensor,
+        global_joints_positions: Tensor,
+        global_joints_rots: Tensor,
+        smooth_root_2d: Optional[Tensor],
+        *,
+        joint_names: list[str],
+        to_crop: bool = False,
+    ) -> None:
+        self.skeleton = skeleton
+        self.frame_indices = frame_indices
+        self.joint_names = joint_names
+
+        # joint_names are constant for all the frames
+        rot_joint_names, pos_joint_names = self.skeleton.expand_joint_names(self.joint_names)
+        # indexing works for motion_rep with smooth root only (contains pelvis index)
+        self.pos_indices = torch.tensor([self.skeleton.bone_index[jname] for jname in pos_joint_names])
+        self.rot_indices = torch.tensor([self.skeleton.bone_index[jname] for jname in rot_joint_names])
+
+        # if we pass the full smooth root 3D as input
+        if smooth_root_2d is not None and smooth_root_2d.shape[-1] == 3:
+            smooth_root_2d = smooth_root_2d[..., [0, 1]]
+
+        if to_crop:
+            global_joints_positions = global_joints_positions[frame_indices]
+            global_joints_rots = global_joints_rots[frame_indices]
+            if smooth_root_2d is not None:
+                smooth_root_2d = smooth_root_2d[frame_indices]
+        else:
+            assert len(global_joints_positions) == len(
+                frame_indices
+            ), "The number of global positions should match the number of frames"
+            assert len(global_joints_rots) == len(
+                frame_indices
+            ), "The number of global joint rotations should match the number of frames"
+            if smooth_root_2d is not None:
+                assert len(smooth_root_2d) == len(
+                    frame_indices
+                ), "The number of smooth root 2d (if specified) should match the number of frames"
+
+        if smooth_root_2d is None:
+            # substitute the smooth root 2d with the real root
+            smooth_root_2d = global_joints_positions[:, skeleton.root_idx, [0, 2]]
+
+        # root y: from smooth or pelvis is the same
+        self.root_y_pos = global_joints_positions[:, skeleton.root_idx, 1]
+
+        self.global_joints_positions = global_joints_positions
+        self.global_root_heading = compute_global_heading(global_joints_positions, skeleton)
+        self.global_joints_rots = global_joints_rots
+        self.smooth_root_2d = smooth_root_2d
+
+    def update_constraints(self, data_dict: dict, index_dict: dict) -> None:
+        """Append constrained joint positions/rots, smooth root 2D, root y, and heading to
+        data/index dicts."""
+        crop_frames_indexing = torch.arange(len(self.frame_indices), device=self.frame_indices.device)
+
+        # constrain positions
+        pos_indices_real = create_pairs(
+            self.frame_indices,
+            self.pos_indices,
+        )
+        pos_indices_crop = create_pairs(
+            crop_frames_indexing,
+            self.pos_indices,
+        )
+        data_dict["global_joints_positions"].append(self.global_joints_positions[tuple(pos_indices_crop.T)])
+        index_dict["global_joints_positions"].append(pos_indices_real)
+
+        # constrain rotations
+        rot_indices_real = create_pairs(
+            self.frame_indices,
+            self.rot_indices,
+        )
+        rot_indices_crop = create_pairs(
+            crop_frames_indexing,
+            self.rot_indices,
+        )
+        data_dict["global_joints_rots"].append(self.global_joints_rots[tuple(rot_indices_crop.T)])
+        index_dict["global_joints_rots"].append(rot_indices_real)
+
+        # as we use smooth root, also constrain the smooth root to get the same full body
+        # maybe keep storing the hips offset, if we smooth it ourselves
+        data_dict["smooth_root_2d"].append(self.smooth_root_2d)
+        index_dict["smooth_root_2d"].append(self.frame_indices)
+
+        # constrain the y pos of the root
+        data_dict["root_y_pos"].append(self.root_y_pos)
+        index_dict["root_y_pos"].append(self.frame_indices)
+
+        # constrain the global heading
+        data_dict["global_root_heading"].append(self.global_root_heading)
+        index_dict["global_root_heading"].append(self.frame_indices)
+
+    def crop_move(self, start: int, end: int) -> "EndEffectorConstraintSet":
+        """Return a new EndEffectorConstraintSet for the cropped frame range [start, end)."""
+        mask = (self.frame_indices >= start) & (self.frame_indices < end)
+
+        cls = type(self)
+        kwargs = {}
+        if not hasattr(cls, "joint_names"):
+            kwargs["joint_names"] = self.joint_names
+
+        return cls(
+            self.skeleton,
+            self.frame_indices[mask] - start,
+            self.global_joints_positions[mask],
+            self.global_joints_rots[mask],
+            self.smooth_root_2d[mask],
+            **kwargs,
+        )
+
+    def get_save_info(self) -> dict:
+        """Return a dict for JSON save: type, frame_indices, local_joints_rot, root_positions, smooth_root_2d, joint_names."""
+        local_joints_rot = self.skeleton.global_rots_to_local_rots(self.global_joints_rots)
+        if isinstance(self.skeleton, SOMASkeleton30):
+            local_joints_rot = self.skeleton.to_SOMASkeleton77(local_joints_rot)
+        local_joints_rot = matrix_to_axis_angle(local_joints_rot)
+
+        root_positions = self.global_joints_positions[:, self.skeleton.root_idx]
+        output = {
+            "type": self.name,
+            "frame_indices": self.frame_indices,
+            "local_joints_rot": local_joints_rot,
+            "root_positions": root_positions,
+            "smooth_root_2d": self.smooth_root_2d,
+        }
+        if not hasattr(self.__class__, "joint_names"):
+            # save the joint_names for this base class
+            # but not for children
+            output["joint_names"] = self.joint_names
+        return output
+
+    def to(
+        self,
+        device: Optional[Union[str, torch.device]] = None,
+        dtype: Optional[torch.dtype] = None,
+    ) -> "EndEffectorConstraintSet":
+        self.frame_indices = _tensor_to(self.frame_indices, device, dtype)
+        self.pos_indices = _tensor_to(self.pos_indices, device, dtype)
+        self.rot_indices = _tensor_to(self.rot_indices, device, dtype)
+        self.root_y_pos = _tensor_to(self.root_y_pos, device, dtype)
+        self.global_joints_positions = _tensor_to(self.global_joints_positions, device, dtype)
+        self.global_root_heading = _tensor_to(self.global_root_heading, device, dtype)
+        self.global_joints_rots = _tensor_to(self.global_joints_rots, device, dtype)
+        self.smooth_root_2d = _tensor_to(self.smooth_root_2d, device, dtype)
+        if device is not None and hasattr(self.skeleton, "to"):
+            self.skeleton = self.skeleton.to(device)
+        return self
+
+    @classmethod
+    def from_dict(cls, skeleton: SkeletonBase, dico: dict) -> "EndEffectorConstraintSet":
+        """Build an EndEffectorConstraintSet from a dict (e.g. loaded from JSON)."""
+        frame_indices = torch.tensor(dico["frame_indices"])
+        device = skeleton.device if hasattr(skeleton, "device") else "cpu"
+        local_rot = torch.tensor(dico["local_joints_rot"], device=device)
+        local_rot_mats = axis_angle_to_matrix(local_rot)
+        local_rot_mats = _convert_constraint_local_rots_to_skeleton(local_rot_mats, skeleton)
+        global_joints_rots, global_joints_positions, _ = skeleton.fk(
+            local_rot_mats,
+            torch.tensor(dico["root_positions"], device=device),
+        )
+        smooth_root_2d = None
+        if "smooth_root_2d" in dico:
+            smooth_root_2d = torch.tensor(dico["smooth_root_2d"], device=device)
+
+        kwargs = {}
+        if not hasattr(cls, "joint_names"):
+            kwargs["joint_names"] = dico["joint_names"]
+
+        return cls(
+            skeleton,
+            frame_indices=frame_indices,
+            global_joints_positions=global_joints_positions,
+            global_joints_rots=global_joints_rots,
+            smooth_root_2d=smooth_root_2d,
+            **kwargs,
+        )
+
+
+class LeftHandConstraintSet(EndEffectorConstraintSet):
+    """End-effector constraint for the left hand only."""
+
+    name = "left-hand"
+    joint_names: list[str] = ["LeftHand"]
+
+    def __init__(self, *args, **kwargs: dict):
+        super().__init__(*args, joint_names=self.joint_names, **kwargs)
+
+
+class RightHandConstraintSet(EndEffectorConstraintSet):
+    """End-effector constraint for the right hand only."""
+
+    name = "right-hand"
+    joint_names: list[str] = ["RightHand"]
+
+    def __init__(self, *args, **kwargs: dict):
+        super().__init__(*args, joint_names=self.joint_names, **kwargs)
+
+
+class LeftFootConstraintSet(EndEffectorConstraintSet):
+    """End-effector constraint for the left foot only."""
+
+    name = "left-foot"
+    joint_names: list[str] = ["LeftFoot"]
+
+    def __init__(self, *args, **kwargs: dict):
+        super().__init__(*args, joint_names=self.joint_names, **kwargs)
+
+
+class RightFootConstraintSet(EndEffectorConstraintSet):
+    """End-effector constraint for the right foot only."""
+
+    name = "right-foot"
+    joint_names: list[str] = ["RightFoot"]
+
+    def __init__(self, *args, **kwargs: dict):
+        super().__init__(*args, joint_names=self.joint_names, **kwargs)
+
+
+TYPE_TO_CLASS = {
+    "root2d": Root2DConstraintSet,
+    "fullbody": FullBodyConstraintSet,
+    "left-hand": LeftHandConstraintSet,
+    "right-hand": RightHandConstraintSet,
+    "left-foot": LeftFootConstraintSet,
+    "right-foot": RightFootConstraintSet,
+    "end-effector": EndEffectorConstraintSet,
+}
+
+
+def load_constraints_lst(
+    path_or_data: str | list,
+    skeleton: SkeletonBase,
+    device: Optional[Union[str, torch.device]] = None,
+    dtype: Optional[torch.dtype] = None,
+):
+    """Load a list of constraints from a JSON path or a list of dicts.
+
+    Args:
+        path_or_data: Path to constraints.json or list of constraint dicts.
+        skeleton: Skeleton instance (used for from_dict).
+        device: If set, move all constraint tensors and skeleton to this device.
+        dtype: If set, cast constraint tensors to this dtype.
+    """
+    if isinstance(path_or_data, str):
+        saved = load_json(path_or_data)
+    else:
+        saved = path_or_data
+
+    constraints_lst = []
+    for el in saved:
+        cls = TYPE_TO_CLASS[el["type"]]
+        c = cls.from_dict(skeleton, el)
+        if device is not None or dtype is not None:
+            c.to(device=device, dtype=dtype)
+        constraints_lst.append(c)
+    return constraints_lst
+
+
+def save_constraints_lst(path: str, constraints_lst: list) -> list | None:
+    """Save a list of constraint sets to a JSON file.
+
+    Returns None if the list is empty.
+    """
+    if not constraints_lst:
+        print("The constraints list is empty; skipping save.")
+        return
+
+    to_save = []
+
+    def tensor_to_list(obj):
+        """Recursively convert tensors to lists for JSON serialization."""
+        if isinstance(obj, Tensor):
+            return obj.cpu().tolist()
+        elif isinstance(obj, dict):
+            return {k: tensor_to_list(v) for k, v in obj.items()}
+        elif isinstance(obj, list):
+            return [tensor_to_list(v) for v in obj]
+        else:
+            return obj
+
+    for constraint in constraints_lst:
+        constraint_info = constraint.get_save_info()
+        # Convert all tensors to lists for JSON serialization
+        constraint_info = tensor_to_list(constraint_info)
+        to_save.append(constraint_info)
+
+    save_json(path, to_save)
+    print(f"Saved constraints to {path}")
+    return to_save
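Putting the module together: a constraint set is built against a skeleton, contributes its tensors via update_constraints, and round-trips through JSON with save_constraints_lst / load_constraints_lst, where the "type" tag selects the class from TYPE_TO_CLASS on reload. A minimal sketch, assuming SOMASkeleton30() can be constructed with no arguments as it is in _convert_constraint_local_rots_to_skeleton above:

import torch

from kimodo.constraints import Root2DConstraintSet, load_constraints_lst, save_constraints_lst
from kimodo.skeleton import SOMASkeleton30

skeleton = SOMASkeleton30()  # assumed no-arg construction, as used in the module above

# Pin the root's 2D trajectory on frames 0, 30, and 60.
frame_indices = torch.tensor([0, 30, 60])
smooth_root_2d = torch.tensor([[0.0, 0.0], [0.5, 0.0], [1.0, 0.5]])
constraint = Root2DConstraintSet(skeleton, frame_indices, smooth_root_2d)

save_constraints_lst("constraints.json", [constraint])  # tensors become JSON lists
restored = load_constraints_lst("constraints.json", skeleton, device="cpu")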
kimodo/exports/__init__.py
ADDED
@@ -0,0 +1,65 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Export utilities: MuJoCo, BVH, SMPLX/AMASS, and motion I/O helpers."""
+
+from .bvh import bvh_to_kimodo_motion, motion_to_bvh_bytes, read_bvh_frame_time_seconds, save_motion_bvh
+from .motion_convert_lib import convert_motion_files
+from .motion_formats import (
+    infer_npz_kind,
+    infer_source_format_from_path,
+    infer_target_format_from_path,
+    resolve_source_fps,
+)
+from .motion_io import (
+    KIMODO_CONVERT_TARGET_FPS,
+    amass_npz_to_bytes,
+    complete_motion_dict,
+    g1_csv_to_bytes,
+    kimodo_npz_to_bytes,
+    load_amass_npz,
+    load_g1_csv,
+    load_kimodo_npz,
+    load_kimodo_npz_as_torch,
+    load_motion_file,
+    motion_dict_to_numpy,
+    save_kimodo_npz,
+    save_kimodo_npz_at_target_fps,
+)
+from .mujoco import MujocoQposConverter, apply_g1_real_robot_projection
+from .smplx import (
+    AMASSConverter,
+    amass_npz_to_kimodo_motion,
+    get_amass_parameters,
+    kimodo_y_up_to_amass_coord_rotation_matrix,
+)
+
+__all__ = [
+    "AMASSConverter",
+    "KIMODO_CONVERT_TARGET_FPS",
+    "MujocoQposConverter",
+    "amass_npz_to_bytes",
+    "amass_npz_to_kimodo_motion",
+    "apply_g1_real_robot_projection",
+    "bvh_to_kimodo_motion",
+    "complete_motion_dict",
+    "convert_motion_files",
+    "g1_csv_to_bytes",
+    "get_amass_parameters",
+    "infer_npz_kind",
+    "infer_source_format_from_path",
+    "infer_target_format_from_path",
+    "kimodo_npz_to_bytes",
+    "kimodo_y_up_to_amass_coord_rotation_matrix",
+    "load_amass_npz",
+    "load_g1_csv",
+    "load_kimodo_npz",
+    "load_kimodo_npz_as_torch",
+    "load_motion_file",
+    "motion_dict_to_numpy",
+    "motion_to_bvh_bytes",
+    "read_bvh_frame_time_seconds",
+    "resolve_source_fps",
+    "save_kimodo_npz",
+    "save_kimodo_npz_at_target_fps",
+    "save_motion_bvh",
+]
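The flat re-export lets downstream code stay independent of the submodule layout:

# Equivalent; the second form does not depend on where convert_motion_files is defined.
from kimodo.exports.motion_convert_lib import convert_motion_files
from kimodo.exports import convert_motion_files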
kimodo/exports/bvh.py
ADDED
@@ -0,0 +1,282 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Export utilities for converting internal motion representations into common file formats.
+
+This module is intended to hold lightweight serialization / export helpers that can be reused
+outside of interactive demos.
+"""
+
+import os
+import tempfile
+from pathlib import Path
+from typing import Tuple, Union
+
+import numpy as np
+import torch
+
+from kimodo.geometry import matrix_to_quaternion as _matrix_to_quaternion
+
+
+def _strip_end_site_blocks(bvh_text: str) -> str:
+    """Remove all 'End Site { ... }' blocks from BVH text so the output matches the original format.
+
+    bvhio adds an End Site for every leaf joint when writing; we do not set EndSite on joints, so we
+    post-process the string to remove these blocks for Blender/original compatibility.
+    """
+    lines = bvh_text.splitlines(keepends=True)
+    result = []
+    i = 0
+    while i < len(lines):
+        line = lines[i]
+        if "End Site" in line:
+            # Skip this line and the following block { ... }; brace-count to find closing }
+            i += 1
+            if i < len(lines) and "{" in lines[i]:
+                i += 1
+                depth = 1
+                while i < len(lines) and depth > 0:
+                    if "{" in lines[i]:
+                        depth += 1
+                    if "}" in lines[i]:
+                        depth -= 1
+                    i += 1
+            continue
+        result.append(line)
+        i += 1
+    return "".join(result)
+
+
+def _coerce_batch(name: str, x: torch.Tensor, *, expected_ndim: int) -> torch.Tensor:
+    """Coerce (T, ...) or (1, T, ...) into (T, ...)."""
+    if x.ndim == expected_ndim:
+        return x
+    if x.ndim == expected_ndim + 1:
+        if int(x.shape[0]) != 1:
+            raise ValueError(
+                f"{name} has batch dimension B={int(x.shape[0])}, but BVH export " "only supports a single clip (B==1)."
+            )
+        return x[0]
+    raise ValueError(f"{name} must have shape (T, ...) or (1, T, ...); got {tuple(x.shape)}")
+
+
+def motion_to_bvh(
+    local_rot_mats: torch.Tensor,
+    root_positions: torch.Tensor,
+    *,
+    skeleton,
+    fps: float,
+) -> str:
+    """Convert local rotations and root positions to BVH format; return UTF-8 string.
+
+    Args:
+        local_rot_mats: (T, J, 3, 3) or (1, T, J, 3, 3) local rotation matrices.
+        root_positions: (T, 3) or (1, T, 3) root joint positions (e.g. from posed joints).
+        skeleton: Skeleton with bone_order_names, bvh_neutral_joints, etc.
+        fps: Frames per second for the motion.
+
+    Notes:
+        BVH is plain-text. Root is named "Root" with ZYX rotation order; leaf joints
+        have no End Site block.
+    """
+    try:
+        import bvhio  # type: ignore[import-not-found]
+        import glm  # type: ignore[import-not-found]
+        from SpatialTransform import Pose  # type: ignore[import-not-found]
+    except Exception as e:  # pragma: no cover
+        raise ImportError(
+            "BVH export requires `bvhio` (and its deps `PyGLM` + `SpatialTransform`). "
+            "Install with: `pip install bvhio`."
+        ) from e
+
+    local_rot_mats = local_rot_mats.detach()
+    root_positions = root_positions.detach()
+    # SOMA: accept either somaskel30 (convert to 77) or somaskel77 (use as-is)
+    if skeleton.name == "somaskel30":
+        local_rot_mats = skeleton.to_SOMASkeleton77(local_rot_mats)
+        skeleton = skeleton.somaskel77
+
+    local_rot_mats, _ = skeleton.from_standard_tpose(local_rot_mats)
+
+    neutral = skeleton.bvh_neutral_joints.detach().cpu().numpy()
+    joint_names = list(skeleton.bone_order_names)
+    parents = skeleton.joint_parents.detach().cpu().numpy().astype(int)
+    root_idx = int(skeleton.root_idx)
+
+    local_rot_mats = _coerce_batch("local_rot_mats", local_rot_mats, expected_ndim=4)
+    T, J = local_rot_mats.shape[:2]
+    q_wxyz = _matrix_to_quaternion(local_rot_mats).detach().cpu().numpy()  # [T, J, 4]
+
+    root_xyz = _coerce_batch("root_positions", root_positions, expected_ndim=2)
+    root_xyz = root_xyz.cpu().numpy()  # [T, 3]
+
+    # Build BVH hierarchy: Root (wrapper at origin) -> Hips (pelvis with offset in meters) -> ...
+    # Offsets are in meters to match the original format.
+    children: dict[int, list[int]] = {i: [] for i in range(J)}
+    for i, p in enumerate(parents):
+        if p >= 0:
+            children[int(p)].append(int(i))
+
+    _ROOT_CHANNELS = [
+        "Xposition",
+        "Yposition",
+        "Zposition",
+        "Zrotation",
+        "Yrotation",
+        "Xrotation",
+    ]
+    _JOINT_CHANNELS = ["Zrotation", "Yrotation", "Xrotation"]
+
+    # Scale from meters to centimeters (match original BVH scale).
+    neutral = neutral * 100
+    root_xyz = root_xyz * 100
+
+    # Hips offset from Root: use skeleton neutral; if root is at origin (zeros), use a
+    # nominal pelvis height so the hierarchy is non-degenerate in Blender.
+    hips_offset = neutral[root_idx]
+    if (hips_offset == 0).all():
+        hips_offset = np.array([0.0, 100.0, 0.0], dtype=neutral.dtype)  # 1 m in cm
+
+    def _make_joint(i: int) -> "bvhio.BvhJoint":
+        name = joint_names[i]
+        j = bvhio.BvhJoint(name, offset=glm.vec3(0, 0, 0))
+        if i == root_idx:
+            # Hips: offset from Root (origin) in cm
+            off = hips_offset
+            j.Offset = glm.vec3(float(off[0]), float(off[1]), float(off[2]))
+            j.Channels = _ROOT_CHANNELS.copy()
+        else:
+            p = int(parents[i])
+            off = neutral[i] - neutral[p]
+            j.Offset = glm.vec3(float(off[0]), float(off[1]), float(off[2]))
+            j.Channels = _JOINT_CHANNELS.copy()
+
+        for c in children[i]:
+            j.Children.append(_make_joint(c))
+        return j
+
+    # Wrapper Root at origin; single child is Hips (skeleton root).
+    root_wrapper = bvhio.BvhJoint("Root", offset=glm.vec3(0.0, 0.0, 0.0))
+    root_wrapper.Channels = _ROOT_CHANNELS.copy()
+    root_wrapper.Children.append(_make_joint(root_idx))
+    root_joint = root_wrapper
+
+    # Populate keyframes: Root = identity/zero, Hips = root motion, others = local rotation.
+    bvh_layout = root_joint.layout()
+    name_to_id = {n: idx for idx, n in enumerate(joint_names)}
+    ordered_joint_ids = []
+    for bj, _, _ in bvh_layout:
+        if bj.Name == "Root":
+            ordered_joint_ids.append(None)
+        else:
+            ordered_joint_ids.append(name_to_id[bj.Name])
+
+    bvh_joints = [bj for bj, _, _ in bvh_layout]
+    for bj in bvh_joints:
+        bj.Keyframes = [None] * T  # type: ignore[list-item]
+
+    identity_quat = glm.quat(1.0, 0.0, 0.0, 0.0)
+    zero_vec = glm.vec3(0.0, 0.0, 0.0)
+    for t in range(T):
+        for bj, jid in zip(bvh_joints, ordered_joint_ids):
+            if jid is None:
+                position = zero_vec
+                rotation = identity_quat
+            elif jid == root_idx:
+                pos = root_xyz[t]
+                position = glm.vec3(float(pos[0]), float(pos[1]), float(pos[2]))
+                qw, qx, qy, qz = q_wxyz[t, jid]
+                rotation = glm.quat(float(qw), float(qx), float(qy), float(qz))
+            else:
+                position = zero_vec
+                qw, qx, qy, qz = q_wxyz[t, jid]
+                rotation = glm.quat(float(qw), float(qx), float(qy), float(qz))
+            bj.Keyframes[t] = Pose(position, rotation)  # type: ignore[index]
+
+    container = bvhio.BvhContainer(root_joint, frameCount=T, frameTime=1.0 / float(fps))
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".bvh", delete=False, encoding="utf-8") as f:
+        tmp_path = f.name
+    try:
+        bvhio.writeBvh(tmp_path, container, percision=6)
+        bvh_text = Path(tmp_path).read_text(encoding="utf-8")
+        return _strip_end_site_blocks(bvh_text)
+    finally:
+        try:
+            os.remove(tmp_path)
+        except Exception:
+            pass
+
+
+def motion_to_bvh_bytes(
+    local_rot_mats: torch.Tensor,
+    root_positions: torch.Tensor,
+    *,
+    skeleton,
+    fps: float,
+) -> bytes:
+    """Convert local rotations and root positions to BVH bytes (UTF-8).
+
+    Convenience wrapper around :func:`motion_to_bvh`.
+    """
+    return motion_to_bvh(local_rot_mats, root_positions, skeleton=skeleton, fps=fps).encode("utf-8")
+
+
+def save_motion_bvh(
+    path: Union[str, Path],
+    local_rot_mats: torch.Tensor,
+    root_positions: torch.Tensor,
+    *,
+    skeleton,
+    fps: float,
+) -> None:
+    """Write local rotations and root positions to a BVH file at the given path."""
+    Path(path).write_text(
+        motion_to_bvh(local_rot_mats, root_positions, skeleton=skeleton, fps=fps),
+        encoding="utf-8",
+    )
+
+
+def read_bvh_frame_time_seconds(path: Union[str, Path]) -> float:
+    """Read ``Frame Time`` from a BVH file (seconds per frame)."""
+    with open(path, encoding="utf-8") as f:
+        for line in f:
+            if "Frame Time:" in line:
+                parts = line.split()
+                return float(parts[-1])
+    raise ValueError(f"Could not find 'Frame Time:' in {path}")
+
+
+def bvh_to_kimodo_motion(
+    path: Union[str, Path],
+    skeleton=None,
+) -> Tuple:
+    """Load a Kimodo-style SOMA BVH into a Kimodo motion dict.
+
+    Expects the same hierarchy as :func:`save_motion_bvh` (``Root`` wrapper + SOMA77 joints).
+    The frame rate is always read from the BVH ``Frame Time`` header. Callers
+    that need a different playback rate should resample the returned motion dict
+    (see :func:`~kimodo.exports.motion_io.resample_motion_dict_to_kimodo_fps`).
+
+    Returns:
+        ``(motion_dict, source_fps)`` where ``source_fps`` is the native BVH
+        frame rate read from the file header.
+    """
+    from kimodo.exports.motion_io import complete_motion_dict
+    from kimodo.skeleton.bvh import parse_bvh_motion
+    from kimodo.skeleton.registry import build_skeleton
+
+    if skeleton is None:
+        skeleton = build_skeleton(77)
+    device = skeleton.neutral_joints.device
+
+    local_rot_mats, root_trans, bvh_fps = parse_bvh_motion(str(path))
+    local_rot_mats = local_rot_mats.to(device=device)
+    root_trans = root_trans.to(device=device)
+
+    if int(local_rot_mats.shape[1]) != int(skeleton.nbjoints):
+        raise ValueError(
+            f"BVH has {local_rot_mats.shape[1]} joints but skeleton has {skeleton.nbjoints}; "
+            "use a Kimodo-exported SOMA BVH or matching skeleton."
+        )
+    local_rot_mats, _ = skeleton.to_standard_tpose(local_rot_mats)
+
+    return complete_motion_dict(local_rot_mats, root_trans, skeleton, float(bvh_fps)), bvh_fps
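A round trip through these helpers, sketched under the module's own assumptions (build_skeleton(77), skeleton.nbjoints, and the tensor shapes all come from the code above; the optional bvhio dependency must be installed):

import torch

from kimodo.exports.bvh import bvh_to_kimodo_motion, read_bvh_frame_time_seconds, save_motion_bvh
from kimodo.skeleton.registry import build_skeleton

skeleton = build_skeleton(77)
T, J = 120, int(skeleton.nbjoints)
local_rot_mats = torch.eye(3).expand(T, J, 3, 3).clone()  # identity local rotations, (T, J, 3, 3)
root_positions = torch.zeros(T, 3)                        # static root, (T, 3)

save_motion_bvh("clip.bvh", local_rot_mats, root_positions, skeleton=skeleton, fps=30.0)
# Frame Time is written at fixed precision, so compare with a loose tolerance.
assert abs(read_bvh_frame_time_seconds("clip.bvh") - 1.0 / 30.0) < 1e-4
motion_dict, source_fps = bvh_to_kimodo_motion("clip.bvh", skeleton=skeleton)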
kimodo/exports/motion_convert_lib.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Library API for converting between Kimodo NPZ, AMASS NPZ, SOMA BVH, and G1 MuJoCo CSV."""

from __future__ import annotations

import warnings

import numpy as np

from kimodo.exports.bvh import bvh_to_kimodo_motion, save_motion_bvh
from kimodo.exports.motion_formats import (
    infer_source_format_from_path,
    infer_target_format_from_path,
    resolve_source_fps,
)
from kimodo.exports.motion_io import (
    load_amass_npz,
    load_g1_csv,
    load_kimodo_npz_as_torch,
    save_kimodo_npz_at_target_fps,
)
from kimodo.exports.mujoco import MujocoQposConverter
from kimodo.exports.smplx import AMASSConverter
from kimodo.skeleton.registry import build_skeleton


def convert_motion_files(
    input_path: str,
    output_path: str,
    *,
    from_fmt: str | None = None,
    to_fmt: str | None = None,
    source_fps: float | None = None,
    z_up: bool = True,
    mujoco_rest_zero: bool = False,
) -> None:
    """Convert a motion file between Kimodo-supported formats.

    Supported pairs (hub-and-spoke through Kimodo NPZ):

    - amass <-> kimodo
    - soma-bvh <-> kimodo
    - g1-csv <-> kimodo

    Args:
        input_path: Source file (``.npz``, ``.bvh``, or ``.csv``).
        output_path: Destination file.
        from_fmt: Source format; inferred from extension/contents when ``None``.
        to_fmt: Target format; inferred from extension when ``None``.
        source_fps: Source motion frame rate (Hz). If provided, trusted as-is.
            If ``None``, auto-detected from BVH ``Frame Time``, AMASS
            ``mocap_frame_rate``, or default 30.
        z_up: For AMASS conversions, apply the Z-up <-> Kimodo Y-up transform.
        mujoco_rest_zero: For G1 CSV, joint angles relative to MuJoCo rest pose.
    """
    from_fmt = from_fmt or infer_source_format_from_path(input_path)
    to_fmt = to_fmt or infer_target_format_from_path(output_path, from_fmt)

    _validate_output_extension(to_fmt, output_path)

    pair = (from_fmt, to_fmt)

    if pair == ("amass", "kimodo"):
        sk = build_skeleton(22)
        effective_source = source_fps
        if effective_source is None:
            with np.load(input_path, allow_pickle=True) as z:
                effective_source = float(z["mocap_frame_rate"]) if "mocap_frame_rate" in z.files else 30.0
        motion = load_amass_npz(input_path, source_fps=effective_source, z_up=z_up)
        save_kimodo_npz_at_target_fps(motion, sk, effective_source, output_path)
        return

    if pair == ("kimodo", "amass"):
        data, J = load_kimodo_npz_as_torch(input_path, ensure_complete=False)
        if J != 22:
            raise ValueError(f"Kimodo→AMASS requires 22 joints (SMPL-X); this file has J={J}.")
        sk = build_skeleton(22)
        effective_source = resolve_source_fps(source_fps, "kimodo", input_path, None)
        converter = AMASSConverter(fps=effective_source, skeleton=sk)
        converter.convert_save_npz(data, output_path, z_up=z_up)
        return

    if pair == ("soma-bvh", "kimodo"):
        sk = build_skeleton(77)
        motion, bvh_fps = bvh_to_kimodo_motion(input_path, skeleton=sk)
        effective_source = source_fps if source_fps is not None else bvh_fps
        save_kimodo_npz_at_target_fps(motion, sk, effective_source, output_path)
        return

    if pair == ("kimodo", "soma-bvh"):
        data, J = load_kimodo_npz_as_torch(input_path, ensure_complete=False)
        if J == 30:
            warnings.warn(
                "Input has 30 joints (somaskel30); expanding to somaskel77 for BVH export.",
                UserWarning,
                stacklevel=2,
            )
            sk = build_skeleton(30)
        elif J == 77:
            sk = build_skeleton(77)
        else:
            raise ValueError(f"Kimodo→BVH requires a SOMA skeleton (30 or 77 joints); this file has J={J}.")
        effective_source = resolve_source_fps(source_fps, "kimodo", input_path, None)
        save_motion_bvh(
            output_path,
            data["local_rot_mats"],
            data["root_positions"],
            skeleton=sk,
            fps=effective_source,
        )
        return

    if pair == ("g1-csv", "kimodo"):
        sk = build_skeleton(34)
        effective_source = resolve_source_fps(source_fps, "g1-csv", input_path, None)
        motion = load_g1_csv(input_path, source_fps=effective_source, mujoco_rest_zero=mujoco_rest_zero)
        save_kimodo_npz_at_target_fps(motion, sk, effective_source, output_path)
        return

    if pair == ("kimodo", "g1-csv"):
        data, J = load_kimodo_npz_as_torch(input_path, ensure_complete=False)
        if J != 34:
            raise ValueError(f"Kimodo→CSV requires G1 with 34 joints; this file has J={J}.")
        sk = build_skeleton(34)
        effective_source = resolve_source_fps(source_fps, "kimodo", input_path, None)
        converter = MujocoQposConverter(sk)
        qpos = converter.dict_to_qpos(
            {k: v for k, v in data.items() if k in ("local_rot_mats", "root_positions")},
            device=str(sk.neutral_joints.device),
            numpy=True,
            mujoco_rest_zero=mujoco_rest_zero,
        )
        converter.save_csv(qpos, output_path)
        return

    raise ValueError(
        f"Unsupported conversion {from_fmt!r} → {to_fmt!r}. "
        "Supported: amass↔kimodo (SMPL-X NPZ), soma-bvh↔kimodo, g1-csv↔kimodo."
    )


def _validate_output_extension(to_fmt: str, output_path: str) -> None:
    lower = output_path.lower()
    if to_fmt == "kimodo":
        if not lower.endswith(".npz"):
            raise ValueError("Kimodo output must use a .npz path.")
    elif to_fmt == "amass":
        if not lower.endswith(".npz"):
            raise ValueError("AMASS output must use a .npz path.")
    elif to_fmt == "soma-bvh":
        if not lower.endswith(".bvh"):
            raise ValueError("SOMA BVH output must use a .bvh path.")
    elif to_fmt == "g1-csv":
        if not lower.endswith(".csv"):
            raise ValueError("G1 CSV output must use a .csv path.")
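
As a usage reference, here is a minimal sketch of the conversion API above. The file names are hypothetical, and the G1 example assumes the input NPZ stores a 34-joint G1 clip:

from kimodo.exports.motion_convert_lib import convert_motion_files

# AMASS SMPL-X NPZ -> Kimodo NPZ; formats are inferred from extensions/contents
convert_motion_files("walk_amass.npz", "walk_kimodo.npz")

# Kimodo NPZ (34-joint G1) -> MuJoCo qpos CSV, overriding the source rate
convert_motion_files("g1_clip.npz", "g1_clip.csv", source_fps=30.0)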
kimodo/exports/motion_formats.py
ADDED
@@ -0,0 +1,78 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Infer motion file formats from paths and NPZ contents."""

from __future__ import annotations

import os
from typing import Literal

import numpy as np

MotionSourceFormat = Literal["amass", "kimodo", "soma-bvh", "g1-csv"]
MotionTargetFormat = Literal["amass", "kimodo", "soma-bvh", "g1-csv"]
NpzMotionKind = Literal["amass", "kimodo"]


def infer_npz_kind(path: str) -> NpzMotionKind:
    """Classify a ``.npz`` as AMASS SMPL-X or Kimodo from required array keys."""
    with np.load(path, allow_pickle=False) as z:
        keys = set(z.files)
    if "trans" in keys and "pose_body" in keys and "root_orient" in keys:
        return "amass"
    if "local_rot_mats" in keys or "posed_joints" in keys:
        return "kimodo"
    raise ValueError(
        f"Unrecognized NPZ {path!r}: expected AMASS keys (trans, pose_body, ...) "
        "or Kimodo keys (local_rot_mats, posed_joints, ...)."
    )


def infer_source_format_from_path(path: str) -> MotionSourceFormat:
    """Infer converter input format from file extension and NPZ contents when needed."""
    ext = os.path.splitext(path)[1].lower()
    if ext == ".bvh":
        return "soma-bvh"
    if ext == ".csv":
        return "g1-csv"
    if ext == ".npz":
        return infer_npz_kind(path)  # type: ignore[return-value]
    raise ValueError(f"Cannot infer format from extension of {path!r}")


def infer_target_format_from_path(path: str, from_fmt: MotionSourceFormat) -> MotionTargetFormat:
    """Infer converter output format from destination path and source format."""
    ext = os.path.splitext(path)[1].lower()
    if ext == ".bvh":
        return "soma-bvh"
    if ext == ".csv":
        return "g1-csv"
    if ext == ".npz":
        if from_fmt == "amass":
            return "kimodo"
        if from_fmt == "kimodo":
            return "amass"
        if from_fmt in ("g1-csv", "soma-bvh"):
            return "kimodo"
        raise ValueError(
            "Ambiguous .npz output: set --to to 'kimodo' or 'amass' when the input format is not amass/kimodo."
        )
    raise ValueError(f"Cannot infer output format from extension of {path!r}")


def resolve_source_fps(
    fps: float | None,
    from_kind: str,
    input_path: str,
    data: dict | None,
) -> float:
    """Resolve source frame rate (Hz) for conversion when ``fps`` is not overridden."""
    if fps is not None:
        return float(fps)
    if data is not None and "mocap_frame_rate" in data:
        return float(np.asarray(data["mocap_frame_rate"]).item())
    if from_kind == "soma-bvh":
        from kimodo.exports.bvh import read_bvh_frame_time_seconds

        return 1.0 / read_bvh_frame_time_seconds(input_path)
    return 30.0
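
Taken together, these helpers drive the converter's auto-detection. A minimal sketch of how they compose (the paths are hypothetical):

from kimodo.exports.motion_formats import (
    infer_source_format_from_path,
    infer_target_format_from_path,
    resolve_source_fps,
)

src = infer_source_format_from_path("clip.bvh")        # "soma-bvh"
dst = infer_target_format_from_path("clip.npz", src)   # "kimodo"
fps = resolve_source_fps(None, src, "clip.bvh", None)  # read from the BVH Frame Time header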
kimodo/exports/motion_io.py
ADDED
@@ -0,0 +1,443 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Assemble Kimodo NPZ-compatible motion dicts from local rotations + root trajectory."""

from __future__ import annotations

import os
import warnings
from typing import Any, Dict, Tuple

import numpy as np
import torch

from kimodo.geometry import matrix_to_quaternion, quaternion_to_matrix
from kimodo.motion_rep.feature_utils import compute_heading_angle, compute_vel_xyz
from kimodo.motion_rep.feet import foot_detect_from_pos_and_vel
from kimodo.motion_rep.smooth_root import get_smooth_root_pos
from kimodo.skeleton import SkeletonBase
from kimodo.skeleton.registry import build_skeleton
from kimodo.tools import to_numpy

# Default motion rate for Kimodo NPZ produced by format conversion (matches common model FPS).
KIMODO_CONVERT_TARGET_FPS = 30.0


def _quaternion_slerp(q0: torch.Tensor, q1: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
    """Spherical linear interpolation; ``q0``, ``q1`` (..., 4) wxyz; ``t`` broadcastable
    to (..., 1)."""
    if t.dim() < q0.dim():
        t = t.unsqueeze(-1)
    dot = (q0 * q1).sum(dim=-1, keepdim=True)
    q1 = torch.where(dot < 0, -q1, q1)
    dot = torch.abs(dot).clamp(-1.0, 1.0)
    theta_0 = torch.acos(dot)
    sin_theta = torch.sin(theta_0)
    s0 = torch.sin((1.0 - t) * theta_0) / sin_theta.clamp(min=1e-8)
    s1 = torch.sin(t * theta_0) / sin_theta.clamp(min=1e-8)
    q = s0 * q0 + s1 * q1
    return q / torch.linalg.norm(q, dim=-1, keepdim=True).clamp(min=1e-8)
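
# Sanity-check sketch (comment only): halfway between the identity quaternion and a
# 90-degree rotation about x, slerp gives the 45-degree rotation about x:
#
#   q0 = torch.tensor([1.0, 0.0, 0.0, 0.0])             # identity, wxyz
#   q1 = torch.tensor([0.70711, 0.70711, 0.0, 0.0])     # 90 deg about x
#   _quaternion_slerp(q0, q1, torch.tensor([0.5]))
#   # -> approximately [0.92388, 0.38268, 0.0, 0.0]     # 45 deg about x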


def resample_motion_dict_to_kimodo_fps(
    motion_dict: Dict[str, torch.Tensor],
    skeleton: SkeletonBase,
    source_fps: float,
    target_fps: float = KIMODO_CONVERT_TARGET_FPS,
) -> Tuple[Dict[str, torch.Tensor], bool]:
    """Resample a Kimodo motion dict to ``target_fps``.

    When the fps ratio is close to an integer (e.g. 120 / 30 = 4), the faster
    stepping method is used (take every *step*-th frame). Otherwise falls back
    to linear interpolation (root) + quaternion slerp (joints).

    Re-runs :func:`complete_motion_dict` at the target rate so derived channels stay consistent.

    Returns:
        The motion dict and ``True`` if time resampling was applied, else ``False`` (already at
        ``target_fps`` with matching frame count; only re-derived via FK).
    """
    local_rot_mats = motion_dict["local_rot_mats"]
    root_positions = motion_dict["root_positions"]
    local_rot_mats, root_positions = _coerce_time_local_root(local_rot_mats, root_positions)
    t_in = int(local_rot_mats.shape[0])
    if t_in < 1:
        raise ValueError("Motion must have at least one frame.")
    if source_fps <= 0:
        raise ValueError(f"source_fps must be positive; got {source_fps}")

    t_out = max(1, int(round(t_in * target_fps / source_fps)))
    if t_out == t_in and abs(float(source_fps) - float(target_fps)) < 1e-3:
        return complete_motion_dict(local_rot_mats, root_positions, skeleton, float(target_fps)), False

    ratio = source_fps / target_fps
    step = round(ratio)
    if step >= 2 and abs(ratio - step) < 0.05:
        local_out = local_rot_mats[::step]
        root_out = root_positions[::step]
    else:
        device = local_rot_mats.device
        dtype = local_rot_mats.dtype
        u = torch.linspace(0, t_in - 1, t_out, device=device, dtype=dtype)
        i0 = u.floor().long().clamp(0, t_in - 1)
        i1 = torch.minimum(i0 + 1, torch.tensor(t_in - 1, device=device))
        tau_1d = (u - i0.float()).unsqueeze(-1)
        rp0 = root_positions[i0]
        rp1 = root_positions[i1]
        root_out = (1.0 - tau_1d) * rp0 + tau_1d * rp1

        quats = matrix_to_quaternion(local_rot_mats)
        q0 = quats[i0]
        q1 = quats[i1]
        tau_q = (u - i0.float()).view(t_out, 1, 1)
        quat_out = _quaternion_slerp(q0, q1, tau_q)
        local_out = quaternion_to_matrix(quat_out)

    return complete_motion_dict(local_out, root_out, skeleton, float(target_fps)), True
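
# Worked example of the two paths above (comment only):
#   120 Hz -> 30 Hz: ratio = 4.0, so every 4th frame is kept (t_out ~= t_in / 4);
#   100 Hz -> 30 Hz: ratio ~= 3.33, so root positions are lerped and joint rotations
#   slerped at t_out = round(t_in * 30 / 100) uniformly spaced sample times.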


def warn_kimodo_npz_framerate(source_fps: float, t_before: int, t_after: int) -> None:
    """Emit a warning after time resampling for Kimodo NPZ (linear root, quaternion slerp
    per joint)."""
    warnings.warn(
        f"Resampled motion to {KIMODO_CONVERT_TARGET_FPS:.0f} Hz for Kimodo NPZ "
        f"(source ~{source_fps:.4g} Hz, {t_before} input frames → {t_after} output frames). "
        "Pass --source-fps if the detected source rate is wrong.",
        UserWarning,
        stacklevel=3,
    )


def _coerce_time_local_root(
    local_rot_mats: torch.Tensor,
    root_positions: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Normalize to shapes (T, J, 3, 3) and (T, 3)."""
    if local_rot_mats.dim() == 5:
        if int(local_rot_mats.shape[0]) != 1:
            raise ValueError(f"local_rot_mats batch size must be 1 for single clip; got {local_rot_mats.shape[0]}")
        local_rot_mats = local_rot_mats[0]
    if root_positions.dim() == 3:
        if int(root_positions.shape[0]) != 1:
            raise ValueError(f"root_positions batch size must be 1; got {root_positions.shape[0]}")
        root_positions = root_positions[0]
    if local_rot_mats.dim() != 4:
        raise ValueError(f"local_rot_mats must be (T,J,3,3); got {tuple(local_rot_mats.shape)}")
    if root_positions.dim() != 2 or int(root_positions.shape[-1]) != 3:
        raise ValueError(f"root_positions must be (T,3); got {tuple(root_positions.shape)}")
    if int(local_rot_mats.shape[0]) != int(root_positions.shape[0]):
        raise ValueError("local_rot_mats and root_positions must have the same number of frames")
    return local_rot_mats, root_positions


def complete_motion_dict(
    local_rot_mats: torch.Tensor,
    root_positions: torch.Tensor,
    skeleton: SkeletonBase,
    fps: float,
) -> Dict[str, torch.Tensor]:
    """Build the Kimodo motion output dict from local rotations and root positions.

    Matches keys written by CLI generation (see docs/source/user_guide/output_formats.md).

    Args:
        local_rot_mats: (T, J, 3, 3) or (1, T, J, 3, 3) local rotation matrices.
        root_positions: (T, 3) or (1, T, 3) root / pelvis world positions (meters).
        skeleton: Skeleton instance (SOMA77, G1, SMPL-X, etc.).
        fps: Sampling rate (Hz).

    Returns:
        Dict with tensors ``posed_joints``, ``global_rot_mats``, ``local_rot_mats``,
        ``foot_contacts``, ``smooth_root_pos``, ``root_positions``, ``global_root_heading``.
    """
    device = local_rot_mats.device
    dtype = local_rot_mats.dtype
    local_rot_mats, root_positions = _coerce_time_local_root(
        local_rot_mats.to(device=device, dtype=dtype),
        root_positions.to(device=device, dtype=dtype),
    )

    global_rot_mats, posed_joints, _ = skeleton.fk(local_rot_mats, root_positions)

    smooth_root_pos = get_smooth_root_pos(root_positions.unsqueeze(0)).squeeze(0)

    lengths = torch.tensor([posed_joints.shape[0]], device=device)
    velocities = compute_vel_xyz(posed_joints.unsqueeze(0), fps, lengths=lengths).squeeze(0)

    heading_angle = compute_heading_angle(posed_joints.unsqueeze(0), skeleton).squeeze(0)
    global_root_heading = torch.stack([torch.cos(heading_angle), torch.sin(heading_angle)], dim=-1)

    foot_contacts = foot_detect_from_pos_and_vel(
        posed_joints.unsqueeze(0),
        velocities.unsqueeze(0),
        skeleton,
        0.15,
        0.10,
    ).squeeze(0)

    return {
        "posed_joints": posed_joints,
        "global_rot_mats": global_rot_mats,
        "local_rot_mats": local_rot_mats,
        "foot_contacts": foot_contacts,
        "smooth_root_pos": smooth_root_pos,
        "root_positions": root_positions,
        "global_root_heading": global_root_heading,
    }
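
# Minimal usage sketch (comment only; assumes a CPU-resident skeleton from build_skeleton):
#
#   sk = build_skeleton(22)
#   T = 60
#   local = torch.eye(3).expand(T, sk.nbjoints, 3, 3).contiguous()  # rest pose
#   root = torch.zeros(T, 3)
#   motion = complete_motion_dict(local, root, sk, fps=30.0)
#   # motion keys: posed_joints, global_rot_mats, local_rot_mats, foot_contacts, ...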


def motion_dict_to_numpy(d: Dict[str, Any]) -> Dict[str, np.ndarray]:
    """Convert motion dict values to numpy arrays for ``np.savez``."""
    out: Dict[str, np.ndarray] = {}
    for k, v in d.items():
        if hasattr(v, "detach"):
            out[k] = to_numpy(v)
        elif isinstance(v, np.ndarray):
            out[k] = v
        else:
            out[k] = np.asarray(v)
    return out


def save_kimodo_npz(path: str, motion_dict: Dict[str, Any]) -> None:
    """Save a Kimodo-compatible motion dict to ``.npz`` (numpy arrays)."""
    np.savez(path, **motion_dict_to_numpy(motion_dict))


def load_kimodo_npz(path: str) -> Dict[str, np.ndarray]:
    """Load arrays from a Kimodo ``.npz`` file."""
    with np.load(path, allow_pickle=False) as data:
        return {k: np.asarray(data[k]) for k in data.files}


def load_g1_csv(
    path: str,
    source_fps: float = KIMODO_CONVERT_TARGET_FPS,
    *,
    mujoco_rest_zero: bool = False,
) -> Dict[str, torch.Tensor]:
    """Load a G1 MuJoCo ``qpos`` CSV (``(T, 36)``) into a Kimodo motion dict.

    Args:
        path: CSV path (comma-separated, no header).
        source_fps: Source frame rate (Hz) of the CSV data.
        mujoco_rest_zero: Must match how the CSV was written (see :class:`MujocoQposConverter`).
    """
    from kimodo.exports.mujoco import MujocoQposConverter

    qpos = np.loadtxt(path, delimiter=",")
    if qpos.ndim != 2 or qpos.shape[-1] != 36:
        raise ValueError(f"Expected G1 CSV with shape (T, 36); got {qpos.shape}")
    sk = build_skeleton(34)
    converter = MujocoQposConverter(sk)
    return converter.qpos_to_motion_dict(qpos, float(source_fps), mujoco_rest_zero=mujoco_rest_zero)


def load_amass_npz(
    path: str,
    source_fps: float | None = None,
    *,
    z_up: bool = True,
) -> Dict[str, torch.Tensor]:
    """Load an AMASS-style SMPL-X ``.npz`` into a Kimodo motion dict (22 joints).

    Args:
        path: NPZ with ``trans``, ``root_orient``, ``pose_body``, etc.
        source_fps: Source frame rate (Hz); if ``None``, uses ``mocap_frame_rate``
            from the file when present, else 30 Hz.
        z_up: If ``True``, apply AMASS Z-up to Kimodo Y-up transform (same as CLI).
    """
    from kimodo.exports.smplx import amass_npz_to_kimodo_motion

    sk = build_skeleton(22)
    return amass_npz_to_kimodo_motion(path, sk, source_fps=source_fps, z_up=z_up)


def load_kimodo_npz_as_torch(
    path: str,
    source_fps: float = KIMODO_CONVERT_TARGET_FPS,
    *,
    ensure_complete: bool = True,
) -> tuple[Dict[str, torch.Tensor], int]:
    """Load a Kimodo NPZ and return all arrays as torch tensors on the skeleton device.

    Args:
        path: Kimodo NPZ file path.
        source_fps: Source frame rate (Hz) used for derived channels when
            ``ensure_complete=True``.
        ensure_complete: If ``True`` and the NPZ lacks derived channels
            (``posed_joints``, ``global_rot_mats``, …), run :func:`complete_motion_dict`
            to fill them from ``local_rot_mats`` + ``root_positions``.
            If ``False``, load all arrays verbatim (requires ``local_rot_mats``).

    Returns:
        ``(tensor_dict, num_joints)``
    """
    raw = load_kimodo_npz(path)
    if "local_rot_mats" in raw:
        j = int(raw["local_rot_mats"].shape[1])
    elif "posed_joints" in raw:
        j = int(raw["posed_joints"].shape[1])
    else:
        raise ValueError("Kimodo NPZ must contain 'local_rot_mats' or 'posed_joints'.")
    sk = build_skeleton(j)
    device = sk.neutral_joints.device
    dtype = torch.float32

    if not ensure_complete:
        if "local_rot_mats" not in raw:
            raise ValueError("Kimodo NPZ must contain 'local_rot_mats' (and typically 'root_positions').")
        out: Dict[str, torch.Tensor] = {}
        for k, v in raw.items():
            out[k] = torch.from_numpy(np.asarray(v)).to(device=device, dtype=dtype)
        return out, j

    if "posed_joints" in raw and "global_rot_mats" in raw:
        out = {}
        for k, v in raw.items():
            out[k] = torch.from_numpy(np.asarray(v)).to(device=device, dtype=dtype)
        return out, j

    if "local_rot_mats" not in raw or "root_positions" not in raw:
        raise ValueError("Kimodo NPZ must contain posed_joints+global_rot_mats, or local_rot_mats+root_positions.")
    local = torch.from_numpy(np.asarray(raw["local_rot_mats"])).to(device=device, dtype=dtype)
    root = torch.from_numpy(np.asarray(raw["root_positions"])).to(device=device, dtype=dtype)
    return complete_motion_dict(local, root, sk, float(source_fps)), j
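
# The three NPZ layouts accepted above, summarized (comment only):
#   1. full dict (posed_joints + global_rot_mats + ...): returned verbatim as float32 tensors;
#   2. minimal dict (local_rot_mats + root_positions) with ensure_complete=True:
#      derived channels are rebuilt via complete_motion_dict at source_fps;
#   3. ensure_complete=False: arrays are returned as-is (local_rot_mats required).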


def save_kimodo_npz_at_target_fps(
    motion: Dict[str, torch.Tensor],
    skeleton: SkeletonBase,
    source_fps: float,
    output_path: str,
    target_fps: float = KIMODO_CONVERT_TARGET_FPS,
) -> None:
    """Resample a motion dict to ``target_fps`` when needed, then save Kimodo NPZ."""
    t_before = int(motion["local_rot_mats"].shape[0])
    motion, did_resample = resample_motion_dict_to_kimodo_fps(motion, skeleton, source_fps, target_fps)
    t_after = int(motion["local_rot_mats"].shape[0])
    if did_resample:
        warn_kimodo_npz_framerate(source_fps, t_before, t_after)
    save_kimodo_npz(output_path, motion)


def kimodo_npz_to_bytes(motion_dict: Dict[str, Any]) -> bytes:
    """Serialize a Kimodo motion dict to in-memory NPZ bytes."""
    import io

    buf = io.BytesIO()
    np.savez(buf, **motion_dict_to_numpy(motion_dict))
    return buf.getvalue()


def g1_csv_to_bytes(motion_dict: Dict[str, Any], skeleton: SkeletonBase, device: Any) -> bytes:
    """Convert a motion dict to G1 MuJoCo CSV bytes via :class:`MujocoQposConverter`."""
    import io

    from kimodo.exports.mujoco import MujocoQposConverter

    converter = MujocoQposConverter(skeleton)
    qpos = converter.dict_to_qpos(
        {k: v for k, v in motion_dict.items() if k in ("local_rot_mats", "root_positions")},
        device,
        numpy=True,
    )
    buf = io.StringIO()
    np.savetxt(buf, qpos, delimiter=",")
    return buf.getvalue().encode("utf-8")


def amass_npz_to_bytes(motion_dict: Dict[str, Any], skeleton: SkeletonBase, fps: float) -> bytes:
    """Convert a motion dict to AMASS NPZ bytes via :class:`AMASSConverter`."""
    import io

    from kimodo.exports.smplx import AMASSConverter

    converter = AMASSConverter(skeleton=skeleton, fps=fps)
    buf = io.BytesIO()
    converter.convert_save_npz(
        {k: v for k, v in motion_dict.items() if k in ("local_rot_mats", "root_positions")},
        buf,
    )
    return buf.getvalue()


def _read_amass_source_fps(path: str) -> float:
    """Read the source frame rate from an AMASS NPZ, defaulting to 30 Hz."""
    with np.load(path, allow_pickle=True) as z:
        if "mocap_frame_rate" in z.files:
            return float(z["mocap_frame_rate"])
    return 30.0


def load_motion_file(
    path: str,
    source_fps: float | None = None,
    target_fps: float | None = None,
    *,
    z_up: bool = True,
    mujoco_rest_zero: bool = False,
) -> tuple[Dict[str, torch.Tensor], int]:
    """Load a motion file and return a Kimodo motion dict plus joint count.

    Supports SOMA BVH (``.bvh``), G1 MuJoCo CSV (``.csv``), Kimodo NPZ, and AMASS SMPL-X NPZ
    (``.npz``).

    The motion is loaded at its native (or overridden) source rate, then
    resampled to ``target_fps`` when they differ.

    Args:
        path: Path to ``.bvh``, ``.csv``, or ``.npz``.
        source_fps: Source frame rate (Hz). If provided, trusted as-is.
            If ``None``, auto-detected per format: BVH ``Frame Time`` header,
            AMASS ``mocap_frame_rate``, or :data:`KIMODO_CONVERT_TARGET_FPS`
            (30 Hz) for CSV / Kimodo NPZ.
        target_fps: Desired output frame rate (Hz). Defaults to
            :data:`KIMODO_CONVERT_TARGET_FPS` (30 Hz). The motion is
            resampled when ``source_fps`` and ``target_fps`` differ.
        z_up: AMASS NPZ only; passed to :func:`load_amass_npz`.
        mujoco_rest_zero: G1 CSV only; passed to :func:`load_g1_csv`.

    Returns:
        ``(motion_dict, num_joints)`` with the same keys as :func:`complete_motion_dict`.
    """
    from kimodo.exports.motion_formats import infer_npz_kind

    if target_fps is None:
        target_fps = KIMODO_CONVERT_TARGET_FPS

    ext = os.path.splitext(path)[1].lower()
    if ext == ".bvh":
        from kimodo.exports.bvh import bvh_to_kimodo_motion

        motion_dict, bvh_fps = bvh_to_kimodo_motion(path)
        effective_source = source_fps if source_fps is not None else bvh_fps
        num_joints = int(motion_dict["local_rot_mats"].shape[1])
    elif ext == ".csv":
        effective_source = source_fps if source_fps is not None else KIMODO_CONVERT_TARGET_FPS
        motion_dict = load_g1_csv(path, source_fps=effective_source, mujoco_rest_zero=mujoco_rest_zero)
        num_joints = 34
    elif ext == ".npz":
        kind = infer_npz_kind(path)
        if kind == "amass":
            effective_source = source_fps if source_fps is not None else _read_amass_source_fps(path)
            motion_dict = load_amass_npz(path, source_fps=effective_source, z_up=z_up)
            num_joints = 22
        else:
            effective_source = source_fps if source_fps is not None else KIMODO_CONVERT_TARGET_FPS
            motion_dict, num_joints = load_kimodo_npz_as_torch(path, source_fps=effective_source)
    else:
        raise ValueError(f"Unsupported motion file {path!r}; expected .bvh, .csv, or .npz")

    if abs(effective_source - target_fps) > 0.5:
        sk = build_skeleton(num_joints)
        motion_dict, did_resample = resample_motion_dict_to_kimodo_fps(motion_dict, sk, effective_source, target_fps)
        if did_resample:
            t_out = int(motion_dict["local_rot_mats"].shape[0])
            warnings.warn(
                f"Resampled motion from {effective_source:.4g} Hz to {target_fps:.0f} Hz ({t_out} frames).",
                UserWarning,
                stacklevel=2,
            )

    return motion_dict, num_joints
kimodo/exports/mujoco.py
ADDED
@@ -0,0 +1,588 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Convert kimodo motion (y-up, z-forward) to MuJoCo qpos (z-up, x-forward) for G1 skeleton."""

import os
import xml.etree.ElementTree as ET
from typing import Optional

import numpy as np
import torch
from scipy.spatial.transform import Rotation

from kimodo.assets import skeleton_asset_path
from kimodo.geometry import (
    axis_angle_to_matrix,
    matrix_to_axis_angle,
    matrix_to_quaternion,
    quaternion_to_matrix,
)
from kimodo.skeleton import G1Skeleton34, SkeletonBase, global_rots_to_local_rots
from kimodo.tools import ensure_batched, to_numpy, to_torch

# Cache so that the same (skeleton, xml_path) returns the same converter instance.
_converter_cache: dict[tuple[int, str], "MujocoQposConverter"] = {}


class MujocoQposConverter:
    """Fast batch converter from our dictionary format to mujoco qpos with precomputed transforms.

    In mujoco, the coordinate system is z up and x forward, right handed.

    Features (30 joints):
    - root (pelvis, 7 = translation + rotation) + 29 dof joints (29)

    In kimodo, the coordinate system is y up and z forward, right handed.
    Features (34 joints):
    - root (pelvis) + (34 - 1) joints; among these joints, 4 are end-effector joints added by kimodo.

    Cached by (input_skeleton id, xml_path); repeated calls with the same args return the same instance.
    """

    def __new__(
        cls,
        input_skeleton: SkeletonBase,
        xml_path: str = str(skeleton_asset_path("g1skel34", "xml", "g1.xml")),
    ):
        key = (id(input_skeleton), xml_path)
        if key not in _converter_cache:
            inst = object.__new__(cls)
            _converter_cache[key] = inst
        return _converter_cache[key]

    def __init__(
        self,
        input_skeleton: SkeletonBase,
        xml_path: str = str(skeleton_asset_path("g1skel34", "xml", "g1.xml")),
    ):
        """Initialize converter with precomputed transforms.

        Args:
            input_skeleton: Source kimodo skeleton (G1, 34 joints).
            xml_path: Path to the mujoco XML file containing joint definitions.
        """
        if getattr(self, "_initialized", False):
            return
        self.xml_path = xml_path
        self.skeleton = input_skeleton
        self._prepare_transforms()
        self._subtree_joints = {}
        self._initialized = True

    def _prepare_transforms(self):
        """Precompute all necessary transforms for efficient batch processing."""
        # Define coordinate transformations between mujoco and kimodo space:
        # 1) R_zup_to_yup: rotation around x-axis by -90 degrees
        # 2) x_forward_to_y_forward: rotation around z-axis by -90 degrees
        # Combined transformation matrix: mujoco_to_kimodo = R_zup_to_yup * x_forward_to_y_forward
        self.mujoco_to_kimodo_matrix = torch.tensor(
            [[0.0, 1.0, 0.0], [0.0, 0.0, 1.0], [1.0, 0.0, 0.0]], dtype=torch.float32
        )
        self.kimodo_to_mujoco_matrix = self.mujoco_to_kimodo_matrix.T  # Inverse transformation: kimodo_to_mujoco
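
        # The mapping encoded above, spelled out (comment only): this matrix sends
        # MuJoCo basis vectors to kimodo basis vectors,
        #   mujoco x (forward) -> kimodo z (forward)
        #   mujoco y           -> kimodo x
        #   mujoco z (up)      -> kimodo y (up)
        # e.g. mujoco_to_kimodo_matrix @ [1, 0, 0]^T = [0, 0, 1]^T.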
| 81 |
+
|
| 82 |
+
# Parse XML once and extract joint information
|
| 83 |
+
tree = ET.parse(self.xml_path)
|
| 84 |
+
root = tree.getroot()
|
| 85 |
+
|
| 86 |
+
xml_classes = [x for x in tree.findall(".//default") if "class" in x.attrib]
|
| 87 |
+
joint_axes = dict()
|
| 88 |
+
class_ranges: dict[str, tuple[float, float]] = {}
|
| 89 |
+
for xml_class in xml_classes:
|
| 90 |
+
j = xml_class.findall("joint")
|
| 91 |
+
if j:
|
| 92 |
+
joint_axes[xml_class.get("class")] = j[0].get("axis")
|
| 93 |
+
range_str = j[0].get("range")
|
| 94 |
+
if range_str:
|
| 95 |
+
range_vals = [float(x) for x in range_str.split()]
|
| 96 |
+
if len(range_vals) == 2:
|
| 97 |
+
class_ranges[xml_class.get("class")] = (
|
| 98 |
+
range_vals[0],
|
| 99 |
+
range_vals[1],
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
mujoco_hinge_joints = root.find("worldbody").findall(".//joint") # skip the base joint
|
| 103 |
+
self._mujoco_joint_axis_values_kimodo_space = torch.zeros(
|
| 104 |
+
(len(mujoco_hinge_joints), 3), dtype=torch.float32
|
| 105 |
+
) # mujoco order but kimodo space
|
| 106 |
+
self._mujoco_joint_axis_values_mujoco_space = torch.zeros(
|
| 107 |
+
(len(mujoco_hinge_joints), 3), dtype=torch.float32
|
| 108 |
+
) # mujoco order but mujoco space
|
| 109 |
+
|
| 110 |
+
# for the below indices, mujoco_indices_to_kimodo_indices does not include mujoco root (30 - 1 = 29 elements),
|
| 111 |
+
# while kimodo_indices_to_mujoco_indices inclues the kimodo root (32 elements).
|
| 112 |
+
self._mujoco_indices_to_kimodo_indices = torch.zeros((len(mujoco_hinge_joints),), dtype=torch.int32)
|
| 113 |
+
self._kimodo_indices_to_mujoco_indices = (
|
| 114 |
+
torch.ones((self.skeleton.nbjoints,), dtype=torch.int32) * -1
|
| 115 |
+
) # -1 means not in the csv skeleton
|
| 116 |
+
|
| 117 |
+
self._nb_joints_mujoco = len(mujoco_hinge_joints) + 1
|
| 118 |
+
self._nb_joints_kimodo = self.skeleton.nbjoints
|
| 119 |
+
self._mujoco_joint_including_root_parent_list = torch.full(
|
| 120 |
+
(len(mujoco_hinge_joints) + 1,), -1, dtype=torch.int32
|
| 121 |
+
)
|
| 122 |
+
self._mujoco_joint_including_root_list = ["pelvis_skel"]
|
| 123 |
+
|
| 124 |
+
for joint_id_in_csv, joint in enumerate(mujoco_hinge_joints):
|
| 125 |
+
joint_name_in_skeleton = joint.get("name").replace("_joint", "_skel")
|
| 126 |
+
joint_parent_name_in_skeleton = self.skeleton.bone_parents[joint_name_in_skeleton]
|
| 127 |
+
|
| 128 |
+
self._mujoco_joint_including_root_list.append(joint_name_in_skeleton)
|
| 129 |
+
self._mujoco_joint_including_root_parent_list[joint_id_in_csv + 1] = (
|
| 130 |
+
self._mujoco_joint_including_root_list.index(joint_parent_name_in_skeleton)
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
joint_idx_in_kimodo_skeleton = self.skeleton.bone_order_names.index(joint_name_in_skeleton)
|
| 134 |
+
axis_values = [float(x) for x in (joint.get("axis") or joint_axes[joint.get("class")]).split(" ")]
|
| 135 |
+
|
| 136 |
+
# the mapped axis in kimodo skeleton space is calculated as bones_axis = mujoco_to_kimodo.apply(axis_values)
|
| 137 |
+
# [1, 0, 0] -> [0, 0, 1]; [0, 1, 0] -> [1, 0, 0]; [0, 0, 1] -> [0, 1, 0]
|
| 138 |
+
mujoco_joint_axis_mapping_kimodo_space = [
|
| 139 |
+
torch.tensor([0, 0, 1]),
|
| 140 |
+
torch.tensor([1, 0, 0]),
|
| 141 |
+
torch.tensor([0, 1, 0]),
|
| 142 |
+
][np.argmax(axis_values)]
|
| 143 |
+
|
| 144 |
+
self._mujoco_joint_axis_values_kimodo_space[joint_id_in_csv] = mujoco_joint_axis_mapping_kimodo_space
|
| 145 |
+
self._mujoco_joint_axis_values_mujoco_space[joint_id_in_csv] = torch.tensor(axis_values)
|
| 146 |
+
|
| 147 |
+
self._mujoco_indices_to_kimodo_indices[joint_id_in_csv] = joint_idx_in_kimodo_skeleton
|
| 148 |
+
self._kimodo_indices_to_mujoco_indices[joint_idx_in_kimodo_skeleton] = (
|
| 149 |
+
joint_id_in_csv + 1
|
| 150 |
+
) # +1 for the root
|
| 151 |
+
self._kimodo_indices_to_mujoco_indices[0] = 0 # the root joint mapping
|
| 152 |
+
|
| 153 |
+
# Joint limits (min, max) in radians for each mujoco hinge, for clamping
|
| 154 |
+
self._joint_limits_min = torch.full((len(mujoco_hinge_joints),), float("-inf"), dtype=torch.float32)
|
| 155 |
+
self._joint_limits_max = torch.full((len(mujoco_hinge_joints),), float("inf"), dtype=torch.float32)
|
| 156 |
+
for joint_id_in_csv, joint in enumerate(mujoco_hinge_joints):
|
| 157 |
+
range_vals = None
|
| 158 |
+
if joint.get("range"):
|
| 159 |
+
range_vals = [float(x) for x in joint.get("range").split()]
|
| 160 |
+
elif joint.get("class") and joint.get("class") in class_ranges:
|
| 161 |
+
lo, hi = class_ranges[joint.get("class")]
|
| 162 |
+
range_vals = [lo, hi]
|
| 163 |
+
if range_vals is not None and len(range_vals) == 2:
|
| 164 |
+
self._joint_limits_min[joint_id_in_csv] = range_vals[0]
|
| 165 |
+
self._joint_limits_max[joint_id_in_csv] = range_vals[1]
|
| 166 |
+
|
| 167 |
+
# load the offset matrices from the xml
|
| 168 |
+
R_zup_to_yup = Rotation.from_euler("x", -90, degrees=True)
|
| 169 |
+
x_forward_to_y_forward = Rotation.from_euler("z", -90, degrees=True)
|
| 170 |
+
mujoco_to_kimodo = R_zup_to_yup * x_forward_to_y_forward
|
| 171 |
+
|
| 172 |
+
self._rot_offsets_q2t = torch.zeros(len(self._kimodo_indices_to_mujoco_indices), 3, 3, dtype=torch.float32)
|
| 173 |
+
self._rot_offsets_q2t[...] = torch.eye(3)[None]
|
| 174 |
+
|
| 175 |
+
self._rot_offsets_f2q = torch.zeros(len(self._kimodo_indices_to_mujoco_indices), 3, 3, dtype=torch.float32)
|
| 176 |
+
self._rot_offsets_f2q[...] = torch.eye(3)[None]
|
| 177 |
+
parent_map = {child: parent for parent in root.iter() for child in parent}
|
| 178 |
+
for i, joint in enumerate(mujoco_hinge_joints):
|
| 179 |
+
body = parent_map[joint]
|
| 180 |
+
if "quat" in body.attrib:
|
| 181 |
+
rot = Rotation.from_quat(
|
| 182 |
+
[float(x) for x in body.get("quat").strip().split(" ")],
|
| 183 |
+
scalar_first=True,
|
| 184 |
+
)
|
| 185 |
+
idx = self._mujoco_indices_to_kimodo_indices[i]
|
| 186 |
+
self._rot_offsets_q2t[idx] = torch.from_numpy(rot.as_matrix())
|
| 187 |
+
rot = mujoco_to_kimodo * rot * mujoco_to_kimodo.inv()
|
| 188 |
+
self._rot_offsets_f2q[idx] = torch.from_numpy(rot.as_matrix().T)
|
| 189 |
+
|
| 190 |
+
# Hinge axis in f2q space so extraction uses the same frame as joint_rot_f2q.
|
| 191 |
+
# Then extract(offset) gives the angle s.t. axis_angle(angle * axis_f2q) = offset, and
|
| 192 |
+
# reconstruction R_local = offset.T @ axis_angle(angle * axis_f2q) = I when input is identity.
|
| 193 |
+
axis_kimodo = self._mujoco_joint_axis_values_kimodo_space
|
| 194 |
+
self._mujoco_joint_axis_values_f2q_space = torch.zeros_like(axis_kimodo)
|
| 195 |
+
for i in range(len(mujoco_hinge_joints)):
|
| 196 |
+
j = self._mujoco_indices_to_kimodo_indices[i].item()
|
| 197 |
+
axis_f2q = torch.mv(self._rot_offsets_f2q[j], axis_kimodo[i])
|
| 198 |
+
n = axis_f2q.norm()
|
| 199 |
+
if n > 1e-8:
|
| 200 |
+
axis_f2q = axis_f2q / n
|
| 201 |
+
self._mujoco_joint_axis_values_f2q_space[i] = axis_f2q
|
| 202 |
+
|
| 203 |
+
# Rest-pose DOFs: angle we extract when R_local = I (t-pose). MuJoCo limits are
|
| 204 |
+
# relative to joint zero (rest pose), so we must clamp in MuJoCo space: convert
|
| 205 |
+
# joint_dofs to mujoco_angle = joint_dofs - rest_dofs, clamp, then back.
|
| 206 |
+
rest_rot_f2q = self._rot_offsets_f2q[self._mujoco_indices_to_kimodo_indices]
|
| 207 |
+
rest_rot_f2q = rest_rot_f2q.unsqueeze(0).unsqueeze(0)
|
| 208 |
+
self._rest_dofs = self._local_rots_f2q_to_joint_dofs(rest_rot_f2q).squeeze(0).squeeze(0)
|
| 209 |
+
# Axis-angle rest DOFs: angle s.t. axis_angle(angle * axis_f2q) = offset. Used in
|
| 210 |
+
# project_to_real_robot_rotations so extract+reconstruct round-trip and t-pose is preserved.
|
| 211 |
+
rest_rot_f2q_flat = self._rot_offsets_f2q[self._mujoco_indices_to_kimodo_indices]
|
| 212 |
+
full_aa = matrix_to_axis_angle(rest_rot_f2q_flat)
|
| 213 |
+
self._rest_dofs_axis_angle = (full_aa * self._mujoco_joint_axis_values_f2q_space).sum(dim=-1)
|
| 214 |
+
|
| 215 |
+
def dict_to_qpos(
|
| 216 |
+
self,
|
| 217 |
+
output: dict,
|
| 218 |
+
device: Optional[str] = None,
|
| 219 |
+
root_quat_w_first: bool = True,
|
| 220 |
+
numpy: bool = True,
|
| 221 |
+
mujoco_rest_zero: bool = False,
|
| 222 |
+
):
|
| 223 |
+
"""Convert kimodo output dict to mujoco qpos format.
|
| 224 |
+
|
| 225 |
+
Args:
|
| 226 |
+
output: dict with keys "local_rot_mats" and "root_positions".
|
| 227 |
+
device: device to use for the output.
|
| 228 |
+
root_quat_w_first: If True, quaternion in qpos is (w,x,y,z).
|
| 229 |
+
numpy: If True, convert the output to numpy array.
|
| 230 |
+
mujoco_rest_zero: If True, joint angles are written so that kimodo rest (t-pose)
|
| 231 |
+
maps to q=0 in MuJoCo. If False, write raw joint_dofs.
|
| 232 |
+
|
| 233 |
+
Returns:
|
| 234 |
+
qpos: (B, T, 7+J) mujoco qpos format.
|
| 235 |
+
"""
|
| 236 |
+
local_rot_mats = to_torch(output["local_rot_mats"], device)
|
| 237 |
+
root_positions = to_torch(output["root_positions"], device)
|
| 238 |
+
|
| 239 |
+
qpos = self.to_qpos(
|
| 240 |
+
local_rot_mats,
|
| 241 |
+
root_positions,
|
| 242 |
+
root_quat_w_first=root_quat_w_first,
|
| 243 |
+
mujoco_rest_zero=mujoco_rest_zero,
|
| 244 |
+
)
|
| 245 |
+
if numpy:
|
| 246 |
+
qpos = to_numpy(qpos)
|
| 247 |
+
return qpos
|
| 248 |
+
|
| 249 |
+
def qpos_to_motion_dict(
|
| 250 |
+
self,
|
| 251 |
+
qpos: torch.Tensor | np.ndarray,
|
| 252 |
+
source_fps: float,
|
| 253 |
+
*,
|
| 254 |
+
root_quat_w_first: bool = True,
|
| 255 |
+
mujoco_rest_zero: bool = False,
|
| 256 |
+
):
|
| 257 |
+
"""Inverse of :meth:`to_qpos` / :meth:`dict_to_qpos` for MuJoCo CSV ``(T, 36)`` rows.
|
| 258 |
+
|
| 259 |
+
Args:
|
| 260 |
+
qpos: Shape ``(T, 36)`` or ``(1, T, 36)`` (root xyz, root quat wxyz, 29 joint angles).
|
| 261 |
+
source_fps: Source frame rate (Hz) of the qpos data.
|
| 262 |
+
root_quat_w_first: Must match how the CSV was written (default ``True``).
|
| 263 |
+
mujoco_rest_zero: Must match :meth:`dict_to_qpos` / :meth:`to_qpos`.
|
| 264 |
+
|
| 265 |
+
Returns:
|
| 266 |
+
Kimodo motion dict (see :func:`kimodo.exports.motion_io.complete_motion_dict`).
|
| 267 |
+
"""
|
| 268 |
+
from kimodo.exports.motion_io import complete_motion_dict
|
| 269 |
+
|
| 270 |
+
qpos = to_torch(qpos, None)
|
| 271 |
+
if qpos.dim() == 2:
|
| 272 |
+
qpos = qpos.unsqueeze(0)
|
| 273 |
+
device = qpos.device
|
| 274 |
+
dtype = qpos.dtype
|
| 275 |
+
batch_size, num_frames, ncols = qpos.shape
|
| 276 |
+
if ncols != 36:
|
| 277 |
+
raise ValueError(f"Expected qpos last dim 36; got {ncols}")
|
| 278 |
+
|
| 279 |
+
kimodo_to_mujoco_matrix = self.kimodo_to_mujoco_matrix.to(device=device, dtype=dtype)
|
| 280 |
+
mujoco_to_kimodo_matrix = kimodo_to_mujoco_matrix.T
|
| 281 |
+
|
| 282 |
+
root_mujoco = qpos[..., :3]
|
| 283 |
+
root_positions = torch.matmul(mujoco_to_kimodo_matrix[None, None, ...], root_mujoco[..., None]).squeeze(-1)
|
| 284 |
+
|
| 285 |
+
quat = qpos[..., 3:7]
|
| 286 |
+
if root_quat_w_first:
|
| 287 |
+
root_rot_mujoco = quaternion_to_matrix(quat)
|
| 288 |
+
else:
|
| 289 |
+
quat_wxyz = quat[..., [3, 0, 1, 2]]
|
| 290 |
+
root_rot_mujoco = quaternion_to_matrix(quat_wxyz)
|
| 291 |
+
|
| 292 |
+
O0 = self._rot_offsets_f2q[0].to(device=device, dtype=dtype)
|
| 293 |
+
# root_rot_mujoco is (..., 3, 3) after optional batch unsqueeze (e.g. (1, T, 3, 3)).
|
| 294 |
+
# Use ``...il`` so ``k`` sums with ``kl``; ``...ik`` incorrectly keeps ``k`` in the output.
|
| 295 |
+
R_f2q_root = torch.einsum(
|
| 296 |
+
"ij,...jk,kl->...il",
|
| 297 |
+
mujoco_to_kimodo_matrix,
|
| 298 |
+
root_rot_mujoco,
|
| 299 |
+
kimodo_to_mujoco_matrix,
|
| 300 |
+
)
|
| 301 |
+
R_kimodo_root = torch.einsum("ij,...jk->...ik", O0.T, R_f2q_root)
|
| 302 |
+
|
| 303 |
+
joint_dofs = qpos[..., 7:]
|
| 304 |
+
if mujoco_rest_zero:
|
| 305 |
+
rest_dofs = self._rest_dofs.to(device=device, dtype=dtype)
|
| 306 |
+
angles = joint_dofs + rest_dofs[None, None, :]
|
| 307 |
+
use_relative = True
|
| 308 |
+
else:
|
| 309 |
+
angles = joint_dofs
|
| 310 |
+
use_relative = False
|
| 311 |
+
|
| 312 |
+
nb_joints = self.skeleton.nbjoints
|
| 313 |
+
template = torch.eye(3, device=device, dtype=dtype).expand(batch_size, num_frames, nb_joints, 3, 3).contiguous()
|
| 314 |
+
template[:, :, 0] = R_kimodo_root
|
| 315 |
+
|
| 316 |
+
local_rot_mats = self._joint_dofs_to_local_rot_mats(
|
| 317 |
+
angles,
|
| 318 |
+
template,
|
| 319 |
+
device,
|
| 320 |
+
dtype,
|
| 321 |
+
use_relative=use_relative,
|
| 322 |
+
)
|
| 323 |
+
|
| 324 |
+
if batch_size != 1:
|
| 325 |
+
raise ValueError(f"Only a single clip is supported; got batch_size={batch_size}")
|
| 326 |
+
|
| 327 |
+
return complete_motion_dict(local_rot_mats[0], root_positions[0], self.skeleton, source_fps)
|
| 328 |
+
|
| 329 |
+
def save_csv(self, qpos: torch.Tensor | np.ndarray, csv_path):
|
| 330 |
+
# comment this
|
| 331 |
+
qpos = to_numpy(qpos)
|
| 332 |
+
shape = qpos.shape
|
| 333 |
+
if len(shape) == 2:
|
| 334 |
+
# only one motion: save it
|
| 335 |
+
np.savetxt(csv_path, qpos, delimiter=",")
|
| 336 |
+
if len(shape) == 3:
|
| 337 |
+
# batch of motions
|
| 338 |
+
if shape[0] == 1:
|
| 339 |
+
# if only one motion, just save it
|
| 340 |
+
np.savetxt(csv_path, qpos[0], delimiter=",")
|
| 341 |
+
else:
|
| 342 |
+
csv_path_base, ext = os.path.splitext(csv_path)
|
| 343 |
+
for i in range(shape[0]):
|
| 344 |
+
self.save_csv(qpos[i], csv_path_base + "_" + str(i).zfill(2) + ext)
|
| 345 |
+
|
| 346 |
+
def _local_rots_to_joint_dofs(
|
| 347 |
+
self,
|
| 348 |
+
local_rot_mats: torch.Tensor,
|
| 349 |
+
axis_vals: torch.Tensor,
|
| 350 |
+
) -> torch.Tensor:
|
| 351 |
+
"""Extract per-joint single-DoF angles (radians) via Euler projection (for to_qpos/f2q)."""
|
| 352 |
+
x_joint_dof = torch.atan2(local_rot_mats[..., 2, 1], local_rot_mats[..., 2, 2])
|
| 353 |
+
y_joint_dof = torch.atan2(local_rot_mats[..., 0, 2], local_rot_mats[..., 0, 0])
|
| 354 |
+
z_joint_dof = torch.atan2(local_rot_mats[..., 1, 0], local_rot_mats[..., 1, 1])
|
| 355 |
+
xyz_joint_dofs = torch.stack([x_joint_dof, y_joint_dof, z_joint_dof], dim=-1)
|
| 356 |
+
axis_vals = axis_vals.to(device=local_rot_mats.device, dtype=local_rot_mats.dtype)
|
| 357 |
+
joint_dofs = (xyz_joint_dofs * axis_vals[None, None, :, :]).sum(dim=-1)
|
| 358 |
+
return joint_dofs
|
| 359 |
+
|
| 360 |
+
def _local_rots_to_joint_dofs_axis_angle(
|
| 361 |
+
self,
|
| 362 |
+
local_rot_mats: torch.Tensor,
|
| 363 |
+
axis_vals: torch.Tensor,
|
| 364 |
+
) -> torch.Tensor:
|
| 365 |
+
"""Extract per-joint single-DoF angles (radians) via axis-angle; round-trips with
|
| 366 |
+
axis_angle_to_matrix.
|
| 367 |
+
|
| 368 |
+
Args:
|
| 369 |
+
local_rot_mats: (..., num_hinges, 3, 3) in same frame as axis_vals.
|
| 370 |
+
axis_vals: (num_hinges, 3) unit axis per hinge.
|
| 371 |
+
Returns:
|
| 372 |
+
joint_dofs: (..., num_hinges) signed angle = dot(axis_angle(R), axis).
|
| 373 |
+
"""
|
| 374 |
+
axis_vals = axis_vals.to(device=local_rot_mats.device, dtype=local_rot_mats.dtype)
|
| 375 |
+
full_aa = matrix_to_axis_angle(local_rot_mats)
|
| 376 |
+
joint_dofs = (full_aa * axis_vals).sum(dim=-1)
|
| 377 |
+
return joint_dofs
|
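
    # Worked note on the axis-angle extraction above (comment only): for a rotation of
    # 0.3 rad about a unit hinge axis a, matrix_to_axis_angle(R) returns 0.3 * a and
    # dot(0.3 * a, a) = 0.3; any off-axis component (violating the 1-DoF hinge
    # assumption) is projected away by the dot product.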

    def _local_rots_f2q_to_joint_dofs(self, local_rot_mats_f2q: torch.Tensor) -> torch.Tensor:
        """Extract per-joint single-DoF angles from local rotations in f2q space (for to_qpos)."""
        axis_vals = self._mujoco_joint_axis_values_f2q_space
        return self._local_rots_to_joint_dofs(local_rot_mats_f2q, axis_vals)

    def _clamp_to_limits(self, joint_dofs: torch.Tensor) -> torch.Tensor:
        """Clamp joint angles to XML limits (radians).

        Angles are in kimodo convention (0 = rest).
        """
        device = joint_dofs.device
        lo = self._joint_limits_min.to(device=device, dtype=joint_dofs.dtype)
        hi = self._joint_limits_max.to(device=device, dtype=joint_dofs.dtype)
        return torch.clamp(joint_dofs, lo[None, None, :], hi[None, None, :])

    def _clamp_joint_dofs(self, joint_dofs: torch.Tensor, rest_dofs: torch.Tensor) -> torch.Tensor:
        """Clamp joint angles to MuJoCo limits (radians), with rest_dofs conversion."""
        device = joint_dofs.device
        rest_dofs = rest_dofs.to(device=device, dtype=joint_dofs.dtype)
        mujoco_dofs = joint_dofs - rest_dofs[None, None, :]
        lo = self._joint_limits_min.to(device=device, dtype=joint_dofs.dtype)
        hi = self._joint_limits_max.to(device=device, dtype=joint_dofs.dtype)
        mujoco_dofs = torch.clamp(mujoco_dofs, lo[None, None, :], hi[None, None, :])
        return mujoco_dofs + rest_dofs[None, None, :]

    def _joint_dofs_to_local_rot_mats(
        self,
        joint_dofs: torch.Tensor,
        original_local_rot_mats: torch.Tensor,
        device: torch.device,
        dtype: torch.dtype,
        use_relative: bool = False,
    ) -> torch.Tensor:
        """Reconstruct full local rotation matrices from 1-DoF angles."""
        out = original_local_rot_mats.clone()
        axis_kimodo = self._mujoco_joint_axis_values_kimodo_space.to(device=device, dtype=dtype)
        for i in range(joint_dofs.shape[-1]):
            j = self._mujoco_indices_to_kimodo_indices[i].item()
            angle = joint_dofs[..., i]
            axis = axis_kimodo[i]
            if use_relative:
                axis_angle = angle[..., None] * axis[None, None, :]
                R_local = axis_angle_to_matrix(axis_angle)
            else:
                rot_offsets_f2q = self._rot_offsets_f2q.to(device=device, dtype=dtype)
                axis_in_f2q = torch.mv(rot_offsets_f2q[j], axis)
                axis_angle = angle[..., None] * axis_in_f2q[None, None, :]
                R_f2q = axis_angle_to_matrix(axis_angle)
                R_local = torch.einsum("ij,btjk->btik", rot_offsets_f2q[j].T, R_f2q)
            out[:, :, j, :, :] = R_local
        return out

    @ensure_batched(local_rot_mats=5, root_positions=3, lengths=1)
    def project_to_real_robot_rotations(
        self,
        local_rot_mats: torch.Tensor,
        root_positions: torch.Tensor,
        clamp_to_limits: bool = True,
        mujoco_rest_zero: bool = False,
    ) -> dict:
        """Project full 3D local rotations to G1 real robot DoF and back to 3D for viz.

        Joint angles are extracted along each hinge axis, optionally clamped to XML limits, then
        reconstructed to 3D rotations. When mujoco_rest_zero=False (default), raw angles are used
        (baked-with-quat). When True, angles are relative to rest (0 = T-pose in MuJoCo).
        """
        device = local_rot_mats.device
        dtype = local_rot_mats.dtype

        # Transform to f2q frame and extract 1-DoF angles (axis-angle projection).
        local_rot_f2q = torch.matmul(self._rot_offsets_f2q.to(device=device, dtype=dtype), local_rot_mats)
        hinge_rots = local_rot_f2q[:, :, self._mujoco_indices_to_kimodo_indices, :, :]
        axis_f2q = self._mujoco_joint_axis_values_f2q_space.to(device=device, dtype=dtype)
        joint_dofs = self._local_rots_to_joint_dofs_axis_angle(hinge_rots, axis_f2q)

        # Optionally express angles relative to rest (MuJoCo q=0 at T-pose).
        if mujoco_rest_zero:
            rest_dofs = self._rest_dofs_axis_angle.to(device=device, dtype=dtype)
            angles = joint_dofs - rest_dofs[None, None, :]
            use_relative = True
        else:
            angles = joint_dofs
            use_relative = False

        if clamp_to_limits:
            if mujoco_rest_zero:
                angles = self._clamp_to_limits(angles)
            else:
                rest_dofs_aa = self._rest_dofs_axis_angle.to(device=device, dtype=dtype)
                angles = self._clamp_joint_dofs(angles, rest_dofs_aa)

        # Reconstruct 3D local rotations from 1-DoF angles and run FK.
        local_rot_mats_proj = self._joint_dofs_to_local_rot_mats(
            angles, local_rot_mats, device, dtype, use_relative=use_relative
        )
        global_rot_mats, posed_joints, _ = self.skeleton.fk(local_rot_mats_proj, root_positions)
        return {
            "local_rot_mats": local_rot_mats_proj,
            "global_rot_mats": global_rot_mats,
            "posed_joints": posed_joints,
            "root_positions": root_positions,
        }

    @ensure_batched(local_rot_mats=5, root_positions=3, lengths=1)
    def to_qpos(
        self,
        local_rot_mats: torch.Tensor,
        root_positions: torch.Tensor,
        root_quat_w_first: bool = True,
        mujoco_rest_zero: bool = False,
    ) -> torch.Tensor:
        """Fast batch conversion from kimodo features to mujoco qpos format.

        Args:
            local_rot_mats: (B, T, J, 3, 3) local rotation matrices (kimodo convention).
            root_positions: (B, T, 3) root positions.
            root_quat_w_first: If True, quaternion in qpos is (w,x,y,z).
            mujoco_rest_zero: If True, joint angles are written so that kimodo rest (t-pose)
                maps to q=0 in MuJoCo. If False, write raw joint_dofs.

        Returns:
            torch.Tensor of shape [batch, numFrames, 36] containing mujoco qpos data:
            - root_trans (3) + root_quat (4) + joint_dofs (29) = 36 columns
        """
        batch_size, num_frames, nb_joints = local_rot_mats.shape[:3]
        device, dtype = local_rot_mats.device, local_rot_mats.dtype
|
| 506 |
+
|
| 507 |
+
local_rot_mats = torch.matmul(self._rot_offsets_f2q.to(device), local_rot_mats)
|
| 508 |
+
|
| 509 |
+
batch_size, num_frames = root_positions.shape[0], root_positions.shape[1]
|
| 510 |
+
|
| 511 |
+
# Move precomputed matrices to the same device/dtype
|
| 512 |
+
kimodo_to_mujoco_matrix = self.kimodo_to_mujoco_matrix.to(device=device, dtype=dtype)
|
| 513 |
+
|
| 514 |
+
# Initialize output tensor: [batch, numFrames, 36]
|
| 515 |
+
qpos = torch.zeros((batch_size, num_frames, 36), dtype=dtype, device=device)
|
| 516 |
+
|
| 517 |
+
# Convert root translation: apply coordinate transformation
|
| 518 |
+
root_positions_mujoco = torch.matmul(kimodo_to_mujoco_matrix[None, None, ...], root_positions[..., None])
|
| 519 |
+
qpos[:, :, :3] = root_positions_mujoco.view(batch_size, num_frames, 3)
|
| 520 |
+
|
| 521 |
+
# Convert root rotation: apply coordinate transformation to rotation matrix
|
| 522 |
+
root_rot = local_rot_mats[:, :, 0, :] # [batch, numFrames, 3, 3]
|
| 523 |
+
|
| 524 |
+
# Apply coordinate transformation: R_mujoco = kimodo_to_mujoco * R_kimodo * kimodo_to_mujoco^T
|
| 525 |
+
mujoco_to_kimodo_matrix = kimodo_to_mujoco_matrix.T
|
| 526 |
+
root_rot_mujoco = torch.matmul(
|
| 527 |
+
torch.matmul(kimodo_to_mujoco_matrix[None, None, ...], root_rot),
|
| 528 |
+
mujoco_to_kimodo_matrix[None, None, ...],
|
| 529 |
+
)
|
| 530 |
+
root_rot_quat = matrix_to_quaternion(root_rot_mujoco) # [w, x, y, z]
|
| 531 |
+
if root_quat_w_first:
|
| 532 |
+
qpos[:, :, 3:7] = root_rot_quat[:, :, [0, 1, 2, 3]] # [w, x, y, z]
|
| 533 |
+
else:
|
| 534 |
+
qpos[:, :, 3:7] = root_rot_quat[:, :, [1, 2, 3, 0]] # [w, x, y, z] -> [x, y, z, w]
|
| 535 |
+
|
| 536 |
+
# Joint DOFs: raw angles or relative to rest (rest = q=0 in MuJoCo).
|
| 537 |
+
joint_rot_f2q = local_rot_mats[:, :, self._mujoco_indices_to_kimodo_indices, :, :]
|
| 538 |
+
joint_dofs = self._local_rots_f2q_to_joint_dofs(joint_rot_f2q)
|
| 539 |
+
if mujoco_rest_zero:
|
| 540 |
+
rest_dofs = self._rest_dofs.to(device=device, dtype=dtype)
|
| 541 |
+
qpos[:, :, 7:] = joint_dofs - rest_dofs[None, None, :]
|
| 542 |
+
else:
|
| 543 |
+
qpos[:, :, 7:] = joint_dofs
|
| 544 |
+
return qpos
|
| 545 |
+
|
| 546 |
+
|
| 547 |
+
def apply_g1_real_robot_projection(
|
| 548 |
+
skeleton: G1Skeleton34,
|
| 549 |
+
joints_pos: torch.Tensor,
|
| 550 |
+
joints_rot: torch.Tensor,
|
| 551 |
+
clamp_to_limits: bool = True,
|
| 552 |
+
) -> tuple[torch.Tensor, torch.Tensor]:
|
| 553 |
+
"""Project G1 motion to real robot DoF (1-DoF per joint) with optional axis limits.
|
| 554 |
+
|
| 555 |
+
Extracts a single angle per hinge along its axis (1-DoF), optionally clamps to
|
| 556 |
+
joint limits from the MuJoCo XML (when clamp_to_limits=True), then reconstructs
|
| 557 |
+
3D rotations and runs FK. T-pose (identity local rotations) is preserved.
|
| 558 |
+
|
| 559 |
+
Args:
|
| 560 |
+
skeleton: G1 skeleton instance.
|
| 561 |
+
joints_pos: (T, J, 3) or (B, T, J, 3) joint positions in global space.
|
| 562 |
+
joints_rot: (T, J, 3, 3) or (B, T, J, 3, 3) global rotation matrices.
|
| 563 |
+
clamp_to_limits: If True, clamp joint angles to XML axis limits (default True).
|
| 564 |
+
|
| 565 |
+
Returns:
|
| 566 |
+
(posed_joints, global_rot_mats) as tensors, same shape as inputs (batch preserved).
|
| 567 |
+
"""
|
| 568 |
+
|
| 569 |
+
local_rot_mats = global_rots_to_local_rots(joints_rot, skeleton)
|
| 570 |
+
root_positions = joints_pos[..., skeleton.root_idx, :]
|
| 571 |
+
|
| 572 |
+
# Converter expects batch dim (B, T, ...); add and remove if single sequence.
|
| 573 |
+
single_sequence = local_rot_mats.dim() == 4
|
| 574 |
+
if single_sequence:
|
| 575 |
+
local_rot_mats = local_rot_mats.unsqueeze(0)
|
| 576 |
+
root_positions = root_positions.unsqueeze(0)
|
| 577 |
+
|
| 578 |
+
converter = MujocoQposConverter(skeleton)
|
| 579 |
+
projected = converter.project_to_real_robot_rotations(
|
| 580 |
+
local_rot_mats, root_positions, clamp_to_limits=clamp_to_limits
|
| 581 |
+
)
|
| 582 |
+
|
| 583 |
+
out_pos = projected["posed_joints"]
|
| 584 |
+
out_rot = projected["global_rot_mats"]
|
| 585 |
+
if single_sequence:
|
| 586 |
+
out_pos = out_pos.squeeze(0)
|
| 587 |
+
out_rot = out_rot.squeeze(0)
|
| 588 |
+
return out_pos, out_rot
|
kimodo/exports/smplx.py
ADDED
@@ -0,0 +1,251 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Convert kimodo motion to AMASS/SMPL-X compatible parameters (axis-angle, Y-up or Z-up)."""

import os
from typing import Optional

import einops
import numpy as np
import torch

from kimodo.assets import skeleton_asset_path
from kimodo.geometry import axis_angle_to_matrix, matrix_to_axis_angle
from kimodo.tools import ensure_batched, to_numpy, to_torch


def kimodo_y_up_to_amass_coord_rotation_matrix() -> np.ndarray:
    """3x3 rotation mapping Kimodo Y-up (+Z forward) to AMASS Z-up (+Y forward).

    Used by :func:`get_amass_parameters` and :func:`amass_arrays_to_kimodo_motion` (inverse).
    """
    y_up_to_z_up = np.array(
        [
            [1.0, 0.0, 0.0],
            [0.0, 0.0, -1.0],
            [0.0, 1.0, 0.0],
        ],
        dtype=np.float32,
    )
    rot_z_180 = np.array(
        [
            [-1.0, 0.0, 0.0],
            [0.0, -1.0, 0.0],
            [0.0, 0.0, 1.0],
        ],
        dtype=np.float32,
    )
    return np.matmul(rot_z_180, y_up_to_z_up).astype(np.float32)


@ensure_batched(local_rot_mats=5, root_positions=3, lengths=1)
def get_amass_parameters(
    local_rot_mats,
    root_positions,
    skeleton,
    z_up=True,
):
    """Convert local rot mats and root positions to AMASS-style trans and pose_body; optional z_up
    coordinate transform.

    Our method generates motions with Y-up and +Z forward; if z_up=True, transform to Z-up and +Y
    forward as in AMASS.
    """
    # Remove the root offset; SMPL-X FK adds pelvis offset back.
    pelvis_offset = skeleton.neutral_joints[skeleton.root_idx].cpu().numpy()
    trans = root_positions - pelvis_offset

    root_rot_mats = to_numpy(local_rot_mats[:, :, 0])
    local_rot_axis_angle = to_numpy(matrix_to_axis_angle(to_torch(local_rot_mats)))
    pose_body = einops.rearrange(local_rot_axis_angle[:, :, 1:], "b t j d -> b t (j d)")

    # Optionally convert from Y-up to Z-up coordinates.
    if z_up:
        y_up_to_z_up = kimodo_y_up_to_amass_coord_rotation_matrix()
        root_rot_mats = np.matmul(y_up_to_z_up, root_rot_mats)
        trans = np.matmul(trans + pelvis_offset, y_up_to_z_up.T) - pelvis_offset

    root_orient = to_numpy(matrix_to_axis_angle(to_torch(root_rot_mats)))
    return trans, root_orient, pose_body


def amass_arrays_to_kimodo_motion(
    trans: np.ndarray,
    root_orient: np.ndarray,
    pose_body: np.ndarray,
    skeleton,
    source_fps: float,
    *,
    z_up: bool = True,
):
    """Inverse of :func:`get_amass_parameters` for a single sequence (AMASS → Kimodo motion dict).

    Args:
        trans: ``(T, 3)`` AMASS root translation (same as ``trans`` in AMASS NPZ).
        root_orient: ``(T, 3)`` axis-angle root orientation in AMASS coordinates (z-up when ``z_up``).
        pose_body: ``(T, 63)`` body pose axis-angle (21 joints × 3).
        skeleton: :class:`~kimodo.skeleton.definitions.SMPLXSkeleton22` instance.
        source_fps: Source frame rate (Hz) of the AMASS recording.
        z_up: If ``True``, invert the same Y-up↔Z-up transform as ``get_amass_parameters(..., z_up=True)``.

    Returns:
        Motion dict compatible with :func:`kimodo.exports.motion_io.save_kimodo_npz`.
    """
    from kimodo.exports.motion_io import complete_motion_dict

    trans = np.asarray(trans, dtype=np.float32)
    root_orient = np.asarray(root_orient, dtype=np.float32)
    pose_body = np.asarray(pose_body, dtype=np.float32)
    if trans.ndim != 2 or trans.shape[-1] != 3:
        raise ValueError(f"trans must be (T, 3); got {trans.shape}")
    if root_orient.shape != trans.shape:
        raise ValueError(f"root_orient shape {root_orient.shape} must match trans {trans.shape}")
    t = trans.shape[0]
    if pose_body.shape != (t, 63):
        raise ValueError(f"pose_body must be (T, 63); got {pose_body.shape}")

    pelvis_offset = skeleton.neutral_joints[skeleton.root_idx].detach().cpu().numpy().astype(np.float32)
    device = skeleton.neutral_joints.device
    dtype = torch.float32

    Y_np = kimodo_y_up_to_amass_coord_rotation_matrix()
    if z_up:
        y_up_to_z_up = torch.from_numpy(Y_np).to(device=device, dtype=dtype)
        # trans_amass = root_kimodo @ Y.T - pelvis_offset  =>  root_kimodo = (trans_amass + pelvis_offset) @ Y
        root_positions_np = (trans + pelvis_offset) @ Y_np
    else:
        root_positions_np = trans + pelvis_offset

    root_positions = torch.from_numpy(root_positions_np).to(device=device, dtype=dtype)

    R_amass_root = axis_angle_to_matrix(torch.from_numpy(root_orient).to(device=device, dtype=dtype))
    if z_up:
        R_kimodo_root = torch.einsum("ij,tjk->tik", y_up_to_z_up.T, R_amass_root)
    else:
        R_kimodo_root = R_amass_root

    nb = skeleton.nbjoints
    if nb != 22:
        raise ValueError(f"Expected SMPL-X body skeleton with 22 joints; got {nb}")

    local_rot_mats = torch.zeros((t, nb, 3, 3), device=device, dtype=dtype)
    local_rot_mats[:, 0] = R_kimodo_root

    pose_aa = torch.from_numpy(pose_body.reshape(t, 21, 3)).to(device=device, dtype=dtype)
    local_rot_mats[:, 1:] = axis_angle_to_matrix(pose_aa.reshape(-1, 3)).reshape(t, 21, 3, 3)

    return complete_motion_dict(local_rot_mats, root_positions, skeleton, source_fps)


def amass_npz_to_kimodo_motion(npz_path: str, skeleton, source_fps: Optional[float] = None, *, z_up: bool = True):
    """Load an AMASS-style ``.npz`` and return a Kimodo motion dict.

    Args:
        npz_path: Path to AMASS NPZ (``trans``, ``root_orient``, ``pose_body``, ...).
        skeleton: SMPL-X skeleton instance.
        source_fps: Source frame rate (Hz); if ``None``, uses ``mocap_frame_rate``
            from the file when present, else ``30.0``.
        z_up: Same meaning as :func:`amass_arrays_to_kimodo_motion`.
    """
    with np.load(npz_path, allow_pickle=True) as data:
        trans = np.asarray(data["trans"], dtype=np.float32)
        root_orient = np.asarray(data["root_orient"], dtype=np.float32)
        pose_body = np.asarray(data["pose_body"], dtype=np.float32)
        if source_fps is None:
            source_fps = float(data["mocap_frame_rate"]) if "mocap_frame_rate" in data.files else 30.0

    return amass_arrays_to_kimodo_motion(trans, root_orient, pose_body, skeleton, source_fps, z_up=z_up)


class AMASSConverter:
    def __init__(
        self,
        fps,
        skeleton,
        beta_path=str(skeleton_asset_path("smplx22", "beta.npy")),
        mean_hands_path=str(skeleton_asset_path("smplx22", "mean_hands.npy")),
    ):
        self.fps = fps
        self.skeleton = skeleton
        # Load betas
        if os.path.exists(beta_path):
            # only use first 16 betas to match AMASS
            betas = np.load(beta_path)[:16]
        else:
            betas = np.zeros(16)

        # Load mean hands
        if os.path.exists(mean_hands_path):
            mean_hands = np.load(mean_hands_path)
        else:
            mean_hands = np.zeros(90)

        self.default_frame_params = {
            "pose_jaw": np.zeros(3),
            "pose_eye": np.zeros(6),
            "pose_hand": mean_hands,
        }
        self.output_dict_base = {
            "gender": "neutral",
            "surface_model_type": "smplx",
            "betas": betas,
            "num_betas": len(betas),
            "mocap_frame_rate": float(fps),
        }

    def convert_save_npz(self, output: dict, npz_path, z_up=True):
        trans, root_orient, pose_body = get_amass_parameters(
            output["local_rot_mats"],
            output["root_positions"],
            self.skeleton,
            z_up=z_up,
        )
        nb_frames = trans.shape[-2]

        amass_output_base = self.output_dict_base.copy()
        for key, val in self.default_frame_params.items():
            amass_output_base[key] = einops.repeat(val, "d -> t d", t=nb_frames)

        amass_output_base["mocap_time_length"] = nb_frames / self.fps
        self.save_npz(trans, root_orient, pose_body, amass_output_base, npz_path)

    def save_npz(self, trans, root_orient, pose_body, base_output, npz_path):
        shape = trans.shape
        if len(shape) == 3 and shape[0] == 1:
            # if only one motion, squeeze the data
            trans = trans[0]
            root_orient = root_orient[0]
            pose_body = pose_body[0]
            shape = trans.shape
        if len(shape) == 2:
            amass_output = {
                "trans": trans,
                "root_orient": root_orient,
                "pose_body": pose_body,
            } | base_output
            np.savez(npz_path, **amass_output)

        elif len(shape) == 3:
            # real batch of motions
            npz_path_base, ext = os.path.splitext(npz_path)
            for i in range(shape[0]):
                npz_path_i = npz_path_base + "_" + str(i).zfill(2) + ext
                self.save_npz(trans[i], root_orient[i], pose_body[i], base_output, npz_path_i)


# Reference AMASS NPZ layout:
# amass_output = {
#     "gender": "neutral",
#     "surface_model_type": "smplx",
#     "mocap_frame_rate": float(fps),
#     "mocap_time_length": len(motion) / float(fps),
#     "trans": trans,
#     "betas": betas,
#     "num_betas": len(betas),
#     "root_orient": np.array([T, 3]),  # axis angle
#     "pose_body": np.array([T, 63]),  # 63=21*3, axis angle, 21 = 22 - root
#     "pose_hand": np.array([T, 90]),  # 90=30*3=15*2*3 axis angle (load from mean_hands)
#     "pose_jaw": np.array([T, 3]),  # all zeros is fine
#     "pose_eye": np.array([T, 6]),  # all zeros is fine
# }
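
As a quick sanity check on the convention encoded by `kimodo_y_up_to_amass_coord_rotation_matrix`: it should be a proper rotation that sends Kimodo's +Z forward to AMASS's +Y forward and +Y up to +Z up. This check is self-contained (numpy plus this module) and follows directly from the matrix definition above:

import numpy as np

M = kimodo_y_up_to_amass_coord_rotation_matrix()
assert np.allclose(M @ M.T, np.eye(3), atol=1e-6)                   # orthonormal
assert np.isclose(np.linalg.det(M), 1.0)                            # proper rotation
assert np.allclose(M @ np.array([0.0, 0.0, 1.0]), [0.0, 1.0, 0.0])  # +Z forward -> +Y forward
assert np.allclose(M @ np.array([0.0, 1.0, 0.0]), [0.0, 0.0, 1.0])  # +Y up -> +Z up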
kimodo/geometry.py
ADDED
@@ -0,0 +1,216 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Rotation and representation conversions: axis-angle, quaternion, matrix, 6D continuous."""

import torch
import torch.nn.functional as F


def angle_to_Y_rotation_matrix(angle: torch.Tensor) -> torch.Tensor:
    """Build a rotation matrix around the Y axis from a scalar angle (radians).

    Shape: angle.shape + (3, 3).
    """
    cos, sin = torch.cos(angle), torch.sin(angle)
    one, zero = torch.ones_like(angle), torch.zeros_like(angle)
    mat = torch.stack((cos, zero, sin, zero, one, zero, -sin, zero, cos), -1)
    mat = mat.reshape(angle.shape + (3, 3))
    return mat


def matrix_to_cont6d(matrix: torch.Tensor) -> torch.Tensor:
    """Convert rotation matrix to 6D continuous representation (first two columns).

    Shape: (..., 3, 3) -> (..., 6).
    """
    cont_6d = torch.concat([matrix[..., 0], matrix[..., 1]], dim=-1)
    return cont_6d


def cont6d_to_matrix(cont6d: torch.Tensor) -> torch.Tensor:
    """Convert 6D continuous representation to rotation matrix (Gram–Schmidt on two columns).

    Last dim must be 6.
    """
    assert cont6d.shape[-1] == 6, "The last dimension must be 6"
    x_raw = cont6d[..., 0:3]
    y_raw = cont6d[..., 3:6]

    x = x_raw / torch.norm(x_raw, dim=-1, keepdim=True)
    z = torch.cross(x, y_raw, dim=-1)
    z = z / torch.norm(z, dim=-1, keepdim=True)

    y = torch.cross(z, x, dim=-1)

    x = x[..., None]
    y = y[..., None]
    z = z[..., None]

    mat = torch.cat([x, y, z], dim=-1)
    return mat


def axis_angle_to_matrix(axis_angle: torch.Tensor) -> torch.Tensor:
    """Convert axis-angle to rotation matrix.

    Args:
        axis_angle: (..., 3) axis-angle vectors (angle = norm, axis = normalized)
    Returns:
        rotmat: (..., 3, 3) rotation matrices
    """
    eps = 1e-6
    angle = torch.norm(axis_angle, dim=-1, keepdim=True)  # (..., 1)
    axis = axis_angle / (angle + eps)

    x, y, z = axis.unbind(-1)

    zero = torch.zeros_like(x)
    K = torch.stack([zero, -z, y, z, zero, -x, -y, x, zero], dim=-1).reshape(*axis.shape[:-1], 3, 3)

    eye = torch.eye(3, device=axis.device, dtype=axis.dtype)
    eye = eye.expand(*axis.shape[:-1], 3, 3)

    sin = torch.sin(angle)[..., None]
    cos = torch.cos(angle)[..., None]

    R = eye + sin * K + (1 - cos) * (K @ K)
    return R


def matrix_to_axis_angle(R: torch.Tensor) -> torch.Tensor:
    """Convert rotation matrix to axis-angle via quaternions (more numerically stable).

    Args:
        R: (..., 3, 3) rotation matrices
    Returns:
        axis_angle: (..., 3)
    """
    # Go through quaternions for numerical stability
    quat = matrix_to_quaternion(R)  # (..., 4) with (w, x, y, z)
    return quaternion_to_axis_angle(quat)


def quaternion_to_axis_angle(quat: torch.Tensor) -> torch.Tensor:
    """Convert quaternion to axis-angle representation.

    Args:
        quat: (..., 4) quaternions with real part first (w, x, y, z)
    Returns:
        axis_angle: (..., 3)
    """
    eps = 1e-6

    # Ensure canonical form to avoid sign ambiguity.
    # Primary: prefer w > 0. When w ≈ 0 (angle ≈ π), prefer first nonzero xyz > 0.
    w = quat[..., 0:1]
    xyz = quat[..., 1:]

    # Find first significant component of xyz for tie-breaking when w ≈ 0
    first_significant = xyz[..., 0:1]  # use x component as tie-breaker

    # Flip if: w < 0, OR (w ≈ 0 AND first xyz component < 0)
    should_flip = (w < -eps) | ((w.abs() <= eps) & (first_significant < 0))
    quat = torch.where(should_flip, -quat, quat)

    w = quat[..., 0]
    xyz = quat[..., 1:]

    # sin(angle/2) = ||xyz||
    sin_half_angle = xyz.norm(dim=-1)

    # angle = 2 * atan2(sin(angle/2), cos(angle/2))
    # This is more stable than 2 * acos(w) near angle=0
    angle = 2.0 * torch.atan2(sin_half_angle, w)

    # axis = xyz / sin(angle/2), but handle small angles
    # For small angles: axis-angle ≈ 2 * xyz (since sin(x) ≈ x for small x)
    small_angle = sin_half_angle.abs() < eps

    # Safe division
    scale = torch.where(
        small_angle,
        2.0 * torch.ones_like(angle),  # small angle: axis_angle ≈ 2 * xyz
        angle / sin_half_angle.clamp(min=eps),
    )

    return xyz * scale.unsqueeze(-1)


def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
    """Returns torch.sqrt(torch.max(0, x)); the subgradient is zero where x is 0."""
    return torch.sqrt(x * (x > 0).to(x.dtype))


def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor:
    """Convert rotations given as rotation matrices to quaternions.

    Args:
        matrix: Rotation matrices as tensor of shape (..., 3, 3).
    Returns:
        quaternions with real part first, as tensor of shape (..., 4).
    """
    if matrix.size(-1) != 3 or matrix.size(-2) != 3:
        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")

    batch_dim = matrix.shape[:-2]
    m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(matrix.reshape(batch_dim + (9,)), dim=-1)

    q_abs = _sqrt_positive_part(
        torch.stack(
            [
                1.0 + m00 + m11 + m22,
                1.0 + m00 - m11 - m22,
                1.0 - m00 + m11 - m22,
                1.0 - m00 - m11 + m22,
            ],
            dim=-1,
        )
    )

    quat_by_rijk = torch.stack(
        [
            torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
            torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
            torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
            torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
        ],
        dim=-2,
    )

    flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
    quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))

    return (
        (F.one_hot(q_abs.argmax(dim=-1), num_classes=4)[..., None] * quat_candidates)
        .sum(dim=-2)
        .reshape(batch_dim + (4,))
    )


def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor:
    """Convert rotations given as quaternions to rotation matrices.

    Args:
        quaternions: quaternions with real part first,
            as tensor of shape (..., 4).
    Returns:
        Rotation matrices as tensor of shape (..., 3, 3).
    """
    r, i, j, k = torch.unbind(quaternions, -1)
    two_s = 2.0 / (quaternions * quaternions).sum(-1)

    o = torch.stack(
        (
            1 - two_s * (j * j + k * k),
            two_s * (i * j - k * r),
            two_s * (i * k + j * r),
            two_s * (i * j + k * r),
            1 - two_s * (i * i + k * k),
            two_s * (j * k - i * r),
            two_s * (i * k - j * r),
            two_s * (j * k + i * r),
            1 - two_s * (i * i + j * j),
        ),
        -1,
    )
    return o.reshape(quaternions.shape[:-1] + (3, 3))
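
A round-trip self-check of the conversions above; it is runnable as-is with this module imported. Axis-angle vectors are only unique up to 2π wrapping and a sign flip at π, so rotations are compared as matrices rather than as vectors:

import torch

aa = torch.randn(1024, 3)  # random axis-angle vectors, arbitrary magnitudes
R = axis_angle_to_matrix(aa)

# matrix -> axis-angle -> matrix should reproduce the rotation
assert torch.allclose(axis_angle_to_matrix(matrix_to_axis_angle(R)), R, atol=1e-4)
# matrix -> quaternion -> matrix
assert torch.allclose(quaternion_to_matrix(matrix_to_quaternion(R)), R, atol=1e-4)
# matrix -> 6D -> matrix (Gram–Schmidt is exact on true rotation matrices)
assert torch.allclose(cont6d_to_matrix(matrix_to_cont6d(R)), R, atol=1e-4)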
kimodo/meta.py
ADDED
@@ -0,0 +1,80 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Parse and normalize prompt text/duration data from meta dicts."""

import os
from typing import Any, Optional

from kimodo.tools import load_json

from .sanitize import sanitize_text, sanitize_texts


def load_prompts_from_meta(meta_path: str, **kwargs):
    """Load prompts from a meta dict or file. If fps is provided, the durations are converted to
    frames.

    Args:
        meta_path: Path to the meta file.
        **kwargs: Additional arguments to pass to parse_prompts_from_meta.

    Returns:
        texts: List of texts.
        durations: List of durations in seconds or frames.
    """
    if not os.path.exists(meta_path):
        raise FileNotFoundError(f"meta.json not found in input folder: {meta_path}")

    meta = load_json(meta_path)
    return parse_prompts_from_meta(meta, **kwargs)


def parse_prompts_from_meta(
    meta: dict[str, Any],
    fps: Optional[float] = None,
    sanitize: bool = False,
) -> tuple[list[str], list[float]]:
    """Parse prompt texts and durations from a meta dict into normalized lists. If fps is provided,
    the durations are converted to frames.

    Accepts either:
    - Single prompt: "text" (str) and "duration" (float) in seconds.
    - Multiple prompts: "texts" (list of str) and "durations" (list of float) in seconds.

    Returns:
        (texts, durations): texts as list of str, durations as list of float (seconds or frames).
        Lengths of both lists are equal.

    Raises:
        ValueError: If meta does not contain a recognized format.
    """
    # Single prompt
    if "text" in meta and "duration" in meta:
        text = meta["text"]
        duration = float(meta["duration"])
        if fps is not None:
            duration = int(duration * fps)
        if isinstance(text, list):
            raise ValueError("meta has 'text' but it is a list; use 'texts' for multiple prompts")

        if sanitize:
            text = sanitize_text(text)
        return ([text], [duration])

    # Multiple prompts
    if "texts" in meta and "durations" in meta:
        texts = meta["texts"]
        durations = meta["durations"]
        if not isinstance(texts, list) or not isinstance(durations, list):
            raise ValueError("meta 'texts' and 'durations' must be lists")
        if len(texts) != len(durations):
            raise ValueError(f"meta 'texts' and 'durations' length mismatch: {len(texts)} vs {len(durations)}")
        durations = [float(d) for d in durations]
        if fps is not None:
            durations = [int(d * fps) for d in durations]

        if sanitize:
            texts = sanitize_texts(texts)
        return texts, durations

    raise ValueError("meta must contain either 'text' and 'duration', or 'texts' and 'durations'.")
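
For reference, the two accepted meta layouts and what `parse_prompts_from_meta` returns for them (values are illustrative):

single = {"text": "a person walks forward", "duration": 4.0}
multi = {"texts": ["walk forward", "turn left"], "durations": [4.0, 2.5]}

parse_prompts_from_meta(single, fps=30)  # (["a person walks forward"], [120]), durations in frames
parse_prompts_from_meta(multi)           # (["walk forward", "turn left"], [4.0, 2.5]), seconds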
kimodo/metrics/__init__.py
ADDED
@@ -0,0 +1,39 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Evaluation metrics for motion quality (foot skate, contact consistency, constraint following)."""

from .base import (
    Metric,
    aggregate_metrics,
    clear_metrics,
    compute_metrics,
)
from .constraints import ContraintFollow
from .foot_skate import (
    FootContactConsistency,
    FootSkateFromContacts,
    FootSkateFromHeight,
    FootSkateRatio,
)
from .tmr import (
    TMR_EmbeddingMetric,
    TMR_Metric,
    compute_tmr_per_sample_retrieval,
    compute_tmr_retrieval_metrics,
)

__all__ = [
    "Metric",
    "ContraintFollow",
    "FootContactConsistency",
    "FootSkateFromContacts",
    "FootSkateFromHeight",
    "FootSkateRatio",
    "TMR_EmbeddingMetric",
    "TMR_Metric",
    "aggregate_metrics",
    "clear_metrics",
    "compute_metrics",
    "compute_tmr_per_sample_retrieval",
    "compute_tmr_retrieval_metrics",
]
kimodo/metrics/base.py
ADDED
@@ -0,0 +1,66 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Base metric class and batch/aggregate helpers."""

from __future__ import annotations

from collections import defaultdict
from typing import Dict, List

import torch


class Metric:
    """Base class for metrics that accumulate results over multiple __call__ and expose
    aggregate()."""

    def __init__(self, **kwargs):
        self.clear()

    def __call__(self, *args, **kwargs):
        """Compute metric for current batch, append to saved_metrics, and return the batch
        result."""
        metrics = self._compute(*args, **kwargs)
        for key, val in metrics.items():
            self.saved_metrics[key].append(val.detach().cpu().float())
        return metrics

    def _compute(self, **kwargs):
        """Subclasses implement this to compute metric dict from batch inputs."""
        raise NotImplementedError()

    def clear(self):
        """Reset all accumulated metric values."""
        self.saved_metrics = defaultdict(list)

    def aggregate(self):
        """Return a dict of concatenated/stacked tensors over all accumulated batches."""
        output = {}
        for key, lst in self.saved_metrics.items():
            try:
                output[key] = torch.cat(lst)
            except RuntimeError:
                output[key] = torch.stack(lst)
        return output


def compute_metrics(metrics_list: List[Metric], metrics_in: Dict) -> Dict:
    """Run each metric on metrics_in and return the combined dict of batch results."""
    metrics_out = {}
    for metric in metrics_list:
        metrics_out.update(metric(**metrics_in))
    return metrics_out


def aggregate_metrics(metrics_list: List[Metric]) -> Dict:
    """Return combined aggregated results (concatenated over batches) for all metrics."""
    metrics_out = {}
    for metric in metrics_list:
        metrics_out.update(metric.aggregate())
    return metrics_out


def clear_metrics(metrics_list: List[Metric]) -> None:
    """Clear accumulated values for all metrics in the list."""
    for metric in metrics_list:
        metric.clear()
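
To illustrate the `Metric` contract (return a dict of per-sample tensors from `_compute`; accumulation and aggregation come from the base class), here is a hypothetical subclass that is not part of the package. It uses only this module's imports:

# Hypothetical example, not in the package: a per-sample mean-jerk metric.
class MeanJerk(Metric):
    """Hypothetical example: mean jerk magnitude per sample."""

    def __init__(self, fps: float, **kwargs):
        super().__init__(**kwargs)
        self.fps = fps

    def _compute(self, posed_joints: torch.Tensor, **kwargs) -> Dict:
        # posed_joints: (B, T, J, 3); third finite difference approximates jerk.
        jerk = torch.diff(posed_joints, n=3, dim=1) * self.fps**3
        return {"mean_jerk": jerk.norm(dim=-1).mean(dim=(1, 2))}  # (B,)

m = MeanJerk(fps=30.0)
m(posed_joints=torch.randn(2, 64, 22, 3))  # one accumulated batch
assert m.aggregate()["mean_jerk"].shape == (2,)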
kimodo/metrics/constraints.py
ADDED
@@ -0,0 +1,87 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Constraint-following metrics."""

from __future__ import annotations

from collections import defaultdict
from typing import Dict, List, Optional

import torch
from torch import Tensor

from kimodo.constraints import (
    EndEffectorConstraintSet,
    FullBodyConstraintSet,
    Root2DConstraintSet,
)
from kimodo.tools import ensure_batched

from .base import Metric


class ContraintFollow(Metric):
    """Constraint-following metric dispatcher for kimodo constraint sets."""

    def __init__(
        self,
        skeleton,
        root_threshold: float = 0.10,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.skeleton = skeleton
        self.root_threshold = root_threshold

    @ensure_batched(posed_joints=4, constraints_lst=2, lengths=1)
    def _compute(
        self,
        posed_joints: Tensor,
        constraints_lst: Optional[List],
        lengths: Optional[Tensor] = None,
        **kwargs,
    ) -> Dict:
        if not constraints_lst:
            return {}

        root_idx = self.skeleton.root_idx
        output = defaultdict(list)

        for posed_joints_s, constraint_lst_s, lengths_s in zip(posed_joints, constraints_lst, lengths):
            output_seq = defaultdict(list)
            for constraint in constraint_lst_s:
                frame_idx = constraint.frame_indices.to(device=posed_joints_s.device, dtype=torch.long)
                if frame_idx.numel() == 0:
                    continue
                assert frame_idx.max() < lengths_s, "The constraint is defined outside the length of the motion."

                if isinstance(constraint, Root2DConstraintSet):
                    pred_root2d = posed_joints_s[frame_idx, root_idx][:, [0, 2]]
                    target = constraint.smooth_root_2d.to(posed_joints_s.device)

                    dist = torch.norm(pred_root2d - target, dim=-1)
                    output_seq["constraint_root2d_err"].append(dist)
                    hit = (dist <= self.root_threshold).float()
                    output_seq["constraint_root2d_acc"].append(hit)

                elif isinstance(constraint, FullBodyConstraintSet):
                    pred = posed_joints_s[frame_idx]
                    target = constraint.global_joints_positions.to(posed_joints_s.device)
                    err = torch.norm(pred - target, dim=-1)
                    output_seq["constraint_fullbody_keyframe"].append(err)

                elif isinstance(constraint, EndEffectorConstraintSet):
                    pos_idx = constraint.pos_indices.to(device=posed_joints_s.device, dtype=torch.long)
                    pred = posed_joints_s[frame_idx].index_select(1, pos_idx)
                    target = constraint.global_joints_positions.to(posed_joints_s.device).index_select(1, pos_idx)
                    err = torch.norm(pred - target, dim=-1)
                    output_seq["constraint_end_effector"].append(err)

            # in case we have several of the same constraint type in the list
            for key, val in output_seq.items():
                output[key].append(torch.cat(val).mean())

        reduced = {}
        for key, vals in output.items():
            reduced[key] = torch.stack(vals, dim=0)
        return reduced
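
A usage sketch for the dispatcher above; the motion tensors and constraint-set objects are assumed to exist elsewhere in the pipeline and are not constructed here:

# Sketch with assumed inputs: posed_joints (B, T, J, 3), lengths (B,), and
# constraints_lst, a per-sample list of constraint-set objects.
metric = ContraintFollow(skeleton, root_threshold=0.10)
batch_out = metric(posed_joints=posed_joints, constraints_lst=constraints_lst, lengths=lengths)
# For Root2D constraints: batch_out["constraint_root2d_err"] is the mean 2D root
# error (meters) per sample; batch_out["constraint_root2d_acc"] is the fraction
# of constrained frames within root_threshold.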
kimodo/metrics/foot_skate.py
ADDED
@@ -0,0 +1,232 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Foot skate and contact consistency metrics."""

from __future__ import annotations

from typing import Dict, Optional

import torch
from torch import Tensor

from kimodo.motion_rep.feature_utils import compute_vel_xyz
from kimodo.motion_rep.feet import foot_detect_from_pos_and_vel
from kimodo.skeleton import SkeletonBase
from kimodo.tools import ensure_batched

from .base import Metric


class FootSkateFromHeight(Metric):
    """Mean velocity of the toes over frames where the toe joint is near the floor."""

    def __init__(
        self,
        skeleton: SkeletonBase,
        fps: float,
        height_thresh: float = 0.05,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.height_thresh = height_thresh
        self.skeleton = skeleton
        self.fps = fps

    @ensure_batched(posed_joints=4, lengths=1)
    def _compute(
        self,
        posed_joints: Tensor,
        lengths: Optional[Tensor] = None,
        **kwargs,
    ) -> Dict:
        fidx = self.skeleton.foot_joint_idx
        if len(fidx) != 4:
            raise ValueError("FootSkateFromHeight expects four foot joints (heel/toe per foot)")

        feet_pos = posed_joints[:, :, fidx]
        toe_pos = feet_pos[:, :, [1, 3]]

        toe_on_floor = (toe_pos[..., 1] < self.height_thresh)[:, :-1]  # y-up; [B, T-1, 2] = [left, right]

        dt = 1.0 / self.fps
        toe_vel = torch.norm(toe_pos[:, 1:] - toe_pos[:, :-1], dim=-1) / dt  # [B, nframes-1, 2]

        # compute err
        contact_toe_vel = toe_vel * toe_on_floor  # vel when corresponding toe is on ground

        # account for generated length
        # since they are velocities, use length-1 to avoid inaccurate vel going one frame past len
        device = toe_on_floor.device
        len_mask = torch.arange(toe_on_floor.shape[1], device=device)[None, :, None].expand(toe_on_floor.shape) < (
            lengths[:, None, None] - 1
        )
        toe_on_floor = toe_on_floor * len_mask
        contact_toe_vel = contact_toe_vel * len_mask

        mean_vel = torch.sum(contact_toe_vel, (1, 2)) / (torch.sum(toe_on_floor, (1, 2)) + 1e-6)
        return {"foot_skate_from_height": mean_vel}


class FootSkateFromContacts(Metric):
    """Measures velocity of the toes and ankles when predicted to be in contact."""

    def __init__(
        self,
        skeleton: SkeletonBase,
        fps: float,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.skeleton = skeleton
        self.fps = fps

    @ensure_batched(posed_joints=4, foot_contacts=3, lengths=1)
    def _compute(
        self,
        posed_joints: Tensor,
        foot_contacts: Tensor,
        lengths: Optional[Tensor] = None,
        **kwargs,
    ) -> Dict:
        fidx = self.skeleton.foot_joint_idx
        feet_pos = posed_joints[:, :, fidx]
        dt = 1.0 / self.fps
        foot_vel = torch.norm(feet_pos[:, 1:] - feet_pos[:, :-1], dim=-1) / dt

        foot_contacts = foot_contacts[:, :-1]
        vel_err = foot_vel * foot_contacts

        # account for generated length
        # since they are velocities, use length-1 to avoid inaccurate vel going one frame past len
        device = foot_contacts.device
        len_mask = torch.arange(foot_contacts.shape[1], device=device)[None, :, None].expand(foot_contacts.shape) < (
            lengths[:, None, None] - 1
        )
        foot_contacts = foot_contacts * len_mask
        vel_err = vel_err * len_mask

        mean_vel = torch.sum(vel_err, (1, 2)) / (torch.sum(foot_contacts, (1, 2)) + 1e-6)  # mean over contacting frames

        # Compute max velocity error across all feet and frames (per batch)
        max_vel = vel_err.amax(dim=(1, 2))  # [B]

        return {
            "foot_skate_from_pred_contacts": mean_vel,
            "foot_skate_max_vel": max_vel,
        }


class FootSkateRatio(Metric):
    """Compute fraction of frames where the foot skates when it is on the ground.

    Inspired by GMD: https://github.com/korrawe/guided-motion-diffusion/blob/main/data_loaders/humanml/utils/metrics.py#L204
    """

    def __init__(
        self,
        skeleton: SkeletonBase,
        fps: float,
        height_thresh=0.05,
        vel_thresh=0.2,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.height_thresh = height_thresh
        self.vel_thresh = vel_thresh

        self.skeleton = skeleton
        self.fps = fps

    @ensure_batched(posed_joints=4, foot_contacts=3, lengths=1)
    def _compute(
        self,
        posed_joints: Tensor,
        foot_contacts: Tensor,
        lengths: Optional[Tensor] = None,
        **kwargs,
    ) -> Dict:
        fidx = self.skeleton.foot_joint_idx
        assert len(fidx) == 4, "This metric assumes 4 foot joints: heel, toe, heel, toe"

        feet_pos = posed_joints[:, :, fidx]
        toe_pos = feet_pos[:, :, [1, 3]]

        toe_on_floor = toe_pos[..., 1] < self.height_thresh  # y-up; [B, T, 2] = [left, right]
        # current and next frame on floor to consider it in contact
        toe_on_floor = torch.logical_and(toe_on_floor[:, :-1], toe_on_floor[:, 1:])  # [B, T-1, 2]

        dt = 1.0 / self.fps
        toe_vel = torch.norm(toe_pos[:, 1:] - toe_pos[:, :-1], dim=-1) / dt  # [B, nframes-1, 2]

        # compute err
        contact_toe_vel = toe_vel * toe_on_floor  # vel when corresponding toe is on ground

        # account for generated length
        # since they are velocities, use length-1 to avoid inaccurate vel going one frame past len
        device = toe_on_floor.device
        len_mask = torch.arange(toe_on_floor.shape[1], device=device)[None, :, None].expand(toe_on_floor.shape) < (
            lengths[:, None, None] - 1
        )
        toe_on_floor = toe_on_floor * len_mask
        contact_toe_vel = contact_toe_vel * len_mask

        # skating if velocity during contact > thresh
        toe_skate = contact_toe_vel > self.vel_thresh
        skate_ratio = torch.sum(toe_skate, (1, 2)) / (torch.sum(toe_on_floor, (1, 2)) + 1e-6)
        return {"foot_skate_ratio": skate_ratio}


class FootContactConsistency(Metric):
    """Measures consistency between heuristically detected foot contacts (from height and velocity)
    and predicted foot contacts.

    I.e., the accuracy of the predicted contacts, treating the heuristic as ground truth.
    """

    def __init__(
        self,
        skeleton: SkeletonBase,
        fps: float,
        vel_thresh: float = 0.15,
        height_thresh: float = 0.10,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vel_thresh = vel_thresh
        self.height_thresh = height_thresh

        self.skeleton = skeleton
        self.fps = fps

    @ensure_batched(posed_joints=4, foot_contacts=3, lengths=1)
    def _compute(
        self,
        posed_joints: Tensor,
        foot_contacts: Tensor,
        lengths: Optional[Tensor] = None,
        **kwargs,
    ) -> Dict:
        velocity = compute_vel_xyz(posed_joints, float(self.fps), lengths=lengths)
        heuristic_contacts = foot_detect_from_pos_and_vel(
            posed_joints,
            velocity,
            self.skeleton,
            self.vel_thresh,
            self.height_thresh,
        )

        # compute accuracy of predicted, treating heuristic as ground truth
        num_contacts = foot_contacts.shape[-1]
        incorrect = torch.logical_xor(heuristic_contacts, foot_contacts)
        # account for generated length
        # since they are velocities, use length-1 to avoid inaccurate vel going one frame past len
        device = foot_contacts.device
        len_mask = torch.arange(foot_contacts.shape[1], device=device)[None, :, None].expand(foot_contacts.shape) < (
            lengths[:, None, None] - 1
        )
        incorrect = incorrect * len_mask

        incorrect_ratio = torch.sum(incorrect, (1, 2)) / (num_contacts * (lengths - 1))
        accuracy = 1 - incorrect_ratio

        return {"foot_contact_consistency": accuracy}
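
Finally, a sketch of running these skate metrics together through `compute_metrics` from `kimodo.metrics.base`; `skeleton`, `posed_joints`, `foot_contacts`, and `lengths` are assumed inputs (y-up with floor at y = 0, and `foot_joint_idx` ordered [heel, toe, heel, toe]):

# Sketch with assumed inputs; metric keys follow the _compute methods above.
metrics = [
    FootSkateRatio(skeleton, fps=30.0),
    FootSkateFromContacts(skeleton, fps=30.0),
    FootContactConsistency(skeleton, fps=30.0),
]
batch = dict(posed_joints=posed_joints, foot_contacts=foot_contacts, lengths=lengths)
out = compute_metrics(metrics, batch)
# Per-sample tensors, e.g. out["foot_skate_ratio"] and out["foot_contact_consistency"].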
kimodo/metrics/tmr.py
ADDED
@@ -0,0 +1,530 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""TMR evaluation metrics: text-motion retrieval, R-Precision, and related scores."""

from __future__ import annotations

from collections import defaultdict
from typing import Any, Dict, List, Optional

import numpy as np
import torch
from scipy import linalg
from torch import Tensor

from kimodo.model.tmr import TMR

from .base import Metric


# Scores are between 0 and 1
def get_score_matrix_unit(x, y):
    sim_matrix = np.einsum("b i, c i -> b c", x, y)
    scores = sim_matrix / 2 + 0.5
    return scores


def get_scores_unit(x, y):
    similarity = np.einsum("... i, ... i", x, y)
    scores = similarity / 2 + 0.5
    return scores

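Both helpers assume unit-norm embeddings: the dot product is the cosine similarity in [-1, 1], and `s / 2 + 0.5` maps it into [0, 1]. A minimal sanity check (illustrative only, not part of the commit):

import numpy as np

x = np.array([[1.0, 0.0], [0.0, 1.0]])  # two unit vectors
print(get_scores_unit(x, x))        # [1. 1.]: identical pairs score 1.0
print(get_scores_unit(x, -x))       # [0. 0.]: opposite pairs score 0.0
print(get_score_matrix_unit(x, x))  # [[1.  0.5] [0.5 1. ]]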

def compute_tmr_per_sample_retrieval(
    motion_emb: np.ndarray,
    text_emb: np.ndarray,
    sample_ids: List[str],
    texts: List[str],
    top_k: int = 5,
) -> List[Dict[str, Any]]:
    """For each sample (text query i), compute the t2m rank of motion i and the top-k retrieved
    motions with their ids and texts.

    Returns a list of dicts: [{"rank": int, "top_k": [{"id": str, "text": str}, ...]}, ...].
    """
    motion_emb = np.asarray(motion_emb).squeeze()
    text_emb = np.asarray(text_emb).squeeze()
    if motion_emb.ndim == 1:
        motion_emb = motion_emb[np.newaxis, :]
    if text_emb.ndim == 1:
        text_emb = text_emb[np.newaxis, :]
    n = motion_emb.shape[0]
    assert text_emb.shape[0] == n and len(sample_ids) == n and len(texts) == n
    scores = get_score_matrix_unit(text_emb, motion_emb)
    out: List[Dict[str, Any]] = []
    for i in range(n):
        row = np.asarray(scores[i])
        order = np.argsort(row)[::-1]
        rank = int(np.where(order == i)[0][0]) + 1
        top_indices = order[:top_k]
        top_k_list = [{"id": sample_ids[j], "text": texts[j]} for j in top_indices]
        out.append({"rank": rank, "top_k": top_k_list})
    return out

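A short usage sketch for the per-sample helper (ids and texts are made up for illustration):

import numpy as np

emb = np.eye(3)  # pretend text/motion embeddings are already unit vectors
results = compute_tmr_per_sample_retrieval(
    motion_emb=emb,
    text_emb=emb,
    sample_ids=["a", "b", "c"],
    texts=["walk", "run", "jump"],
    top_k=2,
)
print(results[0]["rank"])   # 1: each text retrieves its own motion first
print(results[0]["top_k"])  # [{'id': 'a', 'text': 'walk'}, ...]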

class TMR_Metric(Metric):
    def __init__(
        self,
        tmr_model: TMR,
        ranks: List = [1, 2, 3, 5, 10],
        ranks_rounding=2,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.tmr_model = tmr_model
        self.ranks = ranks
        self.ranks_rounding = ranks_rounding

    def clear(self):
        self.saved_metrics = defaultdict(list)
        self.saved_text_latents = []
        self.saved_motion_gen_latents = []
        self.saved_motion_gt_latents = []

    def _compute(
        self,
        motion_rep,
        pred_joints_output: Dict,
        gt_joints_output: Dict,
        text_x_dict: Dict,
        lengths: Tensor,
        **kwargs,
    ) -> Dict:
        pred_posed_joints = pred_joints_output["posed_joints"]
        original_skeleton = motion_rep.skeleton if motion_rep is not None else None
        latents_motion = self.tmr_model.encode_motion(
            pred_posed_joints,
            lengths=lengths,
            original_skeleton=original_skeleton,
            unit_vector=True,
        )
        latents_motion = latents_motion.cpu().numpy()

        if isinstance(text_x_dict, dict) and "texts" in text_x_dict:
            latents_text = self.tmr_model.encode_raw_text(text_x_dict["texts"], unit_vector=True)
        else:
            latents_text = self.tmr_model.encode_text(text_x_dict, unit_vector=True)
        if latents_text.dim() == 1:
            latents_text = latents_text.unsqueeze(0)
        latents_text = latents_text.cpu().numpy()

        self.saved_text_latents.append(latents_text)
        self.saved_motion_gen_latents.append(latents_motion)

        scores_text = get_scores_unit(latents_motion, latents_text)
        output = {"TMR/t2m_sim": scores_text}

        if gt_joints_output is not None and "posed_joints" in gt_joints_output:
            gt_posed_joints = gt_joints_output["posed_joints"]
            gt_latents_motion = self.tmr_model.encode_motion(
                gt_posed_joints,
                lengths=lengths,
                original_skeleton=original_skeleton,
                unit_vector=True,
            )
            gt_latents_motion = gt_latents_motion.cpu().numpy()
            self.saved_motion_gt_latents.append(gt_latents_motion)

            gt_scores_text = get_scores_unit(gt_latents_motion, latents_text)
            scores_motion = get_scores_unit(latents_motion, gt_latents_motion)

            output["TMR/t2m_gt_sim"] = gt_scores_text
            output["TMR/m2m_sim"] = scores_motion

        # convert everything to pytorch tensors
        for key, val in output.items():
            output[key] = torch.tensor(val)
        return output

    def aggregate(self):
        output = {}
        for key, lst in self.saved_metrics.items():
            output[key] = np.concatenate(lst)

        assert self.saved_text_latents, "Should call the metric at least once."

        text_latents = np.concatenate(self.saved_text_latents)
        motion_gen_latents = np.concatenate(self.saved_motion_gen_latents)

        batch_size = len(text_latents)
        assert text_latents.shape == motion_gen_latents.shape

        scores_t2m = get_score_matrix_unit(text_latents, motion_gen_latents)
        scores_t2t = get_score_matrix_unit(text_latents, text_latents)

        t2m_metrics = contrastive_metrics(
            scores=scores_t2m,
            scores_t2t=scores_t2t,
            threshold=0.99,
            rounding=self.ranks_rounding,
        )

        for key, val in t2m_metrics.items():
            output["TMR/t2m_R/" + key] = val

        mu_gen, cov_gen = calculate_activation_statistics(motion_gen_latents)
        mu_text, cov_text = calculate_activation_statistics(text_latents)

        fid_gen_text = calculate_frechet_distance(mu_gen, cov_gen, mu_text, cov_text)
        output["TMR/FID/gen_text"] = fid_gen_text

        if self.saved_motion_gt_latents:
            motion_gt_latents = np.concatenate(self.saved_motion_gt_latents)
            assert motion_gt_latents.shape == motion_gen_latents.shape

            scores_m2gm = get_score_matrix_unit(motion_gen_latents, motion_gt_latents)
            scores_t2gm = get_score_matrix_unit(text_latents, motion_gt_latents)

            m2gm_metrics = contrastive_metrics(
                scores=scores_m2gm,
                scores_t2t=scores_t2t,
                threshold=0.99,
                rounding=self.ranks_rounding,
            )
            for key, val in m2gm_metrics.items():
                output["TMR/m2m_R/" + key] = val

            t2gm_metrics = contrastive_metrics(
                scores=scores_t2gm,
                scores_t2t=scores_t2t,
                threshold=0.99,
                rounding=self.ranks_rounding,
            )
            for key, val in t2gm_metrics.items():
                output["TMR/t2m_gt_R/" + key] = val

            mu_gt_motion, cov_gt_motion = calculate_activation_statistics(motion_gt_latents)
            fid_gen_motion = calculate_frechet_distance(
                mu_gen,
                cov_gen,
                mu_gt_motion,
                cov_gt_motion,
            )
            output["TMR/FID/gen_gt"] = fid_gen_motion

            fid_gt_text = calculate_frechet_distance(
                mu_gt_motion,
                cov_gt_motion,
                mu_text,
                cov_text,
            )
            output["TMR/FID/gt_text"] = fid_gt_text

        for key, val in output.items():
            if isinstance(val, (int, float, np.integer, np.floating)):
                val = torch.tensor([val for _ in range(batch_size)])

            if isinstance(val, np.ndarray):
                val = torch.from_numpy(val)

            output[key] = val.cpu().float()
        return output


class TMR_EmbeddingMetric(Metric):
    """TMR metrics from precomputed motion and text embeddings (no model load).

    Use in the loop: pass motion_emb and text_emb per sample; aggregate() computes the
    retrieval metrics.
    """

    def __init__(self, ranks_rounding: int = 2, **kwargs):
        super().__init__(**kwargs)
        self.ranks_rounding = ranks_rounding

    def clear(self):
        self.saved_metrics = defaultdict(list)
        self.saved_text_latents = []
        self.saved_motion_gen_latents = []
        self.saved_motion_gt_latents = []

    def _compute(
        self,
        motion_emb=None,
        text_emb=None,
        gt_motion_emb=None,
        **kwargs,
    ) -> Dict:
        if motion_emb is None or text_emb is None:
            return {}
        motion_emb = np.asarray(motion_emb)
        text_emb = np.asarray(text_emb)
        if motion_emb.ndim == 1:
            motion_emb = motion_emb[np.newaxis, :]
        if text_emb.ndim == 1:
            text_emb = text_emb[np.newaxis, :]
        self.saved_text_latents.append(text_emb)
        self.saved_motion_gen_latents.append(motion_emb)
        if gt_motion_emb is not None:
            gt_motion_emb = np.asarray(gt_motion_emb)
            if gt_motion_emb.ndim == 1:
                gt_motion_emb = gt_motion_emb[np.newaxis, :]
            self.saved_motion_gt_latents.append(gt_motion_emb)
        scores = get_scores_unit(motion_emb, text_emb)
        return {"TMR/t2m_sim": torch.tensor(scores, dtype=torch.float32)}

    def aggregate(self):
        output = {}
        for key, lst in self.saved_metrics.items():
            output[key] = np.concatenate(lst)
        if not self.saved_text_latents:
            return output
        text_latents = np.concatenate(self.saved_text_latents)
        motion_gen_latents = np.concatenate(self.saved_motion_gen_latents)
        batch_size = len(text_latents)
        assert text_latents.shape == motion_gen_latents.shape
        scores_t2m = get_score_matrix_unit(text_latents, motion_gen_latents)
        scores_t2t = get_score_matrix_unit(text_latents, text_latents)
        t2m_metrics = contrastive_metrics(
            scores=scores_t2m,
            scores_t2t=scores_t2t,
            threshold=0.99,
            rounding=self.ranks_rounding,
        )
        for key, val in t2m_metrics.items():
            output["TMR/t2m_R/" + key] = val
        mu_gen, cov_gen = calculate_activation_statistics(motion_gen_latents)
        mu_text, cov_text = calculate_activation_statistics(text_latents)
        output["TMR/FID/gen_text"] = calculate_frechet_distance(mu_gen, cov_gen, mu_text, cov_text)
        if self.saved_motion_gt_latents:
            motion_gt_latents = np.concatenate(self.saved_motion_gt_latents)
            assert motion_gt_latents.shape == motion_gen_latents.shape
            scores_m2gm = get_score_matrix_unit(motion_gen_latents, motion_gt_latents)
            scores_t2gm = get_score_matrix_unit(text_latents, motion_gt_latents)
            m2gm_metrics = contrastive_metrics(
                scores=scores_m2gm,
                scores_t2t=scores_t2t,
                threshold=0.99,
                rounding=self.ranks_rounding,
            )
            for key, val in m2gm_metrics.items():
                output["TMR/m2m_R/" + key] = val
            t2gm_metrics = contrastive_metrics(
                scores=scores_t2gm,
                scores_t2t=scores_t2t,
                threshold=0.99,
                rounding=self.ranks_rounding,
            )
            for key, val in t2gm_metrics.items():
                output["TMR/t2m_gt_R/" + key] = val
            mu_gt_motion, cov_gt_motion = calculate_activation_statistics(motion_gt_latents)
            output["TMR/FID/gen_gt"] = calculate_frechet_distance(mu_gen, cov_gen, mu_gt_motion, cov_gt_motion)
            output["TMR/FID/gt_text"] = calculate_frechet_distance(mu_gt_motion, cov_gt_motion, mu_text, cov_text)
        for key, val in output.items():
            if isinstance(val, (int, float, np.integer, np.floating)):
                val = torch.tensor([val for _ in range(batch_size)])
            if isinstance(val, np.ndarray):
                val = torch.from_numpy(val)
            output[key] = val.cpu().float()
        return output

def compute_tmr_retrieval_metrics(
    motion_emb: np.ndarray,
    text_emb: np.ndarray,
    gt_motion_emb: Optional[np.ndarray] = None,
    rounding: int = 2,
) -> Dict[str, float]:
    """Compute TMR retrieval metrics from precomputed embeddings."""
    if motion_emb.shape != text_emb.shape:
        raise ValueError(f"Expected same shape for motion/text embeddings, got {motion_emb.shape} vs {text_emb.shape}")

    scores_t2m = get_score_matrix_unit(text_emb, motion_emb)
    scores_t2t = get_score_matrix_unit(text_emb, text_emb)

    output: Dict[str, float] = {}
    t2m_metrics = contrastive_metrics(
        scores=scores_t2m,
        scores_t2t=scores_t2t,
        threshold=0.99,
        rounding=rounding,
    )
    for key, val in t2m_metrics.items():
        output[f"TMR/t2m_R/{key}"] = float(val)

    mu_gen, cov_gen = calculate_activation_statistics(motion_emb)
    mu_text, cov_text = calculate_activation_statistics(text_emb)
    output["TMR/FID/gen_text"] = float(calculate_frechet_distance(mu_gen, cov_gen, mu_text, cov_text))

    if gt_motion_emb is not None:
        if gt_motion_emb.shape != motion_emb.shape:
            raise ValueError(f"Expected gt motion embeddings shape {motion_emb.shape}, got {gt_motion_emb.shape}")

        scores_m2gm = get_score_matrix_unit(motion_emb, gt_motion_emb)
        scores_t2gm = get_score_matrix_unit(text_emb, gt_motion_emb)

        m2gm_metrics = contrastive_metrics(
            scores=scores_m2gm,
            scores_t2t=scores_t2t,
            threshold=0.99,
            rounding=rounding,
        )
        for key, val in m2gm_metrics.items():
            output[f"TMR/m2m_R/{key}"] = float(val)

        t2gm_metrics = contrastive_metrics(
            scores=scores_t2gm,
            scores_t2t=scores_t2t,
            threshold=0.99,
            rounding=rounding,
        )
        for key, val in t2gm_metrics.items():
            output[f"TMR/t2m_gt_R/{key}"] = float(val)

        mu_gt_motion, cov_gt_motion = calculate_activation_statistics(gt_motion_emb)
        output["TMR/FID/gen_gt"] = float(calculate_frechet_distance(mu_gen, cov_gen, mu_gt_motion, cov_gt_motion))
        output["TMR/FID/gt_text"] = float(calculate_frechet_distance(mu_gt_motion, cov_gt_motion, mu_text, cov_text))

    return output

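A minimal end-to-end sketch of this embedding-based entry point, using random unit-normalized embeddings (purely illustrative; real inputs come from a TMR encoder):

import numpy as np

rng = np.random.default_rng(0)
n, d = 64, 8
motion = rng.normal(size=(n, d))
motion /= np.linalg.norm(motion, axis=1, keepdims=True)
text = rng.normal(size=(n, d))
text /= np.linalg.norm(text, axis=1, keepdims=True)

metrics = compute_tmr_retrieval_metrics(motion, text)
print(metrics["TMR/t2m_R/R01"])    # R-precision at rank 1, in percent
print(metrics["TMR/FID/gen_text"])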
def all_contrastive_metrics(sims, emb=None, threshold=None, rounding=2, return_cols=False):
    text_selfsim = None
    if emb is not None:
        text_selfsim = emb @ emb.T

    t2m_m, t2m_cols = contrastive_metrics(sims, text_selfsim, threshold, return_cols=True, rounding=rounding)
    m2t_m, m2t_cols = contrastive_metrics(sims.T, text_selfsim, threshold, return_cols=True, rounding=rounding)

    all_m = {}
    for key in t2m_m:
        all_m[f"t2m/{key}"] = t2m_m[key]
        all_m[f"m2t/{key}"] = m2t_m[key]

    all_m["t2m/len"] = float(len(sims))
    all_m["m2t/len"] = float(len(sims[0]))
    if return_cols:
        return all_m, t2m_cols, m2t_cols
    return all_m


def contrastive_metrics(
    scores,
    scores_t2t=None,
    threshold=None,
    rounding=2,
    return_cols=False,
):
    n, m = scores.shape
    assert n == m
    num_queries = n

    dists = -scores
    sorted_dists = np.sort(dists, axis=1)
    # GT is on the diagonal
    gt_dists = np.diag(dists)[:, None]

    if scores_t2t is not None and threshold is not None:
        real_threshold = 2 * threshold - 1
        idx = np.argwhere(scores_t2t > real_threshold)
        partition = np.unique(idx[:, 0], return_index=True)[1]
        # take as GT the minimum distance among near-duplicate texts
        gt_dists = np.minimum.reduceat(dists[tuple(idx.T)], partition)
        gt_dists = gt_dists[:, None]

    rows, cols = np.where((sorted_dists - gt_dists) == 0)  # find the column position of the GT

    # if there are ties
    if rows.size > num_queries:
        assert np.unique(rows).size == num_queries, "issue in metric evaluation"
        avg_cols = break_ties_average(sorted_dists, gt_dists)
        cols = avg_cols

    msg = f"expected ranks to match queries ({cols.size} vs {num_queries})"
    assert cols.size == num_queries, msg

    metrics = {}
    vals = [str(x).zfill(2) for x in [1, 2, 3, 5, 10]]
    for val in vals:
        metrics[f"R{val}"] = 100 * float(np.sum(cols < int(val))) / num_queries

    metrics["MedR"] = float(np.median(cols) + 1)
    metrics["len"] = num_queries

    if rounding is not None:
        for key in metrics:
            metrics[key] = round(metrics[key], rounding)
    if return_cols:
        return metrics, cols
    return metrics

def break_ties_average(sorted_dists, gt_dists):
    # fast implementation, based on this code:
    # https://stackoverflow.com/a/49239335
    locs = np.argwhere((sorted_dists - gt_dists) == 0)

    # Find the split indices
    steps = np.diff(locs[:, 0])
    splits = np.nonzero(steps)[0] + 1
    splits = np.insert(splits, 0, 0)

    # Compute the result columns
    summed_cols = np.add.reduceat(locs[:, 1], splits)
    counts = np.diff(np.append(splits, locs.shape[0]))
    avg_cols = summed_cols / counts
    return avg_cols


def calculate_activation_statistics(activations):
    """
    Params:
    -- activations: num_samples x dim_feat
    Returns:
    -- mu: dim_feat
    -- sigma: dim_feat x dim_feat
    """
    mu = np.mean(activations, axis=0)
    cov = np.cov(activations, rowvar=False)
    return mu, cov


def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
    """Numpy implementation of the Frechet Distance.

    The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
    and X_2 ~ N(mu_2, C_2) is
        d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
    Stable version by Dougal J. Sutherland.
    Params:
    -- mu1   : Numpy array containing the activations of a layer of the
               inception net (like returned by the function 'get_predictions')
               for generated samples.
    -- mu2   : The sample mean over activations, precalculated on a
               representative dataset.
    -- sigma1: The covariance matrix over activations for generated samples.
    -- sigma2: The covariance matrix over activations, precalculated on a
               representative dataset.
    Returns:
    --       : The Frechet Distance.
    """

    mu1 = np.atleast_1d(mu1)
    mu2 = np.atleast_1d(mu2)

    sigma1 = np.atleast_2d(sigma1)
    sigma2 = np.atleast_2d(sigma2)

    assert mu1.shape == mu2.shape, "Training and test mean vectors have different lengths"
    assert sigma1.shape == sigma2.shape, "Training and test covariances have different dimensions"

    diff = mu1 - mu2

    # Product might be almost singular
    covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
    if not np.isfinite(covmean).all():
        msg = "fid calculation produces singular product; adding %s to diagonal of cov estimates" % eps
        print(msg)
        offset = np.eye(sigma1.shape[0]) * eps
        covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))

    # Numerical error might give a slight imaginary component
    if np.iscomplexobj(covmean):
        if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
            # retry with a small offset added to the diagonals
            offset = np.eye(sigma1.shape[0]) * eps
            covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
            if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
                m = np.max(np.abs(covmean.imag))
                raise ValueError("Imaginary component {}".format(m))
        covmean = covmean.real

    tr_covmean = np.trace(covmean)

    return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean
kimodo/model/__init__.py
ADDED
@@ -0,0 +1,31 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Kimodo model package: main model class, text encoders, and loading utilities."""

from .common import resolve_target
from .kimodo_model import Kimodo
from .llm2vec import LLM2VecEncoder
from .load_model import load_model
from .loading import (
    AVAILABLE_MODELS,
    DEFAULT_MODEL,
    DEFAULT_TEXT_ENCODER_URL,
    MODEL_NAMES,
    load_checkpoint_state_dict,
)
from .tmr import TMR
from .twostage_denoiser import TwostageDenoiser

__all__ = [
    "Kimodo",
    "LLM2VecEncoder",
    "TMR",
    "TwostageDenoiser",
    "load_model",
    "load_checkpoint_state_dict",
    "resolve_target",
    "AVAILABLE_MODELS",
    "DEFAULT_MODEL",
    "DEFAULT_TEXT_ENCODER_URL",
    "MODEL_NAMES",
]
|
kimodo/model/backbone.py
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Transformer backbone: padding, masking, and encoder stack for the denoiser."""

import logging
from typing import Optional, Union

import torch
from omegaconf import ListConfig
from pydantic.dataclasses import dataclass
from torch import Tensor, nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer

from kimodo.tools import validate

log = logging.getLogger(__name__)


def pad_x_and_mask_to_fixed_size(x: Tensor, mask: Tensor, size: int):
    """Pad a feature tensor x and its mask so they always have the same sequence length.

    Args:
        x (torch.Tensor): [B, T, D]
        mask (torch.Tensor): [B, T]
        size (int)
    Returns:
        torch.Tensor: [B, size, D]
        torch.Tensor: [B, size]
    """

    batch_size, cur_max_size, dim = x.shape[0], x.shape[1], x.shape[2]

    if cur_max_size == size:
        # already padded to this size, probably in the collate function
        return x, mask

    if cur_max_size > size:
        # This case should have been handled in the collate function;
        # useful as a check at test time
        log.warning("The size of the tensor is larger than the maximum size. Cropping the input..")
        cur_max_size = size

    new_x = torch.zeros(
        (batch_size, size, dim),
        dtype=x.dtype,
        device=x.device,
    )
    # slicing x makes the crop path work too (no-op when cur_max_size == x.shape[1])
    new_x[:, :cur_max_size] = x[:, :cur_max_size]

    # same for the mask
    new_mask = torch.zeros(
        (batch_size, size),
        dtype=mask.dtype,
        device=mask.device,
    )
    new_mask[:, :cur_max_size] = mask[:, :cur_max_size]
    return new_x, new_mask

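A toy invocation of the padding helper (the shapes are made up for illustration):

import torch

x = torch.randn(2, 5, 8)                   # [B=2, T=5, D=8]
mask = torch.ones(2, 5, dtype=torch.bool)  # all 5 frames valid
x_pad, mask_pad = pad_x_and_mask_to_fixed_size(x, mask, size=7)
print(x_pad.shape, mask_pad.shape)  # torch.Size([2, 7, 8]) torch.Size([2, 7])
print(mask_pad[0])                  # True x5, then False x2 for the padding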
@dataclass(frozen=True, config=dict(extra="forbid", arbitrary_types_allowed=True))
class TransformerEncoderBlockConfig:
    """Configuration for the transformer encoder backbone."""

    # input features dimension
    input_dim: int
    # output features dimension
    output_dim: int

    # skeleton object
    skeleton: object

    # dimension of the text embeddings
    llm_shape: Union[list[int], ListConfig]

    # mask the text or not
    use_text_mask: bool

    # latent dimension of the model
    latent_dim: int
    # dimension of the feedforward network in the transformer
    ff_size: int
    # number of layers in the transformer
    num_layers: int
    # number of heads in the transformer
    num_heads: int
    # activation in the transformer
    activation: str
    # dropout rate for the transformer
    dropout: float
    # dropout rate for the positional embeddings
    pe_dropout: float
    # use norm first or not
    norm_first: bool = False
    # artificially extend the number of text tokens
    num_text_tokens_override: Optional[int] = None

    # input the first heading angle
    input_first_heading_angle: bool = False


class TransformerEncoderBlock(nn.Module):
    @validate(TransformerEncoderBlockConfig, save_args=True, super_init=True)
    def __init__(self, conf):
        self.nbjoints = self.skeleton.nbjoints
        llm_dim = self.llm_shape[-1]
        self.embed_text = nn.Linear(llm_dim, self.latent_dim)

        self.sequence_pos_encoder = PositionalEncoding(self.latent_dim, self.pe_dropout)

        # maximum number of text tokens
        self.num_text_tokens = self.llm_shape[0]
        if self.num_text_tokens_override is not None:
            self.num_text_tokens = self.num_text_tokens_override

        self.embed_timestep = TimestepEmbedder(self.latent_dim, self.sequence_pos_encoder)

        self.input_linear = nn.Linear(self.input_dim, self.latent_dim)
        self.output_linear = nn.Linear(self.latent_dim, self.output_dim)
        self.linear_first_heading_angle = nn.Linear(2, self.latent_dim)

        trans_enc_layer = TransformerEncoderLayer(
            d_model=self.latent_dim,
            nhead=self.num_heads,
            dim_feedforward=self.ff_size,
            dropout=self.dropout,
            activation=self.activation,
            batch_first=True,
            norm_first=self.norm_first,
        )
        self.seqTransEncoder = TransformerEncoder(
            trans_enc_layer,
            num_layers=self.num_layers,
            enable_nested_tensor=False,
        )

    def forward(
        self,
        x: Tensor,
        x_pad_mask: torch.Tensor,
        text_feat: torch.Tensor,
        text_feat_pad_mask: torch.Tensor,
        timesteps: Tensor,
        first_heading_angle: Optional[Tensor] = None,
    ) -> Tensor:
        """
        Args:
            x (torch.Tensor): [B, T, dim_motion] current noisy motion
            x_pad_mask (torch.Tensor): [B, T] attention mask; positions with True may attend, False may not
            text_feat (torch.Tensor): [B, max_text_len, llm_dim] embedded text prompts
            text_feat_pad_mask (torch.Tensor): [B, max_text_len] attention mask; positions with True may attend, False may not
            timesteps (torch.Tensor): [B,] current denoising step
            first_heading_angle (Optional[torch.Tensor]): [B,] first heading angle, required when input_first_heading_angle is set

        Returns:
            torch.Tensor: [B, T, output_dim]
        """
        batch_size = len(x)
        x = self.input_linear(x)  # [B, T, D]

        # Pad the text tokens + mask to always have the same size == self.num_text_tokens
        # (done here if it was not done in the collate function)
        if self.num_text_tokens is not None:
            text_feat, text_feat_pad_mask = pad_x_and_mask_to_fixed_size(
                text_feat,
                text_feat_pad_mask,
                self.num_text_tokens,
            )

        # Encode the text features and the time information
        emb_text = self.embed_text(text_feat)  # [B, max_text_len, D]
        emb_time = self.embed_timestep(timesteps)  # [B, 1, D]

        # Create a mask for the time information
        time_mask = torch.ones((batch_size, 1), dtype=torch.bool, device=x.device)

        # Create the prefix features (text, time, etc.): [B, max_text_len + 1 + etc.]
        prefix_feats = torch.cat((emb_text, emb_time), dim=1)

        # Behavior from old code: when not using the text mask -> True for all the tokens
        if not self.use_text_mask:
            text_feat_pad_mask = torch.ones(
                (batch_size, emb_text.shape[1]),
                dtype=torch.bool,
                device=x.device,
            )

        prefix_mask = torch.cat((text_feat_pad_mask, time_mask), dim=1)

        # add the input first heading angle
        if self.input_first_heading_angle:
            assert first_heading_angle is not None, "The first heading angle is mandatory for this model"
            # cos(angle) / sin(angle)
            first_heading_angle_feats = torch.stack(
                [
                    torch.cos(first_heading_angle),
                    torch.sin(first_heading_angle),
                ],
                dim=-1,
            )

            first_heading_angle_feats = self.linear_first_heading_angle(first_heading_angle_feats)
            first_heading_angle_feats = first_heading_angle_feats[:, None]  # for cat
            first_heading_angle_mask = torch.ones(
                (batch_size, 1),
                dtype=torch.bool,
                device=x.device,
            )
            prefix_feats = torch.cat((prefix_feats, first_heading_angle_feats), dim=1)
            prefix_mask = torch.cat((prefix_mask, first_heading_angle_mask), dim=1)

        # compute the number of prefix features
        pose_start_ind = prefix_feats.shape[1]

        # Concatenate prefix and x: [B, len(prefix) + T, D]
        xseq = torch.cat((prefix_feats, x), dim=1)

        # Concatenate the masks and negate them: [B, len(prefix) + T]
        src_key_padding_mask = ~torch.cat((prefix_mask, x_pad_mask), dim=1)

        # Add positional encoding
        xseq = self.sequence_pos_encoder(xseq)

        # Input to the transformer and keep only the motion indexes
        if isinstance(self.seqTransEncoder, nn.TransformerEncoder):
            assert not self.seqTransEncoder.use_nested_tensor, "Flash attention should be disabled due to a bug!"

        output = self.seqTransEncoder(
            xseq,
            src_key_padding_mask=src_key_padding_mask,
        )
        output = output[:, pose_start_ind:]  # [B, T, D]
        output = self.output_linear(output)  # [B, T, OD]
        return output


class PositionalEncoding(nn.Module):
    """Non-learned sinusoidal positional encoding."""

    def __init__(
        self,
        d_model: int,
        dropout: Optional[float] = 0.1,
        max_len: Optional[int] = 5000,
    ):
        """
        Args:
            d_model (int): input dim
            dropout (Optional[float] = 0.1): dropout probability on the output
            max_len (Optional[int] = 5000): maximum sequence length
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # Note: torch.exp() and math.log() are replaced with torch.pow(),
        # because MKL exp() and ln() throw floating point exceptions on certain CPUs
        # (see the corresponding commit and MR)
        div_term = torch.pow(10000.0, -torch.arange(0, d_model, 2).float() / d_model)
        # equivalent to:
        # div_term = torch.exp(
        #     torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model)
        # )

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # [1, T, D]

        self.register_buffer("pe", pe, persistent=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply positional encoding to the input sequence.

        Args:
            x (torch.Tensor): [B, T, D] input motion sequence

        Returns:
            torch.Tensor: [B, T, D] input motion with the PE added to it (and optionally dropout)
        """
        x = x + self.pe[:, : x.shape[1], :]
        return self.dropout(x)


class TimestepEmbedder(nn.Module):
    """Encoder for the diffusion step."""

    def __init__(self, latent_dim: int, sequence_pos_encoder: PositionalEncoding):
        """
        Args:
            latent_dim (int): dim to encode to
            sequence_pos_encoder (PositionalEncoding): the PE to use on timesteps
        """
        super().__init__()
        self.latent_dim = latent_dim
        self.sequence_pos_encoder = sequence_pos_encoder

        time_embed_dim = self.latent_dim
        self.time_embed = nn.Sequential(
            nn.Linear(self.latent_dim, time_embed_dim),
            nn.SiLU(),
            nn.Linear(time_embed_dim, time_embed_dim),
        )

    def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
        """Embed timesteps by looking up their PE, then passing it through linear layers.

        Args:
            timesteps (torch.Tensor): [B]

        Returns:
            torch.Tensor: [B, 1, D]
        """
        # pe is [1, max_len, D]; transposing to [max_len, 1, D] makes indexing by timesteps give [B, 1, D]
        return self.time_embed(self.sequence_pos_encoder.pe.transpose(0, 1)[timesteps])
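The pow-based `div_term` above can be checked against the usual exp/log formulation it replaces (a quick sketch, not part of the commit):

import math

import torch

d_model = 16
a = torch.pow(10000.0, -torch.arange(0, d_model, 2).float() / d_model)
b = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
print(torch.allclose(a, b))  # True: pow(10000, -k/d) == exp(-k * ln(10000) / d)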
kimodo/model/cfg.py
ADDED
@@ -0,0 +1,133 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Classifier-free guidance wrapper for the denoiser at sampling time."""

from typing import Dict, Optional, Tuple, Union

import torch
import torch.nn as nn

CFG_TYPES = ["nocfg", "regular", "separated"]


class ClassifierFreeGuidedModel(nn.Module):
    """Wrapper around the denoiser to use classifier-free guidance at sampling time."""

    def __init__(self, model: nn.Module, cfg_type: Optional[str] = "separated"):
        """Wrap the denoiser for classifier-free guidance; cfg_type must be one of CFG_TYPES
        (e.g. 'regular', 'nocfg')."""
        super().__init__()
        self.model = model
        assert cfg_type in CFG_TYPES, f"Invalid cfg_type: {cfg_type}"
        self.cfg_type_default = cfg_type

    def forward(
        self,
        cfg_weight: Union[float, Tuple[float, float]],
        x: torch.Tensor,
        x_pad_mask: torch.Tensor,
        text_feat: torch.Tensor,
        text_feat_pad_mask: torch.Tensor,
        timesteps: torch.Tensor,
        first_heading_angle: Optional[torch.Tensor] = None,
        motion_mask: Optional[torch.Tensor] = None,
        observed_motion: Optional[torch.Tensor] = None,
        cfg_type: Optional[str] = None,
    ) -> torch.Tensor:
        """
        Args:
            cfg_weight: guidance weight, a single float, or a (text, constraint) tuple of floats when using separated cfg
            x (torch.Tensor): [B, T, dim_motion] current noisy motion
            x_pad_mask (torch.Tensor): [B, T] attention mask; positions with True may attend, False may not
            text_feat (torch.Tensor): [B, max_text_len, llm_dim] embedded text prompts
            text_feat_pad_mask (torch.Tensor): [B, max_text_len] attention mask; positions with True may attend, False may not
            timesteps (torch.Tensor): [B,] current denoising step
            motion_mask (Optional[torch.Tensor]): mask of observed (constrained) motion entries
            observed_motion (Optional[torch.Tensor]): observed motion used for constraint conditioning
            cfg_type (Optional[str]): one of CFG_TYPES; defaults to the type given at construction

        Returns:
            torch.Tensor: same size as the input x
        """

        if cfg_type is None:
            cfg_type = self.cfg_type_default

        assert cfg_type in CFG_TYPES, f"Invalid cfg_type: {cfg_type}"

        # the conditional and unconditional passes are batched together
        if cfg_type == "nocfg":
            return self.model(
                x,
                x_pad_mask,
                text_feat,
                text_feat_pad_mask,
                timesteps,
                first_heading_angle=first_heading_angle,
                motion_mask=motion_mask,
                observed_motion=observed_motion,
            )
        elif cfg_type == "regular":
            assert isinstance(cfg_weight, (float, int)), "cfg_weight must be a single float for regular CFG"
            # out_uncond + w * (out_text_and_constraint - out_uncond)
            text_feat = torch.concatenate([text_feat, 0 * text_feat], dim=0)
            if motion_mask is not None:
                motion_mask = torch.concatenate([motion_mask, 0 * motion_mask], dim=0)
            if observed_motion is not None:
                observed_motion = torch.concatenate([observed_motion, observed_motion], dim=0)
            if first_heading_angle is not None:
                first_heading_angle = torch.concatenate([first_heading_angle, first_heading_angle], dim=0)

            out_cond_uncond = self.model(
                torch.concatenate([x, x], dim=0),
                torch.concatenate([x_pad_mask, x_pad_mask], dim=0),
                text_feat,
                torch.concatenate([text_feat_pad_mask, False * text_feat_pad_mask], dim=0),
                torch.concatenate([timesteps, timesteps], dim=0),
                first_heading_angle=first_heading_angle,
                motion_mask=motion_mask,
                observed_motion=observed_motion,
            )

            out, out_uncond = torch.chunk(out_cond_uncond, 2)
            out_new = out_uncond + (cfg_weight * (out - out_uncond))
        elif cfg_type == "separated":
            assert len(cfg_weight) == 2, "cfg_weight must be a tuple of two floats for separated CFG"
            # out_uncond + w_text * (out_text - out_uncond) + w_constraint * (out_constraint - out_uncond)
            text_feat = torch.concatenate([text_feat, 0 * text_feat, 0 * text_feat], dim=0)
            if motion_mask is not None:
                motion_mask = torch.concatenate([0 * motion_mask, motion_mask, 0 * motion_mask], dim=0)
            if observed_motion is not None:
                observed_motion = torch.concatenate([observed_motion, observed_motion, observed_motion], dim=0)
            if first_heading_angle is not None:
                first_heading_angle = torch.concatenate(
                    [first_heading_angle, first_heading_angle, first_heading_angle],
                    dim=0,
                )

            out_cond_uncond = self.model(
                torch.concatenate([x, x, x], dim=0),
                torch.concatenate([x_pad_mask, x_pad_mask, x_pad_mask], dim=0),
                text_feat,
                torch.concatenate(
                    [
                        text_feat_pad_mask,
                        False * text_feat_pad_mask,
                        False * text_feat_pad_mask,
                    ],
                    dim=0,
                ),
                torch.concatenate([timesteps, timesteps, timesteps], dim=0),
                first_heading_angle=first_heading_angle,
                motion_mask=motion_mask,
                observed_motion=observed_motion,
            )

            out_text, out_constraint, out_uncond = torch.chunk(out_cond_uncond, 3)
            out_new = (
                out_uncond + (cfg_weight[0] * (out_text - out_uncond)) + (cfg_weight[1] * (out_constraint - out_uncond))
            )
        else:
            raise ValueError(f"Invalid cfg_type: {cfg_type}")

        return out_new
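The guidance arithmetic itself is just a weighted extrapolation away from the unconditional prediction; a minimal numeric sketch (tensors and weights are made up):

import torch

out_uncond = torch.zeros(1, 4)
out_text = torch.ones(1, 4)
out_constraint = 2 * torch.ones(1, 4)
w_text, w_constraint = 2.5, 1.0

out = out_uncond + w_text * (out_text - out_uncond) + w_constraint * (out_constraint - out_uncond)
print(out)  # tensor([[4.5000, 4.5000, 4.5000, 4.5000]])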
kimodo/model/common.py
ADDED
@@ -0,0 +1,48 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Config hydration: env vars, _target_ resolution, and recursive instantiation."""

import importlib
import os


def get_env_var(name: str, default=None):
    """Read an env var by name, falling back to the lowercased name; return default if neither is set."""
    return os.getenv(name, os.getenv(name.lower(), default))


def resolve_target(target: str):
    """Import a module and return the attribute named by a dotted path (e.g. 'pkg.mod.Class')."""
    module_name, attr_name = target.rsplit(".", 1)
    module = importlib.import_module(module_name)
    return getattr(module, attr_name)


def materialize_value(value):
    """Recursively turn dicts with a '_target_' key into instances; lists and dicts are
    traversed; other leaves are returned unchanged."""
    if isinstance(value, dict):
        if "_target_" in value:
            return instantiate_from_dict(value)
        return {k: materialize_value(v) for k, v in value.items()}
    if isinstance(value, list):
        return [materialize_value(v) for v in value]
    return value


def instantiate_from_dict(node, overrides=None):
    """Build an instance from a config dict: '_target_' names the class, the other keys become
    kwargs; overrides are merged in."""
    if not isinstance(node, dict) or "_target_" not in node:
        raise ValueError("Config node must be a dict with a '_target_' key.")

    target = resolve_target(node["_target_"])
    kwargs = {}
    for key, value in node.items():
        if key == "_target_":
            continue
        kwargs[key] = materialize_value(value)

    if overrides:
        kwargs.update({k: v for k, v in overrides.items() if v is not None})

    return target(**kwargs)
kimodo/model/diffusion.py
ADDED
@@ -0,0 +1,133 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Diffusion process and DDIM sampling for motion generation."""

import math
from typing import Optional, Tuple

import torch
from torch import nn


def get_beta_schedule(
    num_diffusion_timesteps: int,
    max_beta: Optional[float] = 0.999,
) -> torch.Tensor:
    """Get the cosine beta schedule."""

    def alpha_bar(t):
        return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
    return torch.tensor(betas, dtype=torch.float)

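With the cosine schedule, the betas start tiny and grow toward the `max_beta` cap; a quick look at both ends (illustrative):

import torch

betas = get_beta_schedule(1000)
print(betas[0].item())   # tiny (~4e-5): almost no noise at the first step
print(betas[-1].item())  # 0.999: capped by max_beta at the last step
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
print(alphas_cumprod[-1].item())  # ~0: x_T is close to pure noise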
class Diffusion(torch.nn.Module):
    """Cosine-schedule diffusion process: betas, alphas, and DDIM step mapping."""

    def __init__(self, num_base_steps: int):
        """Set up the cosine beta schedule and precompute the diffusion variables for num_base_steps."""
        super().__init__()
        self.num_base_steps = num_base_steps
        betas_base = get_beta_schedule(self.num_base_steps)
        self.register_buffer("betas_base", betas_base, persistent=False)
        alphas_cumprod_base = torch.cumprod(1.0 - self.betas_base, dim=0)
        self.register_buffer("alphas_cumprod_base", alphas_cumprod_base, persistent=False)
        use_timesteps, _ = self.space_timesteps(self.num_base_steps)
        self.calc_diffusion_vars(use_timesteps)

    def extra_repr(self) -> str:
        return f"num_base_steps={self.num_base_steps}"

    @property
    def device(self):
        return self.betas_base.device

    def space_timesteps(self, num_denoising_steps: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return (use_timesteps, map_tensor) for a subsampled denoising schedule of
        num_denoising_steps."""
        nsteps_train = self.num_base_steps
        frac_stride = (nsteps_train - 1) / max(1, num_denoising_steps - 1)
        # evenly spread num_denoising_steps indices over the base schedule
        use_timesteps = torch.round(torch.arange(num_denoising_steps, device=self.device) * frac_stride).to(torch.long)
        use_timesteps = torch.clamp(use_timesteps, max=nsteps_train - 1)
        map_tensor = torch.arange(nsteps_train, device=self.device, dtype=torch.long)[use_timesteps]
        return use_timesteps, map_tensor

    def calc_diffusion_vars(self, use_timesteps: torch.Tensor) -> None:
        """Update the buffers (betas, alphas, alphas_cumprod, etc.) for the given subsampled
        timesteps."""
        alphas_cumprod = self.alphas_cumprod_base[use_timesteps]
        last_alpha_cumprod = torch.cat([torch.tensor([1.0]).to(alphas_cumprod), alphas_cumprod[:-1]])
        betas = 1.0 - alphas_cumprod / last_alpha_cumprod
        self.register_buffer("betas", betas, persistent=False)

        alphas = 1.0 - self.betas
        self.register_buffer("alphas", alphas, persistent=False)
        alphas_cumprod = torch.cumprod(self.alphas, dim=0)
        alphas_cumprod = torch.clamp(alphas_cumprod, min=1e-9)
        self.register_buffer("alphas_cumprod", alphas_cumprod, persistent=False)

        alphas_cumprod_prev = torch.cat([torch.tensor([1.0]).to(self.alphas_cumprod), self.alphas_cumprod[:-1]])
        self.register_buffer("alphas_cumprod_prev", alphas_cumprod_prev, persistent=False)

        # rsqrt(x) = sqrt(1/x), so this is sqrt(1 / alphas_cumprod)
        sqrt_recip_alphas_cumprod = torch.rsqrt(self.alphas_cumprod)
        self.register_buffer("sqrt_recip_alphas_cumprod", sqrt_recip_alphas_cumprod, persistent=False)

        # sqrt(1 / alphas_cumprod - 1), written via rsqrt
        sqrt_recipm1_alphas_cumprod = torch.rsqrt(self.alphas_cumprod / (1.0 - self.alphas_cumprod))
        self.register_buffer("sqrt_recipm1_alphas_cumprod", sqrt_recipm1_alphas_cumprod, persistent=False)

        posterior_variance = self.betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        self.register_buffer("posterior_variance", posterior_variance, persistent=False)

        # rsqrt(1/x) = sqrt(x), so this is sqrt(alphas_cumprod)
        sqrt_alphas_cumprod = torch.rsqrt(1.0 / self.alphas_cumprod)
        self.register_buffer("sqrt_alphas_cumprod", sqrt_alphas_cumprod, persistent=False)

        # likewise sqrt(1 - alphas_cumprod)
        sqrt_one_minus_alphas_cumprod = torch.rsqrt(1.0 / (1.0 - self.alphas_cumprod))
        self.register_buffer(
            "sqrt_one_minus_alphas_cumprod",
            sqrt_one_minus_alphas_cumprod,
            persistent=False,
        )

    def q_sample(
        self,
        x_start: torch.Tensor,
        t: torch.Tensor,
        noise: torch.Tensor = None,
    ):
        if noise is None:
            noise = torch.randn_like(x_start)
        assert noise.shape == x_start.shape

        xt = (
            self.sqrt_alphas_cumprod[t, None, None] * x_start
            + self.sqrt_one_minus_alphas_cumprod[t, None, None] * noise
        )
        return xt


class DDIMSampler(nn.Module):
    """Deterministic DDIM sampler (eta = 0)."""

    def __init__(self, diffusion: Diffusion):
        super().__init__()
        self.diffusion = diffusion

    def __call__(
        self,
        use_timesteps: torch.Tensor,
        x_t: torch.Tensor,
        pred_xstart: torch.Tensor,
        t: torch.Tensor,
    ) -> torch.Tensor:
        self.diffusion.calc_diffusion_vars(use_timesteps)
        # recover the predicted noise from x_t and the predicted x_0
        eps = (
            self.diffusion.sqrt_recip_alphas_cumprod[t, None, None] * x_t - pred_xstart
        ) / self.diffusion.sqrt_recipm1_alphas_cumprod[t, None, None]
        # deterministic DDIM step: x_{t-1} = sqrt(abar_{t-1}) * x0_hat + sqrt(1 - abar_{t-1}) * eps_hat
        alpha_bar_prev = self.diffusion.alphas_cumprod_prev[t, None, None]
        x = pred_xstart * torch.sqrt(alpha_bar_prev) + torch.sqrt(1 - alpha_bar_prev) * eps
        return x
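A rough sketch of how these pieces compose at sampling time (a minimal sketch, assuming an identity "denoiser" that returns its input as the predicted x_0; real usage goes through the Kimodo model below):

import torch

diffusion = Diffusion(num_base_steps=1000)
sampler = DDIMSampler(diffusion)

# subsample a 50-step schedule from the 1000-step training schedule
use_timesteps, _ = diffusion.space_timesteps(50)

x = torch.randn(1, 60, 128)  # [B, T, motion_dim], pure noise
for i in reversed(range(50)):
    t = torch.tensor([i])
    pred_xstart = x  # stand-in for denoiser(x, t, ...)
    x = sampler(use_timesteps, x, pred_xstart, t)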
kimodo/model/kimodo_model.py
ADDED
@@ -0,0 +1,605 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Kimodo model: denoiser, text encoder, diffusion sampling, and post-processing."""

import logging
from typing import Dict, List, Optional, Tuple, Union

import torch
from torch import nn
from tqdm.auto import tqdm

from kimodo.constraints import FullBodyConstraintSet
from kimodo.motion_rep.feature_utils import compute_heading_angle, length_to_mask
from kimodo.postprocess import post_process_motion
from kimodo.sanitize import sanitize_texts
from kimodo.skeleton import SOMASkeleton30
from kimodo.tools import to_numpy

from .cfg import ClassifierFreeGuidedModel
from .diffusion import DDIMSampler, Diffusion

log = logging.getLogger(__name__)


class Kimodo(nn.Module):
    """Helper class for test time."""

    def __init__(
        self,
        denoiser: nn.Module,
        text_encoder: nn.Module,
        num_base_steps: int,
        device: Optional[Union[str, torch.device]] = None,
        cfg_type: Optional[str] = "separated",
    ):
        super().__init__()

        self.denoiser = denoiser.eval()

        if cfg_type is None:
            cfg_type = "nocfg"

        # Add classifier-free guidance to the model if needed
        self.denoiser = ClassifierFreeGuidedModel(self.denoiser, cfg_type=cfg_type)

        self.motion_rep = denoiser.motion_rep
        self.skeleton = self.motion_rep.skeleton

        self.fps = denoiser.motion_rep.fps

        self.diffusion = Diffusion(num_base_steps=num_base_steps)
        self.sampler = DDIMSampler(self.diffusion)
        self.text_encoder = text_encoder

        self.device = device

        self.to(device)

    @property
    def output_skeleton(self):
        """Skeleton used for model output (somaskel77 for SOMA, else unchanged)."""
        if isinstance(self.skeleton, SOMASkeleton30):
            return self.skeleton.somaskel77
        return self.skeleton

    def train(self, mode: bool):
        self.denoiser.train(mode)
        return self

    def eval(self):
        self.denoiser.eval()
        return self

    def denoising_step(
        self,
        motion: torch.Tensor,
        pad_mask: torch.Tensor,
        text_feat: torch.Tensor,
        text_pad_mask: torch.Tensor,
        t: torch.Tensor,
        first_heading_angle: Optional[torch.Tensor],
        motion_mask: torch.Tensor,
        observed_motion: torch.Tensor,
        num_denoising_steps: torch.Tensor,
        cfg_weight: Union[float, Tuple[float, float]],
        guide_masks: Optional[Dict] = None,
        cfg_type: Optional[str] = None,
    ) -> torch.Tensor:
        """Single denoising step.

        Returns:
            torch.Tensor: [B, T, D] noisy motion input to t-1
        """
        # Subsample timesteps.
        # NOTE: do this at every step because of ONNX export: num_denoising_steps may
        # change dynamically when running the ONNX version, so we account for that here.
        num_denoising_steps = num_denoising_steps[0]
        use_timesteps, map_tensor = self.diffusion.space_timesteps(num_denoising_steps)
        self.diffusion.calc_diffusion_vars(use_timesteps)

        # first compute the initial clean prediction from the denoiser
        t_map = map_tensor[t]

        with torch.inference_mode():
            pred_clean = self.denoiser(
                cfg_weight,
                motion,
                pad_mask,
                text_feat,
                text_pad_mask,
                t_map,
                first_heading_angle,
                motion_mask,
                observed_motion,
                cfg_type=cfg_type,
            )

        # the sampler computes the next-step noisy motion
        x_tm1 = self.sampler(use_timesteps, motion, pred_clean, t)
        return x_tm1
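The timestep bookkeeping above deserves a note: the loop index `t` lives on the spaced sampling schedule, and `map_tensor[t]` converts it to the base-schedule index the denoiser was trained on. A self-contained sketch of the idea, assuming an even spacing (the actual spacing is whatever `space_timesteps` implements):

import torch

# Illustration only: 5 sampling steps spread over 1000 base steps.
num_base_steps, num_sampling_steps = 1000, 5
map_tensor = torch.linspace(0, num_base_steps - 1, num_sampling_steps).long()
# e.g. tensor([  0, 249, 499, 749, 999])

t = torch.tensor([4, 4])  # spaced index for a batch of 2
t_map = map_tensor[t]     # -> tensor([999, 999]): what the denoiser actually sees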
    def _multiprompt(
        self,
        prompts: list[str],
        num_frames: int | list[int],
        num_denoising_steps: int,
        constraint_lst: Optional[list] = None,
        cfg_weight: Union[float, Tuple[float, float]] = (2.0, 2.0),
        num_samples: Optional[int] = None,
        cfg_type: Optional[str] = None,
        return_numpy: bool = False,
        first_heading_angle: Optional[torch.Tensor] = None,
        # for transitioning
        num_transition_frames: int = 5,
        share_transition: bool = True,
        percentage_transition_override: float = 0.10,
        # for postprocess
        post_processing: bool = False,
        root_margin: float = 0.04,
        # progress bar
        progress_bar=tqdm,
    ) -> dict:
        device = self.device

        tosqueeze = False
        if num_samples is None:
            num_samples = 1
            tosqueeze = True

        bs = num_samples
        texts = sanitize_texts(prompts)

        if isinstance(num_frames, int):
            # same duration for all the segments
            num_frames = [num_frames for _ in range(len(texts))]

        if constraint_lst is None:
            constraint_lst = []

        # Generate one chunk at a time
        current_frame = 0
        generated_motions = []

        for idx, (text, num_frame) in enumerate(zip(texts, num_frames)):
            texts_bs = [text for _ in range(num_samples)]

            lengths = torch.tensor(
                [num_frame for _ in range(num_samples)],
                device=device,
            )

            is_first_motion = not generated_motions

            observed_motion, motion_mask = None, None

            # filter the constraint_lst to keep only the relevant ones
            constraint_lst_base = [
                constraint.crop_move(current_frame, current_frame + num_frame) for constraint in constraint_lst
            ]  # this moves constraints temporally but not spatially

            observed_motion, motion_mask = self.motion_rep.create_conditions_from_constraints_batched(
                constraint_lst_base,
                lengths,
                to_normalize=False,  # don't normalize yet, it needs to be moved around
                device=device,
            )

            if not is_first_motion:
                prev_num_frame = num_frames[idx - 1]
                if share_transition:
                    # Start the transition earlier, to "share" the transition between A and B.
                    # In any case, we still use "num_transition_frames" for conditioning,
                    # and we don't condition until the end of A.
                    # The number of transition frames is computed as a percentage of the last motion.
                    nb_transition_frames = num_transition_frames + int(prev_num_frame * percentage_transition_override)
                else:
                    nb_transition_frames = num_transition_frames

                latest_motions = generated_motions.pop()
                # remove the transition part of A (it will be put back afterwards)
                generated_motions.append(latest_motions[:, :-nb_transition_frames])
                latest_frames = latest_motions[:, -nb_transition_frames:]

                last_output = self.motion_rep.inverse(
                    latest_frames,
                    is_normalized=False,
                    return_numpy=False,
                )
                smooth_root_2d = last_output["smooth_root_pos"][..., [0, 2]]

                # add constraints at the beginning to allow natural transitions
                constraint_lst_transition = []
                for batch_id in range(bs):
                    new_constraint = FullBodyConstraintSet(
                        self.skeleton,
                        torch.arange(num_transition_frames),
                        last_output["posed_joints"][batch_id, :num_transition_frames],
                        last_output["local_rot_mats"][batch_id, :num_transition_frames],
                        smooth_root_2d[batch_id, :num_transition_frames],
                    )

                    # new lists
                    constraint_lst_transition.append([new_constraint])

                transition_lengths = torch.tensor(
                    [nb_transition_frames for _ in range(num_samples)],
                    device=device,
                )

                observed_motion_transition, motion_mask_transition = (
                    self.motion_rep.create_conditions_from_constraints_batched(
                        constraint_lst_transition,
                        transition_lengths,
                        to_normalize=False,  # don't normalize yet
                        device=device,
                    )
                )

                # concatenate the observed motion / motion mask
                observed_motion = torch.cat([observed_motion_transition, observed_motion], axis=1)
                motion_mask = torch.cat([motion_mask_transition, motion_mask], axis=1)

                # move each observed motion in the batch to its new starting point
                last_smooth_root_2d = smooth_root_2d[:, 0]
                observed_motion = self.motion_rep.translate_2d(
                    observed_motion, -last_smooth_root_2d
                )  # equivalent to: self.motion_rep.translate_2d_to_zero(observed_motion)

                # remove dummy values after moving
                observed_motion = observed_motion * motion_mask

                lengths = lengths + transition_lengths
                first_heading_angle = compute_heading_angle(last_output["posed_joints"], self.skeleton)[:, 0]
            else:
                if first_heading_angle is None:
                    # Start at 0 angle (this will change afterwards)
                    first_heading_angle = torch.tensor([0.0] * bs, device=device)
                else:
                    first_heading_angle = torch.as_tensor(first_heading_angle, device=device)
                    if first_heading_angle.numel() == 1:
                        first_heading_angle = first_heading_angle.repeat(bs)

            observed_motion = self.motion_rep.normalize(observed_motion)

            max_frames = max(lengths)
            motion_pad_mask = length_to_mask(lengths)

            motion = self._generate(
                texts_bs,
                max_frames,
                num_denoising_steps=num_denoising_steps,
                pad_mask=motion_pad_mask,
                first_heading_angle=first_heading_angle,
                motion_mask=motion_mask,
                observed_motion=observed_motion,
                cfg_weight=cfg_weight,
                cfg_type=cfg_type,
                progress_bar=progress_bar,
            )

            motion = self.motion_rep.unnormalize(motion)

            if not is_first_motion:
                motion_with_transition = self.motion_rep.translate_2d(
                    motion,
                    last_smooth_root_2d,
                )

                motion = motion_with_transition[:, num_transition_frames:]
                transition_frames = motion_with_transition[:, :num_transition_frames]
                # for share_transition=True, the new motion contains the very end of A

                # Linearly blend the previously generated transition frames with the newly
                # generated ones, so that we go linearly from the previous generation to the new one.
                alpha = torch.linspace(1, 0, num_transition_frames, device=device)[:, None]
                new_transition_frames = (
                    latest_frames[:, :num_transition_frames] * alpha + (1 - alpha) * transition_frames
                )

                # Add the new transition frames for A (merging B's prediction into the history).
                # For share_transition == True, this does not add back a small part of the end of A:
                # that last part of A has been re-generated by B.
                generated_motions.append(new_transition_frames)

            generated_motions.append(motion)
            current_frame += num_frame

        generated_motions = torch.cat(generated_motions, axis=1)  # temporal axis (b, t, d)

        if tosqueeze:
            generated_motions = generated_motions[0]

        output = self.motion_rep.inverse(
            generated_motions,
            is_normalized=False,
            return_numpy=False,
        )

        # Apply post-processing if requested
        if post_processing:
            corrected = post_process_motion(
                output["local_rot_mats"],
                output["root_positions"],
                output["foot_contacts"],
                self.skeleton,
                constraint_lst,
                root_margin=root_margin,
            )
            output.update(corrected)

        # Convert SOMA output to somaskel77 for the external API
        if isinstance(self.skeleton, SOMASkeleton30):
            output = self.skeleton.output_to_SOMASkeleton77(output)

        # Convert to numpy if requested
        if return_numpy:
            output = to_numpy(output)
        return output
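For intuition, the transition blend above is a per-frame linear cross-fade from the end of segment A to the re-generated overlap from segment B. A self-contained sketch with stand-in tensors (not real model outputs):

import torch

num_transition_frames = 5
alpha = torch.linspace(1, 0, num_transition_frames)[:, None]  # [5, 1], weights 1 -> 0

prev_end = torch.ones(1, num_transition_frames, 3)    # end of segment A (stand-in features)
new_start = torch.zeros(1, num_transition_frames, 3)  # B's re-generation of the same frames

blended = prev_end * alpha + (1 - alpha) * new_start
# blended[:, 0] is fully A, blended[:, -1] is fully B, so the stitched
# motion hands off smoothly into the rest of segment B.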
    def __call__(
        self,
        prompts: str | list[str],
        num_frames: int | list[int],
        num_denoising_steps: int,
        multi_prompt: bool = False,
        constraint_lst: Optional[list] = None,
        cfg_weight: Union[float, Tuple[float, float]] = (2.0, 2.0),
        num_samples: Optional[int] = None,
        cfg_type: Optional[str] = None,
        return_numpy: bool = False,
        first_heading_angle: Optional[torch.Tensor] = None,
        # for transitioning
        num_transition_frames: int = 5,
        share_transition: bool = True,
        percentage_transition_override: float = 0.10,
        # for postprocess
        post_processing: bool = False,
        root_margin: float = 0.04,
        # progress bar
        progress_bar=tqdm,
    ) -> dict:
        """Generate motion from text prompts and optional kinematic constraints.

        When a single prompt/num_frames pair is given, one motion is generated.
        Passing lists of prompts and/or num_frames produces a batch of
        independent motions. With ``multi_prompt=True``, the prompts are
        treated as sequential segments that are generated and stitched together
        with smooth transitions.

        Args:
            prompts: One or more text descriptions of the desired motion.
                A single string generates one sample; a list generates a batch
                (or sequential segments when ``multi_prompt=True``).
            num_frames: Duration of the generated motion in frames. Can be a
                single int applied to every prompt or a per-prompt list.
            num_denoising_steps: Number of DDIM denoising steps. More steps
                generally improve quality at the cost of speed.
            multi_prompt: If ``True``, treat ``prompts`` as an ordered sequence
                of segments and concatenate them with transitions.
            constraint_lst: Per-sample list of kinematic constraints (e.g.
                keyframe poses, end-effector targets, 2-D paths). Pass an
                empty list for unconstrained generation.
            cfg_weight: Classifier-free guidance scale(s). A two-element pair
                ``[text_cfg, constraint_cfg]`` controls text and constraint
                guidance independently.
            num_samples: Number of samples to generate.
            cfg_type: Override the default CFG strategy set at init
                (e.g. ``"separated"``).
            return_numpy: If ``True``, convert all output tensors to numpy
                arrays.
            first_heading_angle: Initial body heading in radians. Shape
                ``(B,)`` or scalar. Defaults to ``0`` (facing +Z).
            num_transition_frames: Number of overlapping frames used to blend
                consecutive segments in multi-prompt mode.
            share_transition: If ``True``, transition frames are shared between
                adjacent segments rather than appended.
            percentage_transition_override: Fraction of each segment's length
                that may be overridden by the transition blend.
            post_processing: If ``True``, apply post-processing
                (foot-skate cleanup and constraint enforcement).
            root_margin: Horizontal margin (in meters) used by the post-processor
                to decide when to correct root motion: when the root deviates from
                the constraint by more than this margin, the post-processor corrects it.
            progress_bar: Callable wrapping an iterable to display progress
                (default: ``tqdm``). Pass a no-op to silence output.

        Returns:
            dict: A dictionary of motion tensors (or numpy arrays if
            ``return_numpy=True``) with the following keys:

            - ``local_rot_mats`` – Local joint rotations as rotation matrices.
            - ``global_rot_mats`` – Global joint rotations as rotation matrices.
            - ``posed_joints`` – Joint positions in world space.
            - ``root_positions`` – Root joint positions.
            - ``smooth_root_pos`` – Smoothed root trajectory.
            - ``foot_contacts`` – Boolean foot-contact labels [left heel, left toe, right heel, right toe].
            - ``global_root_heading`` – Root heading angle over time.
        """
        device = self.device

        if constraint_lst is None:
            constraint_lst = []

        if multi_prompt:
            # multi-prompt generation
            return self._multiprompt(
                prompts,
                num_frames,
                num_denoising_steps,
                constraint_lst,
                cfg_weight,
                num_samples,
                cfg_type,
                return_numpy,
                first_heading_angle,
                num_transition_frames,
                share_transition,
                percentage_transition_override,
                post_processing,
                root_margin,
                progress_bar,
            )

        # Input checking
        tosqueeze = False
        if isinstance(prompts, list) and isinstance(num_frames, list):
            assert len(prompts) == len(num_frames), "The number of prompts should match the number of num_frames."
            num_samples = len(prompts)
        elif isinstance(prompts, list):
            num_samples = len(prompts)
            num_frames = [num_frames for _ in range(num_samples)]
        elif isinstance(num_frames, list):
            num_samples = len(num_frames)
            prompts = [prompts for _ in range(num_samples)]
        else:
            if num_samples is None:
                tosqueeze = True
                num_samples = 1
            prompts = [prompts for _ in range(num_samples)]
            num_frames = [num_frames for _ in range(num_samples)]

        bs = num_samples
        texts = sanitize_texts(prompts)

        lengths = torch.tensor(
            num_frames,
            device=device,
        )
        max_frames = max(lengths)
        motion_pad_mask = length_to_mask(lengths)

        if first_heading_angle is None:
            # Start at 0 angle
            first_heading_angle = torch.tensor([0.0] * bs, device=device)
        else:
            first_heading_angle = torch.as_tensor(first_heading_angle, device=device)
            if first_heading_angle.numel() == 1:
                first_heading_angle = first_heading_angle.repeat(bs)

        observed_motion, motion_mask = None, None
        if constraint_lst:
            observed_motion, motion_mask = self.motion_rep.create_conditions_from_constraints_batched(
                constraint_lst,
                lengths,
                to_normalize=True,
                device=device,
            )

        motion = self._generate(
            texts,
            max_frames,
            num_denoising_steps=num_denoising_steps,
            pad_mask=motion_pad_mask,
            first_heading_angle=first_heading_angle,
            motion_mask=motion_mask,
            observed_motion=observed_motion,
            cfg_weight=cfg_weight,
            cfg_type=cfg_type,
            progress_bar=progress_bar,
        )

        if tosqueeze:
            motion = motion[0]

        output = self.motion_rep.inverse(
            motion,
            is_normalized=True,
            return_numpy=False,  # keep as tensors for potential post-processing
        )

        # Apply post-processing if requested
        if post_processing:
            corrected = post_process_motion(
                output["local_rot_mats"],
                output["root_positions"],
                output["foot_contacts"],
                self.skeleton,
                constraint_lst,
                root_margin=root_margin,
            )
            # keyframe outputs / foot contacts are not changed
            output.update(corrected)

        # Convert SOMA output to somaskel77 for the external API
        if isinstance(self.skeleton, SOMASkeleton30):
            output = self.skeleton.output_to_SOMASkeleton77(output)

        # Convert to numpy if requested
        if return_numpy:
            output = to_numpy(output)
        return output
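A usage sketch for `__call__` (illustrative, not part of the commit; `model` is assumed to be an already-constructed `Kimodo` instance and the prompts/frame counts are arbitrary):

# Single prompt, single sample; outputs converted to numpy.
out = model(
    "a person walks forward and waves",
    num_frames=120,
    num_denoising_steps=50,
    return_numpy=True,
)
joints = out["posed_joints"]  # world-space joint positions

# Two sequential segments stitched with blended transitions.
out_seq = model(
    ["walk in a circle", "sit down on the floor"],
    num_frames=[90, 60],
    num_denoising_steps=50,
    multi_prompt=True,
    num_samples=1,
)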
    def _generate(
        self,
        texts: List[str],
        max_frames: int,
        num_denoising_steps: int,
        pad_mask: torch.Tensor,
        first_heading_angle: Optional[torch.Tensor],
        motion_mask: torch.Tensor,
        observed_motion: torch.Tensor,
        cfg_weight: Union[float, Tuple[float, float]] = 2.0,
        text_feat: Optional[torch.Tensor] = None,
        text_pad_mask: Optional[torch.Tensor] = None,
        guide_masks: Optional[Dict] = None,
        cfg_type: Optional[str] = None,
        progress_bar=tqdm,
    ) -> torch.Tensor:
        """Sample the full denoising loop.

        Args:
            texts (List[str]): batch of text prompts to use for sampling (if text_feat is not passed in)
        """
        device = self.device
        if text_feat is None:
            assert text_pad_mask is None
            log.info("Encoding text...")
            text_feat, text_length = self.text_encoder(texts)
            text_feat = text_feat.to(device)

            # handle empty strings (set their features to zero)
            empty_text_mask = [len(text.strip()) == 0 for text in texts]
            text_feat[empty_text_mask] = 0

            # Create the pad mask for the text
            batch_size, maxlen = text_feat.shape[:2]
            tensor_text_length = torch.tensor(text_length, device=device)
            tensor_text_length[empty_text_mask] = 0
            text_pad_mask = torch.arange(maxlen, device=device).expand(batch_size, maxlen) < tensor_text_length[:, None]

        if motion_mask is not None:
            if motion_mask.dtype == torch.bool:
                motion_mask = 1 * motion_mask

        batch_size = text_feat.shape[0]

        # sample loop
        indices = list(range(num_denoising_steps))[::-1]
        shape = (batch_size, max_frames, self.motion_rep.motion_rep_dim)
        cur_mot = torch.randn(shape, device=self.device)
        num_denoising_steps = torch.tensor(
            [num_denoising_steps], device=self.device
        )  # this and t need to be tensors for ONNX export
        # init diffusion with the correct number of steps before looping
        use_timesteps = self.diffusion.space_timesteps(num_denoising_steps[0])[0]
        self.diffusion.calc_diffusion_vars(use_timesteps)
        for i in progress_bar(indices):
            t = torch.tensor([i] * cur_mot.size(0), device=self.device)
            with torch.inference_mode():
                cur_mot = self.denoising_step(
                    cur_mot,
                    pad_mask,
                    text_feat,
                    text_pad_mask,
                    t,
                    first_heading_angle,
                    motion_mask,
                    observed_motion,
                    num_denoising_steps,
                    cfg_weight,
                    guide_masks=guide_masks,
                    cfg_type=cfg_type,
                )
        return cur_mot
kimodo/model/llm2vec/README.md
ADDED
@@ -0,0 +1 @@
This is a patched version of the original [LLM2Vec](https://github.com/McGill-NLP/llm2vec) codebase so that `McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised` works with `transformers==5.0.0rc3`.
kimodo/model/llm2vec/__init__.py
ADDED
@@ -0,0 +1,11 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""LLM2Vec text encoder and wrapper for Kimodo."""

from .llm2vec import LLM2Vec
from .llm2vec_wrapper import LLM2VecEncoder

__all__ = [
    "LLM2Vec",
    "LLM2VecEncoder",
]
kimodo/model/llm2vec/llm2vec.py
ADDED
@@ -0,0 +1,477 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 McGill NLP
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.


# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import os
from functools import partial
from typing import Dict, List, Optional, Union

import numpy as np
import torch
import torch.multiprocessing as mp
from peft import PeftModel
from torch import Tensor, device, nn
from tqdm.autonotebook import tqdm, trange
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    GemmaConfig,
    LlamaConfig,
    MistralConfig,
    PretrainedConfig,
    Qwen2Config,
)

logger = logging.getLogger(__name__)


def batch_to_device(batch, target_device: device):
    """Send a pytorch batch to a device (CPU/GPU)"""
    for key in batch:
        if isinstance(batch[key], Tensor):
            batch[key] = batch[key].to(target_device)
    return batch


class LLM2Vec(nn.Module):
    def __init__(
        self,
        model: AutoModel,
        tokenizer: AutoTokenizer,
        pooling_mode: str = "mean",
        max_length: int = 512,
        doc_max_length: int = 400,
        skip_instruction: bool = True,
    ):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.pooling_mode = pooling_mode
        self.skip_instruction = skip_instruction
        self.max_length = max_length
        self.doc_max_length = doc_max_length
        self.config = model.config

    @classmethod
    def _get_model_class(cls, config_class_name, enable_bidirectional):
        if not enable_bidirectional:
            return AutoModel
        if config_class_name == "MistralConfig":
            from .models.bidirectional_mistral import MistralBiModel

            return MistralBiModel
        elif config_class_name == "LlamaConfig":
            from .models.bidirectional_llama import LlamaBiModel

            return LlamaBiModel
        elif config_class_name == "GemmaConfig":
            from .models.bidirectional_gemma import GemmaBiModel

            return GemmaBiModel
        elif config_class_name == "Qwen2Config":
            from .models.bidirectional_qwen2 import Qwen2BiModel

            return Qwen2BiModel
        else:
            raise ValueError(f"{config_class_name} is not supported yet with bidirectional models.")

    @classmethod
    def from_pretrained(
        cls,
        base_model_name_or_path,
        peft_model_name_or_path=None,
        merge_peft=False,
        enable_bidirectional=True,
        **kwargs,
    ):
        # pop out encoder args
        keys = ["pooling_mode", "max_length", "doc_max_length", "skip_instruction"]
        encoder_args = {key: kwargs.pop(key, None) for key in keys if kwargs.get(key) is not None}

        tokenizer = AutoTokenizer.from_pretrained(base_model_name_or_path)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"

        config = AutoConfig.from_pretrained(base_model_name_or_path)
        config_class_name = config.__class__.__name__

        model_class = cls._get_model_class(config_class_name, enable_bidirectional=enable_bidirectional)

        model = model_class.from_pretrained(base_model_name_or_path, **kwargs)

        if os.path.isdir(base_model_name_or_path) and os.path.exists(f"{base_model_name_or_path}/config.json"):
            with open(f"{base_model_name_or_path}/config.json", "r") as fIn:
                config_dict = json.load(fIn)
            config = PretrainedConfig.from_dict(config_dict)
            model.config._name_or_path = config._name_or_path

        # For the special case where config.json and adapter weights are in the same directory
        if hasattr(model, "peft_config"):
            model = PeftModel.from_pretrained(
                model,
                base_model_name_or_path,
            )
            model = model.merge_and_unload()

        if peft_model_name_or_path is not None:
            model = PeftModel.from_pretrained(
                model,
                peft_model_name_or_path,
            )
            if merge_peft:
                model = model.merge_and_unload()

        config = {}
        config_addr = peft_model_name_or_path if peft_model_name_or_path is not None else base_model_name_or_path
        if os.path.exists(f"{config_addr}/llm2vec_config.json"):
            with open(f"{config_addr}/llm2vec_config.json", "r") as fIn:
                llm2vec_config = json.load(fIn)
            config.update(llm2vec_config)

        for key, value in encoder_args.items():
            config[key] = value

        return cls(model=model, tokenizer=tokenizer, **config)

    def prepare_for_tokenization(self, text):
        if self.model.config._name_or_path == "meta-llama/Meta-Llama-3-8B-Instruct":
            text = "<|start_header_id|>user<|end_header_id|>\n\n" + text.strip() + "<|eot_id|>"
            return text
        if self.model.config._name_or_path in [
            "mistralai/Mistral-7B-Instruct-v0.2",
            "meta-llama/Llama-2-7b-chat-hf",
        ]:
            text = "[INST] " + text.strip() + " [/INST]"
        if self.model.config._name_or_path in [
            "google/gemma-2-9b-it",
        ]:
            text = "<bos><start_of_turn>user\n" + text.strip() + "<end_of_turn>"
        if self.model.config._name_or_path in [
            "Qwen/Qwen2-1.5B-Instruct",
            "Qwen/Qwen2-7B-Instruct",
        ]:
            text = "<|im_start|>user\n" + text.strip() + "<|im_end|>"
        if self.pooling_mode == "eos_token":
            if self.model.config._name_or_path == "meta-llama/Meta-Llama-3-8B":
                text = text.strip() + "<|end_of_text|>"
            elif isinstance(self.model.config, LlamaConfig) or isinstance(self.model.config, MistralConfig):
                text = text.strip() + " </s>"
            elif isinstance(self.model.config, GemmaConfig):
                text = text.strip() + "<eos>"
            elif isinstance(self.model.config, Qwen2Config):
                text = text.strip() + "<|endoftext|>"
        return text

    def tokenize(self, texts):
        texts_2 = []
        original_texts = []
        for text in texts:
            t = text.split("!@#$%^&*()")
            texts_2.append(t[1] if len(t) > 1 else "")
            original_texts.append("".join(t))

        original = self.tokenizer(
            original_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length,
        )
        embed_mask = None
        for t_i, t in enumerate(texts_2):
            ids = self.tokenizer(
                [t],
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=self.max_length,
                add_special_tokens=False,
            )
            if embed_mask is None:
                e_m = torch.zeros_like(original["attention_mask"][t_i])
                if len(ids["input_ids"][0]) > 0:
                    e_m[-len(ids["input_ids"][0]) :] = torch.ones(len(ids["input_ids"][0]))
                embed_mask = e_m.unsqueeze(0)
            else:
                e_m = torch.zeros_like(original["attention_mask"][t_i])
                if len(ids["input_ids"][0]) > 0:
                    e_m[-len(ids["input_ids"][0]) :] = torch.ones(len(ids["input_ids"][0]))
                embed_mask = torch.cat((embed_mask, e_m.unsqueeze(0)), dim=0)

        original["embed_mask"] = embed_mask
        return original
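How the pieces above fit together: `_convert_to_str` (defined further down) joins an instruction and a text with the "!@#$%^&*()" separator, and `tokenize` uses that separator to build an `embed_mask` that is 1 only on the text tokens, so pooling can ignore the instruction when `skip_instruction=True`. A sketch, assuming `l2v` is an already-constructed `LLM2Vec` instance:

joined = l2v._convert_to_str("Represent the motion description:", "a person walks in a circle")
# -> "Represent the motion description: !@#$%^&*()a person walks in a circle"

features = l2v.tokenize([l2v.prepare_for_tokenization(joined)])
# features["embed_mask"] is zero over the instruction tokens and one over the text
# tokens; get_pooling swaps it in for attention_mask before mean/weighted_mean pooling.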
    def _skip_instruction(self, sentence_feature):
        assert sentence_feature["attention_mask"].shape == sentence_feature["embed_mask"].shape
        sentence_feature["attention_mask"] = sentence_feature["embed_mask"]

    def forward(self, sentence_feature: Dict[str, Tensor]):
        embed_mask = None
        if "embed_mask" in sentence_feature:
            embed_mask = sentence_feature.pop("embed_mask")
        reps = self.model(**sentence_feature)
        sentence_feature["embed_mask"] = embed_mask

        return self.get_pooling(sentence_feature, reps.last_hidden_state)

    def get_pooling(self, features, last_hidden_states):  # All models padded from the left
        assert self.tokenizer.padding_side == "left", "Pooling modes are implemented for padding from left."
        if self.skip_instruction:
            self._skip_instruction(features)
        seq_lengths = features["attention_mask"].sum(dim=-1)
        if self.pooling_mode == "mean":
            return torch.stack(
                [last_hidden_states[i, -length:, :].mean(dim=0) for i, length in enumerate(seq_lengths)],
                dim=0,
            )
        elif self.pooling_mode == "weighted_mean":
            bs, l, _ = last_hidden_states.shape
            complete_weights = torch.zeros(bs, l, device=last_hidden_states.device)
            for i, seq_l in enumerate(seq_lengths):
                if seq_l > 0:
                    complete_weights[i, -seq_l:] = torch.arange(seq_l) + 1
                    complete_weights[i] /= torch.clamp(complete_weights[i].sum(), min=1e-9)
            return torch.sum(last_hidden_states * complete_weights.unsqueeze(-1), dim=1)
        elif self.pooling_mode == "eos_token" or self.pooling_mode == "last_token":
            return last_hidden_states[:, -1]
        elif self.pooling_mode == "bos_token":
            return last_hidden_states[features["input_ids"] == self.tokenizer.bos_token_id]
        else:
            raise ValueError(f"{self.pooling_mode} is not implemented yet.")

    def _convert_to_str(self, instruction, text):
        tokenized_q = self.tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length,
            add_special_tokens=False,
        )
        tokenized_q_length = len(tokenized_q["input_ids"][0])

        while tokenized_q_length > self.doc_max_length:
            reduction_ratio = self.doc_max_length / tokenized_q_length
            reduced_length = int(len(text.split()) * reduction_ratio)
            text = " ".join(text.split()[:reduced_length])
            tokenized_q = self.tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=self.max_length,
                add_special_tokens=False,
            )
            tokenized_q_length = len(tokenized_q["input_ids"][0])

        return f"{instruction.strip()} !@#$%^&*(){text}" if instruction else f"!@#$%^&*(){text}"

    def encode(
        self,
        sentences: Union[str, List[str]],
        batch_size: int = 32,
        show_progress_bar: bool = True,
        convert_to_numpy: bool = False,
        convert_to_tensor: bool = False,
        device: Optional[str] = None,
    ):
        """Encode a list of sentences into their respective embeddings.

        The sentences can be a list of strings or a single string.

        Args:
            sentences: sentence or sentences to encode.
            batch_size: batch size for turning sentence tokens into embeddings.
            show_progress_bar: whether to show progress bars during the encoding steps.
            convert_to_numpy: if True, return numpy arrays instead of torch tensors.
            convert_to_tensor: if True, return torch tensors (default).
            device: torch backend device identifier (e.g. 'cuda', 'cpu', 'mps'). If not
                specified, defaults to cuda when available, otherwise cpu. Note that only
                'cuda' supports multiprocessing as currently implemented.

        Returns:
            Embeddings of the sentences. Embeddings are detached and always on the CPU
            (see the _encode implementation).
        """
        if isinstance(sentences[0], str) and isinstance(sentences[-1], int):
            sentences = [sentences]
        # required for the MEDI version of MTEB
        if isinstance(sentences[0], str):
            sentences = [[""] + [sentence] for sentence in sentences]

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        concatenated_input_texts = []
        for sentence in sentences:
            assert isinstance(sentence[0], str)
            assert isinstance(sentence[1], str)
            concatenated_input_texts.append(self._convert_to_str(sentence[0], sentence[1]))
        sentences = concatenated_input_texts

        self.eval()

        if convert_to_tensor:
            convert_to_numpy = False

        length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences])
        sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
        all_embeddings = []

        if torch.cuda.device_count() <= 1:
            # This branch also supports mps devices
            self.to(device)
            for start_index in trange(
                0,
                len(sentences),
                batch_size,
                desc="Batches",
                disable=not show_progress_bar,
            ):
                sentences_batch = sentences_sorted[start_index : start_index + batch_size]
                embeddings = self._encode(sentences_batch, device=device, convert_to_numpy=convert_to_numpy)
                all_embeddings.append(embeddings)
        else:
            num_proc = torch.cuda.device_count()
            cuda_compatible_multiprocess = mp.get_context("spawn")
            with cuda_compatible_multiprocess.Pool(num_proc) as p:
                sentences_batches = [
                    sentences_sorted[start_index : start_index + batch_size]
                    for start_index in range(0, len(sentences), batch_size)
                ]

                progress_bar = tqdm(
                    total=len(sentences_batches),
                    desc="Batches",
                    disable=not show_progress_bar,
                )
                results = []

                def update(*args):
                    progress_bar.update()

                for batch in sentences_batches:
                    results.append(
                        p.apply_async(
                            self._encode,
                            args=(batch, None, convert_to_numpy, True),
                            callback=update,
                        )
                    )

                all_embeddings = [result.get() for result in results]
                progress_bar.close()

        all_embeddings = torch.cat(all_embeddings, dim=0)
        all_embeddings = all_embeddings[np.argsort(length_sorted_idx)]
        all_embeddings = all_embeddings.to(torch.float32)
        if convert_to_numpy:
            all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])
        return all_embeddings

    def save(self, output_path, merge_before_save=False, save_config=True):
        if merge_before_save and isinstance(self.model, PeftModel):
            self.model = self.model.merge_and_unload()
            # Fixes a saving issue - https://huggingface.co/McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-unsup-simcse/discussions/1
            if hasattr(self.model, "_hf_peft_config_loaded"):
                self.model._hf_peft_config_loaded = False

        self.model.save_pretrained(output_path)
        self.tokenizer.save_pretrained(output_path)

        llm2vec_config = {
            "pooling_mode": self.pooling_mode,
            "max_length": self.max_length,
            "doc_max_length": self.doc_max_length,
            "skip_instruction": self.skip_instruction,
        }

        if save_config:
            os.makedirs(output_path, exist_ok=True)
            with open(f"{output_path}/llm2vec_config.json", "w") as fOut:
                json.dump(llm2vec_config, fOut, indent=4)

    def _encode(
        self,
        sentences_batch,
        device: Optional[str] = None,
        convert_to_numpy: bool = False,
        multiprocessing=False,
    ):
        if multiprocessing:
            # multiprocessing only supports CUDA devices at this time, so we ignore the value
            # of device and use cuda:rank instead
            rank = mp.current_process()._identity[0]
            if device is None and torch.cuda.is_available():
                device = f"cuda:{rank % torch.cuda.device_count()}"

        self.to(device)
        features = self.tokenize([self.prepare_for_tokenization(sentence) for sentence in sentences_batch])
        features = batch_to_device(features, device)

        with torch.no_grad():
            embeddings = self.forward(features)
            embeddings = embeddings.detach()
            embeddings = embeddings.cpu()

        return embeddings

    def _text_length(self, text: Union[List[int], List[List[int]]]):
        """Helper function to get the length of the input text.

        Text can be either a string (a single text), a list of ints (a single
        tokenized text), or a tuple of lists of ints (several text inputs to the model).
        """
        if (
            isinstance(text, str) or (isinstance(text, list) and isinstance(text[0], int)) or len(text) == 0
        ):  # Single text, list of ints, or empty
            return len(text)
        if isinstance(text, dict):  # {key: value} case
            return len(next(iter(text.values())))
        elif not hasattr(text, "__len__"):  # Object has no len() method
            return 1
        else:
            return sum([len(t) for t in text])

    def resize_token_embeddings(
        self,
        new_num_tokens: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
    ) -> nn.Embedding:
        return self.model.resize_token_embeddings(new_num_tokens=new_num_tokens, pad_to_multiple_of=pad_to_multiple_of)

    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
        self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs)
kimodo/model/llm2vec/llm2vec_wrapper.py
ADDED
@@ -0,0 +1,73 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""LLM2Vec encoder wrapper for Kimodo text conditioning."""

import os

import numpy as np
import torch

from .llm2vec import LLM2Vec


class LLM2VecEncoder:
    """LLM2Vec text embeddings."""

    def __init__(
        self,
        base_model_name_or_path: str,
        peft_model_name_or_path: str,
        dtype: str,
        llm_dim: int,
    ) -> None:
        torch_dtype = getattr(torch, dtype)
        self.llm_dim = llm_dim

        cache_dir = os.environ.get("HUGGINGFACE_CACHE_DIR")

        if "TEXT_ENCODERS_DIR" in os.environ:
            base_model_name_or_path = os.path.join(os.environ["TEXT_ENCODERS_DIR"], base_model_name_or_path)
            peft_model_name_or_path = os.path.join(os.environ["TEXT_ENCODERS_DIR"], peft_model_name_or_path)

        self.model = LLM2Vec.from_pretrained(
            base_model_name_or_path=base_model_name_or_path,
            peft_model_name_or_path=peft_model_name_or_path,
            torch_dtype=torch_dtype,
            cache_dir=cache_dir,
        )
        self.model.eval()
        for p in self.model.parameters():
            p.requires_grad = False

    def to(self, device: torch.device):
        self.model = self.model.to(device)
        return self

    def eval(self):
        self.model.eval()
        return self

    def get_device(self):
        return self.model.model.device

    def __call__(self, text: list[str] | str):
        is_string = False
        if isinstance(text, str):
            text = [text]
            is_string = True

        with torch.no_grad():
            encoded_text = self.model.encode(text, batch_size=len(text), show_progress_bar=False)

        assert len(encoded_text.shape) == 2
        assert self.llm_dim == encoded_text.shape[-1]

        encoded_text = encoded_text[:, None]
        lengths = np.ones(len(encoded_text), dtype=int).tolist()

        if is_string:
            encoded_text = encoded_text[0]
            lengths = lengths[0]

        encoded_text = torch.tensor(encoded_text).to(self.get_device())
        return encoded_text, lengths
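A construction sketch for the wrapper (illustrative rather than pinned by the commit: the base/PEFT names below follow the model mentioned in the README, and `llm_dim=4096` assumes the Llama-3-8B hidden size):

encoder = LLM2VecEncoder(
    base_model_name_or_path="McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp",
    peft_model_name_or_path="McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised",
    dtype="bfloat16",
    llm_dim=4096,
)
text_feat, lengths = encoder(["a person walks forward"])
# text_feat: [1, 1, 4096] - one pooled embedding per prompt; lengths: [1]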
kimodo/model/llm2vec/models/__init__.py
ADDED
@@ -0,0 +1,4 @@
# from .bidirectional_gemma import GemmaBiForMNTP, GemmaBiModel
# from .bidirectional_llama import LlamaBiForMNTP, LlamaBiModel
# from .bidirectional_mistral import MistralBiForMNTP, MistralBiModel
# from .bidirectional_qwen2 import Qwen2BiForMNTP, Qwen2BiModel
kimodo/model/llm2vec/models/attn_mask_utils.py
ADDED
@@ -0,0 +1,181 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 McGill NLP
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

from typing import List, Optional, Tuple, Union

import torch
from transformers.modeling_attn_mask_utils import AttentionMaskConverter


def _prepare_4d_causal_attention_mask(
    attention_mask: Optional[torch.Tensor],
    input_shape: Union[torch.Size, Tuple, List],
    inputs_embeds: torch.Tensor,
    past_key_values_length: int,
    sliding_window: Optional[int] = None,
):
    """Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D
    mask of shape `(batch_size, key_value_length)`

    Args:
        attention_mask (`torch.Tensor` or `None`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        inputs_embeds (`torch.Tensor`):
            The embedded inputs as a torch Tensor.
        past_key_values_length (`int`):
            The length of the key value cache.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.
    """
    attn_mask_converter = AttentionMaskConverter(
        is_causal=False, sliding_window=sliding_window
    )  # is_causal=True in the original implementation

    key_value_length = input_shape[-1] + past_key_values_length

    # 4d mask is passed through the layers
    if attention_mask is not None and len(attention_mask.shape) == 2:
        attention_mask = attn_mask_converter.to_4d(
            attention_mask,
            input_shape[-1],
            key_value_length=key_value_length,
            dtype=inputs_embeds.dtype,
        )
    elif attention_mask is not None and len(attention_mask.shape) == 4:
        expected_shape = (input_shape[0], 1, input_shape[1], key_value_length)
        if tuple(attention_mask.shape) != expected_shape:
            raise ValueError(
                f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}."
            )
        else:
            # if the 4D mask has correct shape - invert it and fill with negative infinity
            inverted_mask = 1.0 - attention_mask
            attention_mask = inverted_mask.masked_fill(
                inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min
|
| 75 |
+
)
|
| 76 |
+
else:
|
| 77 |
+
attention_mask = attn_mask_converter.to_causal_4d(
|
| 78 |
+
input_shape[0],
|
| 79 |
+
input_shape[-1],
|
| 80 |
+
key_value_length,
|
| 81 |
+
dtype=inputs_embeds.dtype,
|
| 82 |
+
device=inputs_embeds.device,
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
return attention_mask
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
# Adapted from _prepare_4d_causal_attention_mask
|
| 89 |
+
def _prepare_4d_causal_attention_mask_for_sdpa(
|
| 90 |
+
attention_mask: Optional[torch.Tensor],
|
| 91 |
+
input_shape: Union[torch.Size, Tuple, List],
|
| 92 |
+
inputs_embeds: torch.Tensor,
|
| 93 |
+
past_key_values_length: int,
|
| 94 |
+
sliding_window: Optional[int] = None,
|
| 95 |
+
):
|
| 96 |
+
"""Prepares the correct `attn_mask` argument to be used by
|
| 97 |
+
`torch.nn.functional.scaled_dot_product_attention`.
|
| 98 |
+
|
| 99 |
+
In case no token is masked in the `attention_mask` argument, we simply set it to `None` for the cases `query_length == 1` and
|
| 100 |
+
`key_value_length == query_length`, and rely instead on SDPA `is_causal` argument to use causal/non-causal masks,
|
| 101 |
+
allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is passed).
|
| 102 |
+
"""
|
| 103 |
+
attn_mask_converter = AttentionMaskConverter(
|
| 104 |
+
is_causal=False, sliding_window=sliding_window
|
| 105 |
+
) # is_causal=True in original implementation
|
| 106 |
+
|
| 107 |
+
key_value_length = input_shape[-1] + past_key_values_length
|
| 108 |
+
batch_size, query_length = input_shape
|
| 109 |
+
|
| 110 |
+
# torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True are unable to capture the controlflow `is_causal=attention_mask is None and q_len > 1`
|
| 111 |
+
# used as an SDPA argument. We keep compatibility with these tracing tools by always using SDPA's `attn_mask` argument in case we are tracing.
|
| 112 |
+
# TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400).
|
| 113 |
+
is_tracing = (
|
| 114 |
+
torch.jit.is_tracing()
|
| 115 |
+
or isinstance(inputs_embeds, torch.fx.Proxy)
|
| 116 |
+
or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling())
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
if attention_mask is not None:
|
| 120 |
+
# 4d mask is passed through
|
| 121 |
+
if len(attention_mask.shape) == 4:
|
| 122 |
+
expected_shape = (input_shape[0], 1, input_shape[1], key_value_length)
|
| 123 |
+
if tuple(attention_mask.shape) != expected_shape:
|
| 124 |
+
raise ValueError(
|
| 125 |
+
f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}."
|
| 126 |
+
)
|
| 127 |
+
else:
|
| 128 |
+
# if the 4D mask has correct shape - invert it and fill with negative infinity
|
| 129 |
+
inverted_mask = 1.0 - attention_mask.to(inputs_embeds.dtype)
|
| 130 |
+
attention_mask = inverted_mask.masked_fill(
|
| 131 |
+
inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min
|
| 132 |
+
)
|
| 133 |
+
return attention_mask
|
| 134 |
+
|
| 135 |
+
elif not is_tracing and torch.all(attention_mask == 1):
|
| 136 |
+
if query_length == 1:
|
| 137 |
+
# For query_length == 1, causal attention and bi-directional attention are the same.
|
| 138 |
+
attention_mask = None
|
| 139 |
+
elif key_value_length == query_length:
|
| 140 |
+
attention_mask = None
|
| 141 |
+
else:
|
| 142 |
+
# Unfortunately, for query_length > 1 and key_value_length != query_length, we cannot generally ignore the attention mask, as SDPA causal mask generation
|
| 143 |
+
# may be wrong. We will set `is_causal=False` in SDPA and rely on Transformers attention_mask instead, hence not setting it to None here.
|
| 144 |
+
# Reference: https://github.com/pytorch/pytorch/issues/108108
|
| 145 |
+
pass
|
| 146 |
+
elif query_length > 1 and key_value_length != query_length:
|
| 147 |
+
# See the comment above (https://github.com/pytorch/pytorch/issues/108108).
|
| 148 |
+
# Ugly: we set it to True here to dispatch in the following controlflow to `to_causal_4d`.
|
| 149 |
+
attention_mask = True
|
| 150 |
+
elif is_tracing:
|
| 151 |
+
raise ValueError(
|
| 152 |
+
'Attention using SDPA can not be traced with torch.jit.trace when no attention_mask is provided. To solve this issue, please either load your model with the argument `attn_implementation="eager"` or pass an attention_mask input when tracing the model.'
|
| 153 |
+
)
|
| 154 |
+
|
| 155 |
+
if attention_mask is None:
|
| 156 |
+
expanded_4d_mask = None
|
| 157 |
+
elif attention_mask is True:
|
| 158 |
+
expanded_4d_mask = attn_mask_converter.to_causal_4d(
|
| 159 |
+
input_shape[0],
|
| 160 |
+
input_shape[-1],
|
| 161 |
+
key_value_length,
|
| 162 |
+
dtype=inputs_embeds.dtype,
|
| 163 |
+
device=inputs_embeds.device,
|
| 164 |
+
)
|
| 165 |
+
else:
|
| 166 |
+
expanded_4d_mask = attn_mask_converter.to_4d(
|
| 167 |
+
attention_mask,
|
| 168 |
+
input_shape[-1],
|
| 169 |
+
dtype=inputs_embeds.dtype,
|
| 170 |
+
key_value_length=key_value_length,
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
# Attend to all tokens in masked rows from the causal_mask, for example the relevant first rows when
|
| 174 |
+
# using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
|
| 175 |
+
# Details: https://github.com/pytorch/pytorch/issues/110213
|
| 176 |
+
if not is_tracing and expanded_4d_mask.device.type == "cuda":
|
| 177 |
+
expanded_4d_mask = AttentionMaskConverter._unmask_unattended(
|
| 178 |
+
expanded_4d_mask, min_dtype=torch.finfo(inputs_embeds.dtype).min
|
| 179 |
+
)
|
| 180 |
+
|
| 181 |
+
return expanded_4d_mask
|
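A minimal sketch of what the first helper returns, assuming only what the code above shows: with `is_causal=False` the 4D mask is purely additive (0 where attention is allowed, the dtype minimum at padded keys) and carries no upper-triangular future masking.

# Sketch: batch of 2 sequences of length 4, the second right-padded by two tokens.
import torch

attention_mask = torch.tensor([[1, 1, 1, 1],
                               [1, 1, 0, 0]])
inputs_embeds = torch.zeros(2, 4, 8)  # (batch, query_length, hidden)

mask_4d = _prepare_4d_causal_attention_mask(
    attention_mask=attention_mask,
    input_shape=(2, 4),
    inputs_embeds=inputs_embeds,
    past_key_values_length=0,
)
assert mask_4d.shape == (2, 1, 4, 4)
# mask_4d[1, 0] has ~-3.4e38 in the last two key columns and 0 elsewhere:
# padding is blocked, but every real token can attend forwards and backwards.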
kimodo/model/llm2vec/models/bidirectional_llama.py
ADDED
@@ -0,0 +1,224 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 McGill NLP
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from peft import PeftModel
from torch import nn
from transformers import LlamaConfig, LlamaForCausalLM, LlamaModel, LlamaPreTrainedModel
from transformers.cache_utils import Cache, StaticCache
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
from transformers.models.llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    # LlamaFlashAttention2,
    LlamaMLP,
    LlamaRMSNorm,
    LlamaRotaryEmbedding,
    # LlamaSdpaAttention,
)
from transformers.utils import logging

from .utils import is_transformers_attn_greater_or_equal_4_43_1

logger = logging.get_logger(__name__)


class ModifiedLlamaAttention(LlamaAttention):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.is_causal = False


# class ModifiedLlamaFlashAttention2(LlamaFlashAttention2):
#     def __init__(self, *args, **kwargs):
#         super().__init__(*args, **kwargs)
#         self.is_causal = False


# class ModifiedLlamaSdpaAttention(LlamaSdpaAttention):
#     def __init__(self, *args, **kwargs):
#         super().__init__(*args, **kwargs)
#         self.is_causal = False


# LLAMA_ATTENTION_CLASSES = {
#     "eager": ModifiedLlamaAttention,
#     "flash_attention_2": ModifiedLlamaFlashAttention2,
#     "sdpa": ModifiedLlamaSdpaAttention,
# }


class ModifiedLlamaDecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: LlamaConfig, layer_idx: int):
        nn.Module.__init__(self)
        self.hidden_size = config.hidden_size

        self.self_attn = ModifiedLlamaAttention(config=config, layer_idx=layer_idx)
        # self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](
        #     config=config, layer_idx=layer_idx
        # )

        self.mlp = LlamaMLP(config)
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)


class LlamaBiModel(LlamaModel):
    _no_split_modules = ["ModifiedLlamaDecoderLayer"]

    def __init__(self, config: LlamaConfig):
        if not is_transformers_attn_greater_or_equal_4_43_1():
            raise ValueError(
                "The current implementation of LlamaEncoderModel follows modeling_llama.py of transformers version >= 4.43.1"
            )
        LlamaPreTrainedModel.__init__(self, config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [ModifiedLlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = LlamaRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def _update_causal_mask(
        self,
        attention_mask,
        input_tensor,
        cache_position,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        # if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
        #     if AttentionMaskConverter._ignore_causal_mask_sdpa(
        #         attention_mask,
        #         inputs_embeds=input_tensor,
        #         past_key_values_length=past_seen_tokens,
        #         is_training=self.training,
        #     ):
        #         return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_length()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        causal_mask = torch.zeros(
            (sequence_length, target_length), dtype=dtype, device=device
        )  # in original implementation - torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
        # Commenting out next 2 lines to disable causal masking
        # if sequence_length != 1:
        #     causal_mask = torch.triu(causal_mask, diagonal=1)
        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
        causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
        if attention_mask is not None:
            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
            if attention_mask.dim() == 2:
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
                causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)
            elif attention_mask.dim() == 4:
                # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with
                # cache. In that case, the 4D attention mask attends to the newest tokens only.
                if attention_mask.shape[-2] < cache_position[0] + sequence_length:
                    offset = cache_position[0]
                else:
                    offset = 0
                mask_shape = attention_mask.shape
                mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype
                causal_mask[
                    : mask_shape[0],
                    : mask_shape[1],
                    offset : mask_shape[2] + offset,
                    : mask_shape[3],
                ] = mask_slice

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask


class LlamaBiForMNTP(LlamaForCausalLM):
    def __init__(self, config):
        LlamaPreTrainedModel.__init__(self, config)
        self.model = LlamaBiModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    # getter for PEFT model
    def get_model_for_peft(self):
        return self.model

    # setter for PEFT model
    def set_model_for_peft(self, model: PeftModel):
        self.model = model

    # save the PEFT model
    def save_peft_model(self, path):
        self.model.save_pretrained(path)
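To make the effect of the commented-out `torch.triu` concrete, here is an illustrative sketch (tiny made-up config, not from this repo) that inspects the mask `_update_causal_mask` builds: it starts from zeros (everything visible) and only re-masks padding, which is what makes the model bidirectional.

# Illustrative sketch with a made-up tiny config; requires transformers >= 4.43.1.
import torch
from transformers import LlamaConfig

config = LlamaConfig(
    vocab_size=128, hidden_size=32, intermediate_size=64,
    num_hidden_layers=1, num_attention_heads=4, num_key_value_heads=4,
)
model = LlamaBiModel(config)

attention_mask = torch.tensor([[1, 1, 1, 0]])  # last position is padding
mask = model._update_causal_mask(
    attention_mask,
    input_tensor=torch.zeros(1, 4, 32),
    cache_position=torch.arange(4),
    past_key_values=None,
    output_attentions=False,
)
# mask[0, 0] is 4x4 with zeros everywhere except the padded key column:
# token 0 can attend to tokens 1-3, i.e. no causal triangle remains.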
kimodo/model/llm2vec/models/utils.py
ADDED
@@ -0,0 +1,32 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 McGill NLP
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

import importlib.metadata

from packaging import version
from transformers.utils.import_utils import _is_package_available


def is_transformers_attn_greater_or_equal_4_43_1():
    if not _is_package_available("transformers"):
        return False

    return version.parse(importlib.metadata.version("transformers")) >= version.parse("4.43.1")
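The guard above is what `LlamaBiModel.__init__` relies on; the same pattern can gate the import itself, e.g.:

# Sketch: gate the bidirectional import on the version check above.
from kimodo.model.llm2vec.models.utils import is_transformers_attn_greater_or_equal_4_43_1

if is_transformers_attn_greater_or_equal_4_43_1():
    from kimodo.model.llm2vec.models.bidirectional_llama import LlamaBiModel  # noqa: F401
else:
    raise ImportError("LlamaBiModel requires transformers >= 4.43.1")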
kimodo/model/load_model.py
ADDED
@@ -0,0 +1,194 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Load Kimodo diffusion models from local checkpoints or Hugging Face."""

from pathlib import Path
from typing import Optional

from huggingface_hub import snapshot_download
from omegaconf import OmegaConf

from .loading import (
    AVAILABLE_MODELS,
    DEFAULT_MODEL,
    DEFAULT_TEXT_ENCODER_URL,
    MODEL_NAMES,
    TMR_MODELS,
    get_env_var,
    instantiate_from_dict,
)
from .registry import get_model_info, resolve_model_name

DEFAULT_TEXT_ENCODER = "llm2vec"
TEXT_ENCODER_PRESETS = {
    "llm2vec": {
        "target": "kimodo.model.LLM2VecEncoder",
        "kwargs": {
            "base_model_name_or_path": "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp",
            "peft_model_name_or_path": "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised",
            "dtype": "bfloat16",
            "llm_dim": 4096,
        },
    }
}


def _resolve_hf_model_path(modelname: str) -> Path:
    """Resolve a model name to a local path via the Hugging Face cache, downloading if needed."""
    try:
        repo_id = MODEL_NAMES[modelname]
    except KeyError:
        raise ValueError(f"Model '{modelname}' not found. Available models: {list(MODEL_NAMES.keys())}")

    local_cache = get_env_var("LOCAL_CACHE", "False").lower() == "true"
    if not local_cache:
        snapshot_dir = snapshot_download(repo_id=repo_id)  # will check online no matter what
        return Path(snapshot_dir)

    try:
        snapshot_dir = snapshot_download(repo_id=repo_id, local_files_only=True)  # local cache only
        return Path(snapshot_dir)
    except Exception:
        # If the local cache does not have the model, fall back to downloading it.
        try:
            snapshot_dir = snapshot_download(repo_id=repo_id)
            return Path(snapshot_dir)
        except Exception:
            raise RuntimeError(f"Could not resolve model '{modelname}' from Hugging Face (repo: {repo_id}).") from None


def _build_api_text_encoder_conf(text_encoder_url: str) -> dict:
    return {
        "_target_": "kimodo.model.text_encoder_api.TextEncoderAPI",
        "url": text_encoder_url,
    }


def _build_local_text_encoder_conf() -> dict:
    text_encoder_name = get_env_var("TEXT_ENCODER", DEFAULT_TEXT_ENCODER)
    if text_encoder_name not in TEXT_ENCODER_PRESETS:
        available = ", ".join(sorted(TEXT_ENCODER_PRESETS))
        raise ValueError(f"Unknown TEXT_ENCODER='{text_encoder_name}'. Available: {available}")

    preset = TEXT_ENCODER_PRESETS[text_encoder_name]
    return {
        "_target_": preset["target"],
        **preset["kwargs"],
    }


def _select_text_encoder_conf(text_encoder_url: str) -> dict:
    # TEXT_ENCODER_MODE options:
    # - "api": force TextEncoderAPI
    # - "local": force local LLM2VecEncoder
    # - "auto": try API first, fall back to local if unreachable
    mode = get_env_var("TEXT_ENCODER_MODE", "auto").lower()
    if mode == "local":
        return _build_local_text_encoder_conf()
    if mode == "api":
        return _build_api_text_encoder_conf(text_encoder_url)

    api_conf = _build_api_text_encoder_conf(text_encoder_url)
    try:
        text_encoder = instantiate_from_dict(api_conf)
        # Probe availability early so inference doesn't fail later.
        text_encoder(["healthcheck"])
        return api_conf
    except Exception as error:
        print(
            "Text encoder service is unreachable, falling back to local LLM2Vec "
            f"encoder. ({type(error).__name__}: {error})"
        )
        return _build_local_text_encoder_conf()


def load_model(
    modelname=None,
    device=None,
    eval_mode: bool = True,
    default_family: Optional[str] = "Kimodo",
    return_resolved_name: bool = False,
):
    """Load a kimodo model by name (e.g. 'g1', 'soma').

    Resolution of partial/full names (e.g. Kimodo-SOMA-RP-v1, SOMA) is done
    inside this function using default_family when the name is not a known
    short key.

    Args:
        modelname: Model identifier; uses DEFAULT_MODEL if None. Can be a short key,
            a full name (e.g. Kimodo-SOMA-RP-v1), or a partial name; unknown names
            are resolved via resolve_model_name using default_family.
        device: Target device for the model (e.g. 'cuda', 'cpu').
        eval_mode: If True, set the model to eval mode.
        default_family: Used when modelname is not in AVAILABLE_MODELS to resolve
            partial names ("Kimodo" for demo/generation, "TMR" for the embed script).
            Default "Kimodo".
        return_resolved_name: If True, return (model, resolved_short_key). If False,
            return only the model.

    Returns:
        Loaded model in eval mode, or (model, resolved short key) if
        return_resolved_name is True.

    Raises:
        ValueError: If modelname is not in AVAILABLE_MODELS and cannot be resolved.
        FileNotFoundError: If config.yaml is missing in the checkpoint folder.
    """
    if modelname is None:
        modelname = DEFAULT_MODEL
    if modelname not in AVAILABLE_MODELS:
        if default_family is not None:
            modelname = resolve_model_name(modelname, default_family)
        else:
            raise ValueError(
                f"""The model is not recognized.
                Please choose between: {AVAILABLE_MODELS}"""
            )

    resolved_modelname = modelname

    # In case a custom checkpoint directory is specified
    configured_checkpoint_dir = get_env_var("CHECKPOINT_DIR")
    if configured_checkpoint_dir:
        print(f"CHECKPOINT_DIR is set to {configured_checkpoint_dir}, checking the local cache...")
        # Checkpoint folders are named by display name (e.g. Kimodo-SOMA-RP-v1)
        info = get_model_info(modelname)
        checkpoint_folder_name = info.display_name if info is not None else modelname
        model_path = Path(configured_checkpoint_dir) / checkpoint_folder_name
        if not model_path.exists() and modelname != checkpoint_folder_name:
            # Fallback: try the short key for backward compatibility
            model_path = Path(configured_checkpoint_dir) / modelname
        if not model_path.exists():
            print(f"Model folder not found at '{model_path}', downloading it from Hugging Face...")
            model_path = _resolve_hf_model_path(modelname)
    else:
        # Otherwise, load the model from the local cache or download it from Hugging Face.
        model_path = _resolve_hf_model_path(modelname)

    model_config_path = model_path / "config.yaml"
    if not model_config_path.exists():
        raise FileNotFoundError(f"The model checkpoint folder exists but config.yaml is missing: {model_config_path}")

    model_conf = OmegaConf.load(model_config_path)

    if modelname in TMR_MODELS:
        # Same process at the moment for TMR and Kimodo
        pass

    text_encoder_url = get_env_var("TEXT_ENCODER_URL", DEFAULT_TEXT_ENCODER_URL)
    runtime_conf = OmegaConf.create(
        {
            "checkpoint_dir": str(model_path),
            "text_encoder": _select_text_encoder_conf(text_encoder_url),
        }
    )
    model_cfg = OmegaConf.to_container(OmegaConf.merge(model_conf, runtime_conf), resolve=True)
    model_cfg.pop("checkpoint_dir", None)

    model = instantiate_from_dict(model_cfg, overrides={"device": device})
    if eval_mode:
        model = model.eval()
    if return_resolved_name:
        return model, resolved_modelname
    return model
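A hedged usage sketch of `load_model` with the environment variables it reads; the paths and the CUDA device below are illustrative:

# Usage sketch; TEXT_ENCODER_MODE / CHECKPOINT_DIR are the env vars read above.
import os

os.environ["TEXT_ENCODER_MODE"] = "local"     # skip the API healthcheck probe
os.environ["CHECKPOINT_DIR"] = "/data/ckpts"  # optional: folders named Kimodo-SOMA-RP-v1, ...

from kimodo import load_model

# Partial names go through resolve_model_name: "soma" -> kimodo-soma-rp (latest version).
model, short_key = load_model("soma", device="cuda", return_resolved_name=True)
print(short_key)  # kimodo-soma-rp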
kimodo/model/loading.py
ADDED
@@ -0,0 +1,81 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Model loading utilities: checkpoints, registry, env, and Hydra-based instantiation."""

import os
from pathlib import Path
from typing import Any, Dict, Optional, Union

import torch
from hydra.utils import instantiate
from omegaconf import OmegaConf
from safetensors.torch import load_file as load_safetensors

from .registry import (
    AVAILABLE_MODELS,
    DEFAULT_MODEL,
    DEFAULT_TEXT_ENCODER_URL,
    KIMODO_MODELS,
    MODEL_NAMES,
    TMR_MODELS,
)


def get_env_var(name: str, default: Optional[str] = None) -> Optional[str]:
    """Return environment variable value, or default if unset/empty."""
    return os.environ.get(name) or default


def instantiate_from_dict(
    cfg: Dict[str, Any],
    overrides: Optional[Dict[str, Any]] = None,
):
    """Instantiate an object from a config dict (e.g. from OmegaConf.to_container).

    The dict must contain _target_ with a fully qualified class path. Nested configs are
    instantiated recursively.
    """
    if overrides:
        cfg = {**cfg, **overrides}
    conf = OmegaConf.create(cfg)
    return instantiate(conf)


def load_checkpoint_state_dict(ckpt_path: Union[str, Path]) -> dict:
    """Load a state dict from a checkpoint file.

    If the checkpoint is a dict with a 'state_dict' key (e.g. PyTorch Lightning),
    that is returned; otherwise the whole checkpoint is treated as the state dict.

    Args:
        ckpt_path: Path to the checkpoint file.

    Returns:
        state_dict suitable for model.load_state_dict().
    """
    ckpt_path = str(ckpt_path)

    if ckpt_path.endswith(".safetensors"):
        state_dict = load_safetensors(ckpt_path)
    else:
        checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=False)
        if isinstance(checkpoint, dict) and "state_dict" in checkpoint:
            state_dict = checkpoint["state_dict"]
        elif isinstance(checkpoint, dict):
            state_dict = checkpoint
        else:
            raise ValueError(f"Unsupported checkpoint format: {ckpt_path}")
    return {key: val.detach().cpu() for key, val in state_dict.items()}


__all__ = [
    "get_env_var",
    "instantiate_from_dict",
    "KIMODO_MODELS",
    "TMR_MODELS",
    "AVAILABLE_MODELS",
    "MODEL_NAMES",
    "DEFAULT_MODEL",
    "DEFAULT_TEXT_ENCODER_URL",
    "load_checkpoint_state_dict",
]
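A minimal sketch of the `_target_` convention `instantiate_from_dict` follows (the target here is a standard-library class chosen just for illustration):

# Sketch: _target_ names any importable callable; remaining keys become kwargs,
# and `overrides` shallow-merges on top before instantiation.
cfg = {"_target_": "collections.Counter", "red": 2, "blue": 1}
counter = instantiate_from_dict(cfg, overrides={"blue": 5})
print(counter)  # Counter({'blue': 5, 'red': 2})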
kimodo/model/registry.py
ADDED
|
@@ -0,0 +1,473 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 3 |
+
"""Registry of model names and Hugging Face repo IDs for Kimodo and TMR.
|
| 4 |
+
|
| 5 |
+
Canonical source of truth is the list of repo IDs. Short keys (e.g. soma-rp) and metadata (dataset,
|
| 6 |
+
skeleton, version, display name) are derived by parsing.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import re
|
| 10 |
+
from dataclasses import dataclass
|
| 11 |
+
from typing import Optional
|
| 12 |
+
|
| 13 |
+
# Canonical list: repo IDs in the same syntax as Hugging Face (org/Model-Name-v1).
|
| 14 |
+
# Parser expects: org/Family-SKELETON-DATASET-version (e.g. Kimodo-SOMA-RP-v1).
|
| 15 |
+
KIMODO_REPO_IDS = [
|
| 16 |
+
"nvidia/Kimodo-SOMA-RP-v1",
|
| 17 |
+
"nvidia/Kimodo-SMPLX-RP-v1",
|
| 18 |
+
"nvidia/Kimodo-G1-RP-v1",
|
| 19 |
+
"nvidia/Kimodo-SOMA-SEED-v1",
|
| 20 |
+
"nvidia/Kimodo-G1-SEED-v1",
|
| 21 |
+
]
|
| 22 |
+
TMR_REPO_IDS = [
|
| 23 |
+
"nvidia/TMR-SOMA-RP-v1",
|
| 24 |
+
]
|
| 25 |
+
|
| 26 |
+
# Repo ID without org, for display (e.g. Kimodo-SOMA-RP-v1).
|
| 27 |
+
_REPO_NAME_PATTERN = re.compile(r"^(Kimodo|TMR)-([A-Za-z0-9]+)-(RP|SEED)-v(\d+)$")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass
|
| 31 |
+
class ModelInfo:
|
| 32 |
+
"""Structured metadata for one model, derived from its repo ID."""
|
| 33 |
+
|
| 34 |
+
repo_id: str
|
| 35 |
+
short_key: str
|
| 36 |
+
family: str
|
| 37 |
+
skeleton: str
|
| 38 |
+
dataset: str
|
| 39 |
+
version: str
|
| 40 |
+
display_name: str
|
| 41 |
+
|
| 42 |
+
@property
|
| 43 |
+
def dataset_ui_label(self) -> str:
|
| 44 |
+
return "Rigplay" if self.dataset == "RP" else "SEED"
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _parse_repo_id(repo_id: str) -> Optional[ModelInfo]:
|
| 48 |
+
"""Parse a repo ID into ModelInfo.
|
| 49 |
+
|
| 50 |
+
Returns None if format is unrecognized.
|
| 51 |
+
"""
|
| 52 |
+
# repo_id is "org/Model-Name-v1"
|
| 53 |
+
if "/" in repo_id:
|
| 54 |
+
_, name = repo_id.split("/", 1)
|
| 55 |
+
else:
|
| 56 |
+
name = repo_id
|
| 57 |
+
m = _REPO_NAME_PATTERN.match(name)
|
| 58 |
+
if not m:
|
| 59 |
+
return None
|
| 60 |
+
family, skeleton, dataset, ver = m.groups()
|
| 61 |
+
# Normalize skeleton for display (as is for now)
|
| 62 |
+
skeleton_display = skeleton
|
| 63 |
+
# Include family so Kimodo-SOMA-RP and TMR-SOMA-RP have distinct keys.
|
| 64 |
+
short_key = f"{family.lower()}-{skeleton.lower()}-{dataset.lower()}"
|
| 65 |
+
return ModelInfo(
|
| 66 |
+
repo_id=repo_id,
|
| 67 |
+
short_key=short_key,
|
| 68 |
+
family=family,
|
| 69 |
+
skeleton=skeleton_display,
|
| 70 |
+
dataset=dataset,
|
| 71 |
+
version=f"v{ver}",
|
| 72 |
+
display_name=name,
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _build_registry() -> tuple[list[ModelInfo], dict[str, str], list[str]]:
|
| 77 |
+
"""Build model infos, short_key -> repo_id map, and list of short keys.
|
| 78 |
+
|
| 79 |
+
When multiple versions exist for the same (family, skeleton, dataset), the base short_key (e.g.
|
| 80 |
+
kimodo-soma-rp) maps to the latest version's repo_id so that HF resolution finds the newest
|
| 81 |
+
model.
|
| 82 |
+
"""
|
| 83 |
+
|
| 84 |
+
def _version_key(info: ModelInfo) -> int:
|
| 85 |
+
v = info.version
|
| 86 |
+
if v.startswith("v") and v[1:].isdigit():
|
| 87 |
+
return int(v[1:])
|
| 88 |
+
return 0
|
| 89 |
+
|
| 90 |
+
all_repos = KIMODO_REPO_IDS + TMR_REPO_IDS
|
| 91 |
+
infos: list[ModelInfo] = []
|
| 92 |
+
for repo_id in all_repos:
|
| 93 |
+
info = _parse_repo_id(repo_id)
|
| 94 |
+
if info is None:
|
| 95 |
+
raise ValueError(f"Registry repo ID does not match expected pattern: {repo_id}")
|
| 96 |
+
infos.append(info)
|
| 97 |
+
|
| 98 |
+
# Map each base short_key to the latest version's repo_id (by version number)
|
| 99 |
+
model_names: dict[str, str] = {}
|
| 100 |
+
seen_short_keys: set[str] = set()
|
| 101 |
+
for info in infos:
|
| 102 |
+
if info.short_key in seen_short_keys:
|
| 103 |
+
continue
|
| 104 |
+
seen_short_keys.add(info.short_key)
|
| 105 |
+
candidates = [
|
| 106 |
+
i for i in infos if i.family == info.family and i.skeleton == info.skeleton and i.dataset == info.dataset
|
| 107 |
+
]
|
| 108 |
+
if candidates:
|
| 109 |
+
latest = max(candidates, key=_version_key)
|
| 110 |
+
model_names[info.short_key] = latest.repo_id
|
| 111 |
+
|
| 112 |
+
return infos, model_names, list(model_names.keys())
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
MODEL_INFOS, MODEL_NAMES, _SHORT_KEYS = _build_registry()
|
| 116 |
+
AVAILABLE_MODELS = _SHORT_KEYS
|
| 117 |
+
|
| 118 |
+
# Short-key lists for Kimodo vs TMR (load_model uses TMR_MODELS to branch).
|
| 119 |
+
KIMODO_MODELS = [info.short_key for info in MODEL_INFOS if info.family == "Kimodo"]
|
| 120 |
+
TMR_MODELS = [info.short_key for info in MODEL_INFOS if info.family == "TMR"]
|
| 121 |
+
|
| 122 |
+
# Backward compatibility: FRIENDLY_NAMES for any code that still expects it.
|
| 123 |
+
FRIENDLY_NAMES = {info.short_key: info.display_name for info in MODEL_INFOS}
|
| 124 |
+
|
| 125 |
+
DEFAULT_MODEL = "kimodo-soma-rp"
|
| 126 |
+
DEFAULT_TEXT_ENCODER_URL = "http://127.0.0.1:9550/"
|
| 127 |
+
|
| 128 |
+
# Friendly names for skeleton dropdown (key -> label).
|
| 129 |
+
SKELETON_DISPLAY_NAMES = {
|
| 130 |
+
"SOMA": "SOMA Human Body",
|
| 131 |
+
"SMPLX": "SMPLX Human Body",
|
| 132 |
+
"G1": "Unitree G1 Humanoid Robot",
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
# Order for skeleton dropdown: SOMA, SMPLX, G1.
|
| 136 |
+
SKELETON_ORDER = ("SOMA", "SMPLX", "G1")
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def get_skeleton_display_name(skeleton_key: str) -> str:
|
| 140 |
+
"""Return the UI label for a skeleton key (e.g. SOMA -> SOMA Human Body)."""
|
| 141 |
+
return SKELETON_DISPLAY_NAMES.get(skeleton_key, skeleton_key)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def get_skeleton_key_from_display_name(display_name: str) -> Optional[str]:
|
| 145 |
+
"""Return the skeleton key for a UI label, or None."""
|
| 146 |
+
for key, label in SKELETON_DISPLAY_NAMES.items():
|
| 147 |
+
if label == display_name:
|
| 148 |
+
return key
|
| 149 |
+
return None
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def get_skeleton_display_names_for_dataset(dataset_ui_label: str, family: Optional[str] = None) -> list[str]:
|
| 153 |
+
"""Return skeleton UI labels for the given dataset.
|
| 154 |
+
|
| 155 |
+
If family is set (e.g. "Kimodo"), only skeletons with a model of that family are included.
|
| 156 |
+
"""
|
| 157 |
+
keys = get_skeletons_for_dataset(dataset_ui_label, family=family)
|
| 158 |
+
return [get_skeleton_display_name(k) for k in keys]
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def get_short_key(repo_id: str) -> Optional[str]:
|
| 162 |
+
"""Return the short key for a repo ID, or None if not in registry."""
|
| 163 |
+
for info in MODEL_INFOS:
|
| 164 |
+
if info.repo_id == repo_id:
|
| 165 |
+
return info.short_key
|
| 166 |
+
return None
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def get_model_info(short_key: str) -> Optional[ModelInfo]:
|
| 170 |
+
"""Return ModelInfo for a short key, or None if not found.
|
| 171 |
+
|
| 172 |
+
When multiple versions share the same short_key, returns the one used for loading (the latest
|
| 173 |
+
version), so CHECKPOINT_DIR and HF use the same version.
|
| 174 |
+
"""
|
| 175 |
+
repo_id = MODEL_NAMES.get(short_key)
|
| 176 |
+
if repo_id is None:
|
| 177 |
+
return None
|
| 178 |
+
for info in MODEL_INFOS:
|
| 179 |
+
if info.repo_id == repo_id:
|
| 180 |
+
return info
|
| 181 |
+
return None
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def get_short_key_from_display_name(display_name: str) -> Optional[str]:
|
| 185 |
+
"""Return short_key for a display name (e.g. Kimodo-SOMA-RP-v1), or None."""
|
| 186 |
+
for info in MODEL_INFOS:
|
| 187 |
+
if info.display_name == display_name:
|
| 188 |
+
return info.short_key
|
| 189 |
+
return None
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def get_models_for_demo() -> list[ModelInfo]:
|
| 193 |
+
"""Return all model infos in registry order (for demo model list)."""
|
| 194 |
+
return list(MODEL_INFOS)
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def get_datasets(family: Optional[str] = None) -> list[str]:
|
| 198 |
+
"""Return unique dataset UI labels (Rigplay, SEED) present in registry.
|
| 199 |
+
|
| 200 |
+
If family is set (e.g. "Kimodo"), only datasets that have a model of that family are included.
|
| 201 |
+
"""
|
| 202 |
+
infos = MODEL_INFOS
|
| 203 |
+
if family is not None:
|
| 204 |
+
infos = [i for i in infos if i.family == family]
|
| 205 |
+
labels = set()
|
| 206 |
+
for info in infos:
|
| 207 |
+
labels.add(info.dataset_ui_label)
|
| 208 |
+
return sorted(labels)
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def get_skeletons_for_dataset(dataset_ui_label: str, family: Optional[str] = None) -> list[str]:
|
| 212 |
+
"""Return skeleton names that have a model for the given dataset.
|
| 213 |
+
|
| 214 |
+
Order: SOMA, SMPLX, G1 (only those present for the dataset).
|
| 215 |
+
If family is set (e.g. "Kimodo"), only skeletons with a model of that
|
| 216 |
+
family are included.
|
| 217 |
+
"""
|
| 218 |
+
dataset = "RP" if dataset_ui_label == "Rigplay" else "SEED"
|
| 219 |
+
infos = MODEL_INFOS
|
| 220 |
+
if family is not None:
|
| 221 |
+
infos = [i for i in infos if i.family == family]
|
| 222 |
+
skeletons = set()
|
| 223 |
+
for info in infos:
|
| 224 |
+
if info.dataset == dataset:
|
| 225 |
+
skeletons.add(info.skeleton)
|
| 226 |
+
return [s for s in SKELETON_ORDER if s in skeletons]
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def get_versions_for_dataset_skeleton(dataset_ui_label: str, skeleton: str) -> list[str]:
|
| 230 |
+
"""Return version strings (e.g. v1) for the given dataset/skeleton.
|
| 231 |
+
|
| 232 |
+
Sorted by version number so the last element is the highest (e.g. v1, v2).
|
| 233 |
+
"""
|
| 234 |
+
dataset = "RP" if dataset_ui_label == "Rigplay" else "SEED"
|
| 235 |
+
versions = []
|
| 236 |
+
for info in MODEL_INFOS:
|
| 237 |
+
if info.dataset == dataset and info.skeleton == skeleton:
|
| 238 |
+
versions.append(info.version)
|
| 239 |
+
|
| 240 |
+
# Sort by numeric part so v2 comes after v1.
|
| 241 |
+
def version_key(v: str) -> int:
|
| 242 |
+
if v.startswith("v") and v[1:].isdigit():
|
| 243 |
+
return int(v[1:])
|
| 244 |
+
return 0
|
| 245 |
+
|
| 246 |
+
return sorted(set(versions), key=version_key)
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def get_models_for_dataset_skeleton(
|
| 250 |
+
dataset_ui_label: str, skeleton: str, family: Optional[str] = None
|
| 251 |
+
) -> list[ModelInfo]:
|
| 252 |
+
"""Return model infos for the given dataset/skeleton, sorted by version (max first).
|
| 253 |
+
|
| 254 |
+
Used to build the Version dropdown (options = full display names, one per model). If family is
|
| 255 |
+
set (e.g. "Kimodo"), only models of that family are returned.
|
| 256 |
+
"""
|
| 257 |
+
dataset = "RP" if dataset_ui_label == "Rigplay" else "SEED"
|
| 258 |
+
infos = [info for info in MODEL_INFOS if info.dataset == dataset and info.skeleton == skeleton]
|
| 259 |
+
if family is not None:
|
| 260 |
+
infos = [i for i in infos if i.family == family]
|
| 261 |
+
|
| 262 |
+
def version_key(info: ModelInfo) -> int:
|
| 263 |
+
v = info.version
|
| 264 |
+
if v.startswith("v") and v[1:].isdigit():
|
| 265 |
+
return int(v[1:])
|
| 266 |
+
return 0
|
| 267 |
+
|
| 268 |
+
return sorted(infos, key=version_key, reverse=True)
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
def resolve_to_short_key(dataset_ui_label: str, skeleton: str, version: str) -> Optional[str]:
|
| 272 |
+
"""Return the short key for (dataset, skeleton, version), or None."""
|
| 273 |
+
for info in MODEL_INFOS:
|
| 274 |
+
if info.dataset_ui_label == dataset_ui_label and info.skeleton == skeleton and info.version == version:
|
| 275 |
+
return info.short_key
|
| 276 |
+
return None
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
# -----------------------------------------------------------------------------
|
| 280 |
+
# Flexible model name resolution (partial names, case-insensitive, defaults)
|
| 281 |
+
# -----------------------------------------------------------------------------
|
| 282 |
+
|
| 283 |
+
_FAMILY_ALIASES = {"kimodo": "Kimodo", "tmr": "TMR"}
|
| 284 |
+
_DATASET_ALIASES = {"rp": "RP", "rigplay": "RP", "seed": "SEED"}
|
| 285 |
+
_SKELETON_ALIASES = {
|
| 286 |
+
"soma": "SOMA",
|
| 287 |
+
"smplx": "SMPLX",
|
| 288 |
+
"g1": "G1",
|
| 289 |
+
}
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
def _normalize_family(s: str) -> Optional[str]:
|
| 293 |
+
"""Return canonical family (Kimodo/TMR) or None if unknown."""
|
| 294 |
+
return _FAMILY_ALIASES.get(s.strip().lower())
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def _normalize_dataset(s: str) -> Optional[str]:
|
| 298 |
+
"""Return canonical dataset (RP/SEED) or None if unknown."""
|
| 299 |
+
return _DATASET_ALIASES.get(s.strip().lower())
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
def _normalize_skeleton(s: str) -> Optional[str]:
|
| 303 |
+
"""Return canonical skeleton (SOMA/SMPLX/G1) or None if unknown."""
|
| 304 |
+
return _SKELETON_ALIASES.get(s.strip().lower())
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
def _get_latest_for_family_skeleton_dataset(family: str, skeleton: str, dataset: str) -> Optional[ModelInfo]:
|
| 308 |
+
"""Return the model info with the highest version for (family, skeleton, dataset)."""
|
| 309 |
+
candidates = [
|
| 310 |
+
        info for info in MODEL_INFOS if info.family == family and info.skeleton == skeleton and info.dataset == dataset
    ]
    if not candidates:
        return None

    def version_key(info: ModelInfo) -> int:
        v = info.version
        if v.startswith("v") and v[1:].isdigit():
            return int(v[1:])
        return 0

    return max(candidates, key=version_key)


def kimodo_short_key_for_skeleton_dataset(skeleton: str, dataset: str) -> Optional[str]:
    """Return the latest Kimodo model short_key for ``skeleton`` and ``dataset`` (RP/SEED), or None."""
    info = _get_latest_for_family_skeleton_dataset("Kimodo", skeleton, dataset)
    return info.short_key if info is not None else None


def registry_skeleton_for_joint_count(nb_joints: int) -> str:
    """Map a motion joint count to a registry skeleton key (SOMA / SMPLX / G1)."""
    if nb_joints == 34:
        return "G1"
    if nb_joints == 22:
        return "SMPLX"
    if nb_joints in (77, 30):
        return "SOMA"
    raise ValueError(f"No Kimodo model registered for motion with J={nb_joints}")


# Full form, optional version: Family-Skeleton-Dataset-vN or Family-Skeleton-Dataset
# (lowercase alternatives are covered by re.IGNORECASE)
_RESOLVE_FULL_PATTERN = re.compile(
    r"^(Kimodo|TMR)[\-_]([A-Za-z0-9]+)[\-_](RP|SEED)(?:[\-_]v(\d+))?$",
    re.IGNORECASE,
)
# Partial form: Skeleton-Dataset, Skeleton, or Dataset (no family)
_RESOLVE_PARTIAL_PATTERN = re.compile(
    r"^([A-Za-z0-9]+)(?:[\-_](RP|SEED))?(?:[\-_]v(\d+))?$",
    re.IGNORECASE,
)


def resolve_model_name(name: Optional[str], default_family: Optional[str] = None) -> str:
    """Resolve a user-facing model name to a short_key.

    Accepts full names (e.g. Kimodo-SOMA-RP-v1) with case-insensitive matching,
    and partial names with defaults: dataset=RP, skeleton=SOMA, family from
    default_family (Kimodo for demo/generation, TMR for the embed script).
    An omitted version resolves to the latest version of that model.

    Args:
        name: User-provided name (can be None or empty).
        default_family: "Kimodo" or "TMR", used when name is empty or omits the family.

    Returns:
        Short key (e.g. kimodo-soma-rp) for use with load_model / MODEL_NAMES.

    Raises:
        ValueError: If name cannot be resolved or default_family is missing when needed.
    """
    if name is not None:
        name = name.strip()
    if not name:
        if default_family is None:
            raise ValueError('Model name is empty; provide a name or set default_family ("Kimodo" or "TMR").')
        fam = _normalize_family(default_family)
        if fam is None:
            raise ValueError(f"default_family must be 'Kimodo' or 'TMR', got {default_family!r}")
        info = _get_latest_for_family_skeleton_dataset(fam, "SOMA", "RP")
        if info is None:
            raise ValueError(f"No model found for {fam}-SOMA-RP. Available: {list(MODEL_NAMES.keys())}")
        return info.short_key

    # Exact short_key
    if name in MODEL_NAMES:
        return name

    # Case-insensitive match against short_key or display_name
    name_lower = name.lower()
    matches = []
    for info in MODEL_INFOS:
        if name_lower == info.short_key.lower():
            matches.append(info)
        disp = info.display_name.lower()
        if name_lower == disp or name_lower == ("nvidia/" + disp):
            matches.append(info)
    if matches:
        # A name can match both its short_key and its display_name; take the first hit.
        return matches[0].short_key

    # Parsed full form: Family-Skeleton-Dataset or Family-Skeleton-Dataset-vN
    m = _RESOLVE_FULL_PATTERN.match(name)
    if m:
        fam_raw, skel_raw, ds_raw, ver_num = m.groups()
        fam = _normalize_family(fam_raw)
        skel = _normalize_skeleton(skel_raw)
        ds = _normalize_dataset(ds_raw)
        if fam is not None and skel is not None and ds is not None:
            if ver_num is not None:
                version = f"v{ver_num}"
                for info in MODEL_INFOS:
                    if info.family == fam and info.skeleton == skel and info.dataset == ds and info.version == version:
                        return info.short_key
            else:
                info = _get_latest_for_family_skeleton_dataset(fam, skel, ds)
                if info is not None:
                    return info.short_key

    # Parsed partial form: Skeleton-Dataset, Skeleton, or Dataset (use default_family)
    if default_family is not None:
        m = _RESOLVE_PARTIAL_PATTERN.match(name)
        if m:
            tok1, ds_raw, ver_num = m.groups()
            fam = _normalize_family(default_family)
            if fam is not None:
                skel = _normalize_skeleton(tok1)
                ds_candidate = _normalize_dataset(ds_raw) if ds_raw else None
                if skel is not None and ds_candidate is not None:
                    ds = ds_candidate
                elif skel is not None:
                    ds = "RP"
                else:
                    skel = "SOMA"
                    ds = _normalize_dataset(tok1) if tok1 else "RP"
                    if ds is None:
                        ds = "RP"
                if ver_num is not None:
                    version = f"v{ver_num}"
                    for info in MODEL_INFOS:
                        if (
                            info.family == fam
                            and info.skeleton == skel
                            and info.dataset == ds
                            and info.version == version
                        ):
                            return info.short_key
                else:
                    info = _get_latest_for_family_skeleton_dataset(fam, skel, ds)
                    if info is not None:
                        return info.short_key

        # Single token: skeleton or dataset
        fam = _normalize_family(default_family)
        if fam is not None:
            skel = _normalize_skeleton(name)
            if skel is not None:
                info = _get_latest_for_family_skeleton_dataset(fam, skel, "RP")
                if info is not None:
                    return info.short_key
            ds = _normalize_dataset(name)
            if ds is not None:
                info = _get_latest_for_family_skeleton_dataset(fam, "SOMA", ds)
                if info is not None:
                    return info.short_key

    raise ValueError(
        f"Model name {name!r} could not be resolved. "
        f"Use a short key (e.g. {list(MODEL_NAMES.keys())[:3]}...), "
        "a full name (e.g. Kimodo-SOMA-RP-v1), or a partial name (e.g. SOMA-RP, SOMA) "
        "with default_family set."
    )
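For reference, a minimal usage sketch of the resolver above. The short key shown is illustrative; the concrete keys depend on what MODEL_INFOS registers.

# All of these calls are intended to resolve to the same short key,
# assuming a Kimodo-SOMA-RP-v1 entry exists in the registry.
from kimodo.model.registry import resolve_model_name

resolve_model_name("Kimodo-SOMA-RP-v1")              # full name with explicit version
resolve_model_name("kimodo-soma-rp")                 # exact short key
resolve_model_name("SOMA", default_family="Kimodo")  # partial: dataset defaults to RP
resolve_model_name(None, default_family="Kimodo")    # empty: defaults to SOMA-RP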
kimodo/model/text_encoder_api.py
ADDED
@@ -0,0 +1,74 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Remote text encoder API client (Gradio) for motion generation."""

import logging
import uuid

import numpy as np
import torch
from gradio_client import Client

# Suppress the [httpx] logs (GET requests)
logging.getLogger("httpx").setLevel(logging.WARNING)

# Suppress internal gradio_client logs
logging.getLogger("gradio_client").setLevel(logging.WARNING)


class TextEncoderAPI:
    """Text encoder API client for motion generation."""

    def __init__(self, url: str):
        self.client = Client(url, verbose=False)
        self.device = "cpu"
        self.dtype = torch.float

    def _create_np_random_name(self):
        return str(uuid.uuid4()) + ".npy"

    def to(self, device=None, dtype=None):
        if device is not None:
            self.device = device
        if dtype is not None:
            self.dtype = dtype
        return self

    def __call__(self, texts):
        """Encode text prompts into tensors.

        Args:
            texts (str | list[str]): text prompts to encode

        Returns:
            tuple[torch.Tensor, list[int]]: encoded text tensors and their lengths
        """
        if isinstance(texts, str):
            texts = [texts]

        tensors = []
        lengths = []
        for text in texts:
            filename = self._create_np_random_name()

            # Use a long result timeout to tolerate text-encoder cold start
            # (the LLM2Vec model load takes ~60-120s).
            result = self.client.submit(
                text=text,
                filename=filename,
                api_name="/DemoWrapper",
            ).result(timeout=300)
            path = result[0]["value"]
            tensor = np.load(path)
            length = tensor.shape[0]

            tensors.append(tensor)
            lengths.append(length)

        padded_tensor = np.zeros((len(lengths), max(lengths), tensors[0].shape[-1]), dtype=tensors[0].dtype)
        for idx, (tensor, length) in enumerate(zip(tensors, lengths)):
            padded_tensor[idx, :length] = tensor

        padded_tensor = torch.from_numpy(padded_tensor)
        padded_tensor = padded_tensor.to(device=self.device, dtype=self.dtype)
        return padded_tensor, lengths
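A minimal usage sketch for the client above. The URL is a placeholder; the remote Space must expose the /DemoWrapper endpoint the class calls.

import torch
from kimodo.model.text_encoder_api import TextEncoderAPI

encoder = TextEncoderAPI("https://example-text-encoder.hf.space")  # hypothetical URL
encoder.to(device="cpu", dtype=torch.float)
feats, lengths = encoder(["a person walks forward", "a person jumps"])
# feats: [2, max_len, llm_dim] zero-padded tensor; lengths: per-prompt token counts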
kimodo/model/tmr.py
ADDED
@@ -0,0 +1,382 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""TMR model: encoders and text-to-motion retrieval head."""

import contextlib
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import torch
import torch.nn as nn
from einops import repeat
from torch import Tensor

from kimodo.model import load_checkpoint_state_dict
from kimodo.motion_rep.feature_utils import length_to_mask
from kimodo.sanitize import sanitize_texts
from kimodo.skeleton import SkeletonBase, build_skeleton
from kimodo.tools import ensure_batched


class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding for sequences (batch_first optional)."""

    def __init__(self, d_model, dropout=0.1, max_len=5000, batch_first=False) -> None:
        super().__init__()
        self.batch_first = batch_first

        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # Note: torch.exp() and math.log() are replaced with torch.pow()
        # because MKL exp()/ln() throw floating point exceptions on certain CPUs.
        div_term = torch.pow(10000.0, -torch.arange(0, d_model, 2).float() / d_model)

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pe", pe, persistent=False)

    def forward(self, x: Tensor) -> Tensor:
        if self.batch_first:
            x = x + self.pe.permute(1, 0, 2)[:, : x.shape[1], :]
        else:
            x = x + self.pe[: x.shape[0], :]
        return self.dropout(x)


def load_ckpt(self, ckpt_path):
    """Load model weights from a checkpoint path (module-level helper; ``self`` is the module)."""
    state_dict = load_checkpoint_state_dict(ckpt_path)
    self.load_state_dict(state_dict)


class ACTORStyleEncoder(nn.Module):
    """Motion encoder in ACTOR style: optional motion_rep projection, VAE/MLP tokens, transformer."""

    def __init__(
        self,
        motion_rep: Optional[nn.Module],
        llm_shape: Optional[Tuple],
        vae: bool,
        latent_dim: int = 256,
        ff_size: int = 1024,
        num_layers: int = 4,
        num_heads: int = 4,
        dropout: float = 0.1,
        activation: str = "gelu",
        ckpt_path: Optional[str] = None,
    ) -> None:
        super().__init__()

        self.motion_rep = motion_rep
        if motion_rep is not None and llm_shape is None:
            nfeats = motion_rep.motion_rep_dim
        elif motion_rep is None and llm_shape is not None:
            nfeats = llm_shape[-1]
        else:
            raise ValueError("Provide exactly one of motion_rep or llm_shape.")

        self.nfeats = nfeats
        self.projection = nn.Linear(nfeats, latent_dim)

        self.vae = vae
        self.nbtokens = 2 if vae else 1
        self.tokens = nn.Parameter(torch.randn(self.nbtokens, latent_dim))

        self.sequence_pos_encoding = PositionalEncoding(latent_dim, dropout=dropout, batch_first=True)

        seq_trans_encoder_layer = nn.TransformerEncoderLayer(
            d_model=latent_dim,
            nhead=num_heads,
            dim_feedforward=ff_size,
            dropout=dropout,
            activation=activation,
            batch_first=True,
        )

        self.seqTransEncoder = nn.TransformerEncoder(
            seq_trans_encoder_layer,
            num_layers=num_layers,
            enable_nested_tensor=False,
        )

        if ckpt_path is not None:
            load_ckpt(self, ckpt_path)

    def forward(self, x_dict: Dict) -> Tensor:
        x = x_dict["x"]
        mask = x_dict["mask"]

        x = self.projection(x)

        device = x.device
        bs = len(x)

        tokens = repeat(self.tokens, "nbtoken dim -> bs nbtoken dim", bs=bs)
        xseq = torch.cat((tokens, x), 1)

        token_mask = torch.ones((bs, self.nbtokens), dtype=bool, device=device)
        aug_mask = torch.cat((token_mask, mask), 1)

        # add positional encoding
        xseq = self.sequence_pos_encoding(xseq)
        final = self.seqTransEncoder(xseq, src_key_padding_mask=~aug_mask)
        return final[:, : self.nbtokens]


class TMR(nn.Module):
    r"""TMR: Text-to-Motion Retrieval inference code (no decoder).

    Find more information about the model on the following website:
    https://mathis.petrovich.fr/tmr
    """

    @classmethod
    def from_args(
        cls,
        motion_rep: nn.Module,
        llm_shape: tuple | list,
        vae: bool,
        latent_dim: int = 256,
        ff_size: int = 1024,
        num_layers: int = 4,
        num_heads: int = 4,
        dropout: float = 0.1,
        activation: str = "gelu",
        ckpt_folder: Optional[str] = None,
        device: Optional[str] = None,
        **kwargs,
    ):
        motion_encoder = ACTORStyleEncoder(
            motion_rep=motion_rep,
            llm_shape=None,
            vae=vae,
            latent_dim=latent_dim,
            ff_size=ff_size,
            num_layers=num_layers,
            num_heads=num_heads,
            dropout=dropout,
            activation=activation,
            ckpt_path=Path(ckpt_folder) / "motion_encoder.pt",
        ).to(device)

        top_text_encoder = ACTORStyleEncoder(
            motion_rep=None,
            llm_shape=llm_shape,
            vae=vae,
            latent_dim=latent_dim,
            ff_size=ff_size,
            num_layers=num_layers,
            num_heads=num_heads,
            dropout=dropout,
            activation=activation,
            ckpt_path=Path(ckpt_folder) / "text_encoder.pt",
        ).to(device)
        return cls(
            motion_encoder,
            top_text_encoder,
            vae,
            device=device,
            **kwargs,
        )

    def __init__(
        self,
        motion_encoder: nn.Module,
        top_text_encoder: nn.Module,
        vae: bool,
        text_encoder: Optional[nn.Module] = None,
        fact: Optional[float] = None,
        sample_mean: Optional[bool] = True,
        unit_vector: Optional[bool] = False,
        compute_grads: bool = False,
        device: Optional[str] = None,
    ) -> None:
        super().__init__()

        self.motion_encoder = motion_encoder
        self.text_encoder = top_text_encoder
        self.raw_text_encoder = text_encoder

        self.motion_rep = None
        self.skeleton = None
        if self.motion_encoder is not None:
            self.motion_rep = self.motion_encoder.motion_rep
            if self.motion_rep is not None:
                self.skeleton = self.motion_rep.skeleton

        self.compute_grads = compute_grads

        self.device = device

        # sampling parameters
        self.vae = vae
        self.fact = fact if fact is not None else 1.0
        self.sample_mean = sample_mean
        self.unit_vector = unit_vector

    def full_text_encoder(self, texts: list[str]):
        assert isinstance(texts, list), "The input should be batched."
        # Sanitize the texts first, embed them with the raw text encoder,
        # then pass the embeddings through the top text encoder.
        texts = sanitize_texts(texts)
        text_feat, text_length = self.raw_text_encoder(texts)
        if isinstance(text_length, list):
            text_length = torch.tensor(text_length, device=self.device)
        else:
            text_length = text_length.to(self.device)
        inputs = {
            "x": text_feat.to(self.device),
            "mask": length_to_mask(text_length, device=self.device),
        }
        return self.text_encoder(inputs)

    def _find_encoder(self, inputs, modality):
        assert modality in ["text", "motion", "raw_text", "auto"]

        if modality == "text":
            return self.text_encoder
        elif modality == "motion":
            return self.motion_encoder
        elif modality == "raw_text":
            return self.full_text_encoder

        # raw text inputs arrive as a list of strings; dict inputs are resolved by feature size
        if isinstance(inputs, (list, tuple)) and isinstance(inputs[0], str):
            return self.full_text_encoder

        m_nfeats = self.motion_encoder.nfeats
        t_nfeats = self.text_encoder.nfeats

        if m_nfeats == t_nfeats:
            raise ValueError("Cannot automatically find the encoder, as they share the same input space.")

        nfeats = inputs["x"].shape[-1]
        if nfeats == m_nfeats:
            return self.motion_encoder
        elif nfeats == t_nfeats:
            return self.text_encoder
        else:
            raise ValueError("The input is not recognized.")

    def _encode(
        self,
        inputs,
        modality: str = "auto",
        sample_mean: Optional[bool] = None,
        fact: Optional[float] = None,
        return_distribution: bool = False,
        unit_vector: Optional[bool] = None,
    ):
        sample_mean = self.sample_mean if sample_mean is None else sample_mean
        fact = self.fact if fact is None else fact
        unit_vector = self.unit_vector if unit_vector is None else unit_vector

        # Encode the inputs
        encoder = self._find_encoder(inputs, modality)
        encoded = encoder(inputs)

        # Sampling
        if self.vae:
            dists = encoded.unbind(1)
            mu, logvar = dists
            if sample_mean:
                latent_vectors = mu
            else:
                # Reparameterization trick
                std = logvar.exp().pow(0.5)
                eps = std.data.new(std.size()).normal_()
                latent_vectors = mu + fact * eps * std
        else:
            dists = None
            (latent_vectors,) = encoded.unbind(1)

        if unit_vector:
            latent_vectors = torch.nn.functional.normalize(latent_vectors, dim=-1)

        if return_distribution:
            return latent_vectors, dists

        return latent_vectors

    @ensure_batched(posed_joints=4, lengths=1)
    def encode_motion(
        self,
        posed_joints: torch.Tensor,
        original_skeleton: Optional[SkeletonBase] = None,
        lengths: Optional[torch.Tensor] = None,
        unit_vector: Optional[bool] = None,
    ):
        convert_ctx = torch.no_grad() if not self.compute_grads else contextlib.nullcontext()

        if original_skeleton is None:
            original_skeleton = build_skeleton(posed_joints.shape[-2])

        if lengths is None:
            nbatch, nbframes = posed_joints.shape[:2]
            device = posed_joints.device
            assert nbatch == 1, "If lengths is not provided, the input should not be batched."
            lengths = torch.tensor([nbframes], device=device)

        # slice the posed joints if we use fewer joints
        skel_slice = self.motion_rep.skeleton.get_skel_slice(original_skeleton)
        posed_joints = posed_joints[..., skel_slice, :]

        with convert_ctx:
            features = self.motion_rep(
                posed_joints=posed_joints,
                to_normalize=True,
                lengths=lengths,
            )
            mask = length_to_mask(lengths, device=features.device)
            x_dict = {"x": features, "mask": mask}
            latent_vectors = self._encode(
                x_dict,
                modality="motion",
                unit_vector=unit_vector,
            )
        return latent_vectors

    def encode_text(
        self,
        x_dict: Dict,
        unit_vector: Optional[bool] = None,
    ):
        # TODO: make it ensure batched
        convert_ctx = torch.no_grad() if not self.compute_grads else contextlib.nullcontext()

        with convert_ctx:
            latent_vectors = self._encode(
                x_dict,
                modality="text",
                unit_vector=unit_vector,
            )
        return latent_vectors

    def encode_raw_text(
        self,
        texts: List[str],
        unit_vector: Optional[bool] = None,
    ):
        is_batched = True
        if isinstance(texts, str):
            is_batched = False
            texts = [texts]

        convert_ctx = torch.no_grad() if not self.compute_grads else contextlib.nullcontext()

        with convert_ctx:
            latent_vectors = self._encode(
                texts,
                modality="raw_text",
                unit_vector=unit_vector,
            )
        if not is_batched:
            latent_vectors = latent_vectors[0]
        return latent_vectors
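A hedged retrieval sketch using the encoders above. It assumes `tmr` is a constructed TMR instance with a raw text encoder attached, and that `joints` and `lengths` are a batched [B, T, J, 3] joint tensor with per-sequence lengths.

text_latent = tmr.encode_raw_text("a person waves", unit_vector=True)          # [latent_dim]
motion_latents = tmr.encode_motion(joints, lengths=lengths, unit_vector=True)  # [B, latent_dim]
scores = motion_latents @ text_latent  # cosine similarities, since both sides are unit vectors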
kimodo/model/twostage_denoiser.py
ADDED
@@ -0,0 +1,153 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Two-stage transformer denoiser: root stage then body stage for motion diffusion."""

import contextlib
from typing import Optional

import torch
from torch import nn

from .backbone import TransformerEncoderBlock
from .loading import load_checkpoint_state_dict


class TwostageDenoiser(nn.Module):
    """Two-stage denoiser: first predicts global root features, then body features conditioned on the local root."""

    def __init__(
        self,
        motion_rep,
        motion_mask_mode,
        ckpt_path: Optional[str] = None,
        **kwargs,
    ):
        """Build root and body transformer blocks; optionally load a checkpoint from ckpt_path."""
        super().__init__()
        self.motion_rep = motion_rep
        self.motion_mask_mode = motion_mask_mode

        # motion_rep should be a dual representation, global by default:
        # the denoiser takes the global motion_rep as input.
        input_dim = motion_rep.motion_rep_dim
        will_concatenate = motion_mask_mode == "concat"

        # stage 1: root only
        root_input_dim = input_dim * 2 if will_concatenate else input_dim
        root_output_dim = motion_rep.global_root_dim

        self.root_model = TransformerEncoderBlock(
            input_dim=root_input_dim,
            output_dim=root_output_dim,
            skeleton=self.motion_rep.skeleton,
            **kwargs,
        )

        # replace the global root by the local root
        local_motion_rep_dim = input_dim - motion_rep.global_root_dim + motion_rep.local_root_dim

        # stage 2: local body
        # The body stage always takes local root info for the motion (but still the global mask).
        body_input_dim = local_motion_rep_dim + (input_dim if will_concatenate else 0)

        body_output_dim = input_dim - motion_rep.global_root_dim
        self.body_model = TransformerEncoderBlock(
            input_dim=body_input_dim,
            output_dim=body_output_dim,
            skeleton=self.motion_rep.skeleton,
            **kwargs,
        )

        if ckpt_path:
            self.load_ckpt(ckpt_path)

    def load_ckpt(self, ckpt_path: str) -> None:
        """Load a checkpoint from path; state dict keys are stripped of the 'denoiser.backbone.' prefix."""
        state_dict = load_checkpoint_state_dict(ckpt_path)
        state_dict = {key.replace("denoiser.backbone.", ""): val for key, val in state_dict.items()}
        self.load_state_dict(state_dict)

    def forward(
        self,
        x: torch.Tensor,
        x_pad_mask: torch.Tensor,
        text_feat: torch.Tensor,
        text_feat_pad_mask: torch.Tensor,
        timesteps: torch.Tensor,
        first_heading_angle: Optional[torch.Tensor] = None,
        motion_mask: Optional[torch.Tensor] = None,
        observed_motion: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): [B, T, dim_motion] current noisy motion
            x_pad_mask (torch.Tensor): [B, T] attention mask; True positions may attend, False may not
            text_feat (torch.Tensor): [B, max_text_len, llm_dim] embedded text prompts
            text_feat_pad_mask (torch.Tensor): [B, max_text_len] attention mask; True positions may attend, False may not
            timesteps (torch.Tensor): [B,] current denoising step
            first_heading_angle (Optional[torch.Tensor]): [B,] heading angle of the first frame
            motion_mask (Optional[torch.Tensor]): [B, T, dim_motion] mask marking observed features (1 = observed)
            observed_motion (Optional[torch.Tensor]): [B, T, dim_motion] observed motion features to inpaint around

        Returns:
            torch.Tensor: same size as input x
        """

        if self.motion_mask_mode == "concat":
            if motion_mask is None or observed_motion is None:
                motion_mask = torch.zeros_like(x)
                observed_motion = torch.zeros_like(x)
            x = x * (1 - motion_mask) + observed_motion * motion_mask
            x_extended = torch.cat([x, motion_mask], axis=-1)
        else:
            x_extended = x

        # Stage 1: predict root motion in global coordinates
        root_motion_pred = self.root_model(
            x_extended,
            x_pad_mask,
            text_feat,
            text_feat_pad_mask,
            timesteps,
            first_heading_angle,
        )  # [B, T, 5]

        # Maybe pass this as an argument instead of recomputing it
        lengths = x_pad_mask.sum(-1)

        # Convert the root prediction to the local representation.
        # At test time we want to allow gradients through for guidance.
        convert_ctx = torch.no_grad() if self.training else contextlib.nullcontext()
        with convert_ctx:
            root_motion_local = self.motion_rep.global_root_to_local_root(
                root_motion_pred,
                normalized=True,
                lengths=lengths,
            )
        if self.training:
            root_motion_local = root_motion_local.detach()

        # concatenate the predicted local root with the body motion
        body_x = x[..., self.motion_rep.body_slice]
        x_new = torch.cat([root_motion_local, body_x], axis=-1)

        if self.motion_mask_mode == "concat":
            x_new_extended = torch.cat([x_new, motion_mask], axis=-1)
        else:
            x_new_extended = x_new

        # Stage 2: predict local body motion based on the local root
        predicted_body = self.body_model(
            x_new_extended,
            x_pad_mask,
            text_feat,
            text_feat_pad_mask,
            timesteps,
            first_heading_angle,
        )

        # concatenate the predicted global root with the predicted local body
        output = torch.cat([root_motion_pred, predicted_body], axis=-1)
        return output
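To make the dimension bookkeeping above concrete, a sketch of the tensor shapes through one forward pass in "concat" mode. The sizes are illustrative; the real ones come from the motion_rep.

# Suppose input_dim = 128, global_root_dim = 5, local_root_dim = 4.
# x:                 [B, T, 128]
# x_extended:        [B, T, 256]  (x concatenated with motion_mask)
# root_motion_pred:  [B, T, 5]    (stage 1: global root)
# root_motion_local: [B, T, 4]    (converted to the local root rep)
# x_new:             [B, T, 127]  (4 local root + 123 body)
# x_new_extended:    [B, T, 255]  (x_new concatenated with motion_mask)
# predicted_body:    [B, T, 123]  (stage 2: local body)
# output:            [B, T, 128]  (5 global root + 123 body)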
kimodo/motion_rep/__init__.py
ADDED
@@ -0,0 +1,11 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Motion representation utilities."""

from .reps import KimodoMotionRep, MotionRepBase, TMRMotionRep

__all__ = [
    "MotionRepBase",
    "KimodoMotionRep",
    "TMRMotionRep",
]
kimodo/motion_rep/conditioning.py
ADDED
@@ -0,0 +1,28 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Constraint conditioning: build index and data dicts from constraint sets for the denoiser."""

from collections import defaultdict

import torch


def build_condition_dicts(constraints_lst: list):
    """Collect per-constraint indices and data into two defaultdicts."""
    index_dict = defaultdict(list)
    data_dict = defaultdict(list)
    for constraint in constraints_lst:
        constraint.update_constraints(data_dict, index_dict)
    return index_dict, data_dict


def get_unique_index_and_data(indices_lst, data):
    """Deduplicate (t, j) index rows, keeping one data value per unique row."""
    # unique + sort the rows by t
    indices_unique, inverse = torch.unique(indices_lst, dim=0, return_inverse=True)
    # pick one representative row index for each unique (t, j);
    # with duplicate indices, scatter_ write order decides which one survives
    first_idx = torch.zeros(indices_unique.size(0), dtype=torch.long, device=inverse.device)
    first_idx.scatter_(0, inverse, torch.arange(len(inverse), device=inverse.device))
    assert (indices_lst[first_idx] == indices_unique).all()
    # gather the data
    indices_lst = indices_lst[first_idx]
    data = data[first_idx]
    return indices_lst, data
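A small illustrative call showing the deduplication behavior (values are made up):

import torch
from kimodo.motion_rep.conditioning import get_unique_index_and_data

idx = torch.tensor([[0, 3], [5, 1], [0, 3]])  # (t, j) rows; first and last collide
val = torch.tensor([1.0, 2.0, 3.0])
uniq_idx, uniq_val = get_unique_index_and_data(idx, val)
# uniq_idx: [[0, 3], [5, 1]] (torch.unique sorts rows); uniq_val keeps one value
# per unique row; which duplicate survives is a scatter_ write-order detail.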
kimodo/motion_rep/feature_utils.py
ADDED
@@ -0,0 +1,212 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Motion representation helpers: velocity, heading, masks, and rotation of features."""

from typing import List, Optional, Union

import einops
import torch

from kimodo.geometry import cont6d_to_matrix, matrix_to_cont6d
from kimodo.skeleton import SkeletonBase
from kimodo.tools import ensure_batched


def diff_angles(angles: torch.Tensor, fps: float) -> torch.Tensor:
    """Compute frame-to-frame angular differences in radians, scaled by fps.

    Args:
        angles: [..., T] batched sequences of rotation angles in radians.
        fps: Sampling rate used to convert frame differences to a per-second rate.

    Returns:
        [..., T-1] differences between consecutive angles (rad/s).
    """

    cos = torch.cos(angles)
    sin = torch.sin(angles)

    cos_diff = cos[..., 1:] * cos[..., :-1] + sin[..., 1:] * sin[..., :-1]
    sin_diff = sin[..., 1:] * cos[..., :-1] - cos[..., 1:] * sin[..., :-1]

    # close to angles.diff() but robust to angle wrap-around;
    # multiply by fps = 1 / dt
    angles_diff = fps * torch.arctan2(sin_diff, cos_diff)
    return angles_diff


@ensure_batched(positions=4, lengths=1)
def compute_vel_xyz(
    positions: torch.Tensor,
    fps: float,
    lengths: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Compute the velocities from positions: dx/dt. Works with batches.
    The last velocity is duplicated to keep the same size.

    Args:
        positions (torch.Tensor): [..., T, J, 3] xyz positions of a human skeleton
        fps (float): frames per second
        lengths (Optional[torch.Tensor]): [...] size of each batched input. If not provided, positions should not be batched

    Returns:
        velocity (torch.Tensor): [..., T, J, 3] velocities computed from the positions
    """
    device = positions.device

    if lengths is None:
        assert positions.shape[0] == 1, "If lengths is not provided, the input should not be batched."
        # the decorator has already added the batch dimension, so the
        # sequence length is dim 1, not len(positions)
        lengths = torch.tensor([positions.shape[1]], device=device)

    # useful for indexing
    range_len = torch.arange(len(lengths))

    # compute velocities with fps
    velocity = fps * (positions[:, 1:] - positions[:, :-1])
    # pad the velocity vector
    vel_pad = torch.zeros_like(velocity[:, 0])
    velocity, _ = einops.pack([velocity, vel_pad], "batch * nbjoints dim")

    # repeat the last velocities,
    # with special care for different lengths within batches
    velocity[(range_len, lengths - 1)] = velocity[(range_len, lengths - 2)]
    return velocity


@ensure_batched(root_rot_angles=2, lengths=1)
def compute_vel_angle(
    root_rot_angles: torch.Tensor,
    fps: float,
    lengths: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Compute the local root rotation velocity: dtheta/dt.

    Args:
        root_rot_angles (torch.Tensor): [..., T] rotation angles (in radians)
        fps (float): frames per second
        lengths (Optional[torch.Tensor]): [...] size of each batched input. If not provided, root_rot_angles should not be batched

    Returns:
        local_root_rot_vel (torch.Tensor): [..., T] local root rotation velocity (in radians/s)
    """
    device = root_rot_angles.device
    if lengths is None:
        assert root_rot_angles.shape[0] == 1, "If lengths is not provided, the input should not be batched."
        # as above, the decorator has already added the batch dimension
        lengths = torch.tensor([root_rot_angles.shape[1]], device=device)

    # useful for indexing
    range_len = torch.arange(len(lengths))

    local_root_rot_vel = diff_angles(root_rot_angles, fps)
    pad_rot_vel_angles = torch.zeros_like(root_rot_angles[:, 0])
    local_root_rot_vel, _ = einops.pack(
        [local_root_rot_vel, pad_rot_vel_angles],
        "batch *",
    )
    # repeat the last rotation velocity,
    # with special care for different lengths within batches
    local_root_rot_vel[(range_len, lengths - 1)] = local_root_rot_vel[(range_len, lengths - 2)]
    return local_root_rot_vel


@ensure_batched(posed_joints=4)
def compute_heading_angle(posed_joints: torch.Tensor, skeleton: SkeletonBase) -> torch.Tensor:
    """Compute the heading direction from joint positions using the hip vector.

    Args:
        posed_joints: [B, T, J, 3] global joint positions.
        skeleton: Skeleton instance used to get hip joint indices.

    Returns:
        [B, T] heading angle in radians.
    """
    # compute root heading for the sequence from hip positions
    r_hip, l_hip = skeleton.hip_joint_idx
    diff = posed_joints[:, :, r_hip] - posed_joints[:, :, l_hip]
    heading_angle = torch.atan2(diff[..., 2], -diff[..., 0])
    return heading_angle


def length_to_mask(
    length: Union[torch.Tensor, List],
    max_len: Optional[int] = None,
    device=None,
) -> torch.Tensor:
    """Convert sequence lengths to a boolean validity mask.

    Args:
        length: Sequence lengths, either a tensor ``[B]`` or a Python list.
        max_len: Optional mask width. If omitted, uses ``max(length)``.
        device: Optional device. When ``length`` is a list, this controls where
            the new tensor is created.

    Returns:
        A boolean tensor of shape ``[B, max_len]`` where ``True`` marks valid
        timesteps.
    """
    if isinstance(length, list):
        if device is None:
            device = "cpu"
        length = torch.tensor(length, device=device)

    # Use the requested device for the output; move length if needed so mask and length match
    if device is not None:
        target = torch.device(device)
        if length.device != target:
            length = length.to(target)
    device = length.device

    if max_len is None:
        max_len = max(length)

    mask = torch.arange(max_len, device=device).expand(len(length), max_len) < length.unsqueeze(1)
    return mask


class RotateFeatures:
    """Helper that applies a global heading rotation to motion features."""

    def __init__(self, angle: torch.Tensor):
        """Precompute 2D and 3D rotation matrices for a batch of angles.

        Args:
            angle: Rotation angle(s) in radians, shaped ``[B]``.
        """
        self.angle = angle

        # Create the necessary rotation matrices
        cos, sin = torch.cos(angle), torch.sin(angle)
        one, zero = torch.ones_like(angle), torch.zeros_like(angle)

        # 2D rotation, transposed (the sin terms are negated)
        self.corrective_mat_2d_T = torch.stack((cos, sin, -sin, cos), -1).reshape(angle.shape + (2, 2))
        # 3D rotation around the Y axis
        self.corrective_mat_Y = torch.stack((cos, zero, sin, zero, one, zero, -sin, zero, cos), -1).reshape(
            angle.shape + (3, 3)
        )
        self.corrective_mat_Y_T = self.corrective_mat_Y.transpose(1, 2).contiguous()

    def rotate_positions(self, positions: torch.Tensor):
        """Rotate 3D positions around the Y axis."""
        return positions @ self.corrective_mat_Y_T

    def rotate_2d_positions(self, positions_2d: torch.Tensor):
        """Rotate 2D ``(x, z)`` vectors in the ground plane."""
        return positions_2d @ self.corrective_mat_2d_T

    def rotate_rotations(self, rotations: torch.Tensor):
        """Left-multiply global rotation matrices by the heading correction."""
        # "Rotating" the global rotations means adding an extra Y rotation
        # after the transform, i.e. on the left: R' = R_y R
        # (since we use the convention x' = R x).
        # "bik,btdkj->btdij"
        B, T, J = rotations.shape[:3]
        BTJ = B * T * J
        return (
            self.corrective_mat_Y[:, None, None].expand(B, T, J, 3, 3).reshape(BTJ, 3, 3) @ rotations.reshape(BTJ, 3, 3)
        ).reshape(B, T, J, 3, 3)

    def rotate_6d_rotations(self, rotations_6d: torch.Tensor):
        """Rotate 6D rotation features via matrix conversion."""
        return matrix_to_cont6d(self.rotate_rotations(cont6d_to_matrix(rotations_6d)))
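A quick illustration of length_to_mask:

import torch
from kimodo.motion_rep.feature_utils import length_to_mask

mask = length_to_mask([3, 1], device="cpu")
# tensor([[ True,  True,  True],
#         [ True, False, False]])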
kimodo/motion_rep/feet.py
ADDED
@@ -0,0 +1,60 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Foot contact detection from joint positions and velocities."""

import torch

from ..tools import ensure_batched


@ensure_batched(positions=4, velocity=4)
def foot_detect_from_pos_and_vel(
    positions: torch.Tensor,
    velocity: torch.Tensor,
    skeleton,
    vel_thres: float,
    height_thresh: float,
) -> torch.Tensor:
    """Compute foot contact labels using heuristics that combine joint height and velocity.

    Args:
        positions (torch.Tensor): [X, T, J, 3] global joint positions
        velocity (torch.Tensor): [X, T, J, 3] velocities (already padded correctly and multiplied by 1 / dt)
        skeleton: skeleton providing the left/right foot joint indices
        vel_thres (float): threshold on joint velocity
        height_thresh (float): threshold on joint height

    Returns:
        torch.Tensor: [X, T, 4] contact labels for left and right foot joints
            (heel/toe order follows the skeleton joint index definition), where
            ``1`` denotes contact.
    """

    device = positions.device
    # Use at most 2 foot joints per side (ankle + toe); SOMA77 defines a
    # third end-effector (ToeEnd) that SOMA30 and other skeletons omit.
    fid_l = skeleton.left_foot_joint_idx[:2]
    fid_r = skeleton.right_foot_joint_idx[:2]

    velfactor, heightfactor = (
        torch.tensor([vel_thres, vel_thres], device=device),
        torch.tensor([height_thresh, height_thresh], device=device),
    )

    feet_l_v = torch.linalg.norm(velocity[:, :, fid_l], axis=-1)
    feet_l_h = positions[:, :, fid_l, 1]

    feet_l = torch.logical_and(
        feet_l_v < velfactor,
        feet_l_h < heightfactor,
    ).to(positions.dtype)

    feet_r_v = torch.linalg.norm(velocity[:, :, fid_r], axis=-1)
    feet_r_h = positions[:, :, fid_r, 1]

    feet_r = torch.logical_and(
        feet_r_v < velfactor,
        feet_r_h < heightfactor,
    ).to(positions.dtype)

    foot_contacts = torch.cat((feet_l, feet_r), axis=-1)
    return foot_contacts
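A hedged sketch combining the velocity helper with the contact detector. The threshold values below are illustrative placeholders, not the ones the pipeline ships with.

from kimodo.motion_rep.feature_utils import compute_vel_xyz
from kimodo.motion_rep.feet import foot_detect_from_pos_and_vel

vel = compute_vel_xyz(joints, fps=30.0, lengths=lengths)  # joints: [B, T, J, 3]
contacts = foot_detect_from_pos_and_vel(
    joints, vel, skeleton, vel_thres=0.3, height_thresh=0.08  # example thresholds
)  # [B, T, 4] binary labels: left heel/toe, right heel/toe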
kimodo/motion_rep/reps/__init__.py
ADDED
@@ -0,0 +1,13 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Motion representation implementations: base, Kimodo, and TMR."""

from .base import MotionRepBase
from .kimodo_motionrep import KimodoMotionRep
from .tmr_motionrep import TMRMotionRep

__all__ = [
    "MotionRepBase",
    "KimodoMotionRep",
    "TMRMotionRep",
]
|
kimodo/motion_rep/reps/base.py
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Base motion representation: feature layout, normalization, and conditioning helpers."""

import os
from typing import Optional

import einops
import numpy as np
import torch
from einops import repeat

from ...tools import ensure_batched
from ..conditioning import build_condition_dicts
from ..feature_utils import compute_vel_angle, compute_vel_xyz
from ..stats import Stats


def _require_split_stats_layout(stats_path: str) -> None:
    """Raise if stats_path does not contain the required global_root, local_root, body subdirs."""
    subdirs = ("global_root", "local_root", "body")
    missing = []
    for name in subdirs:
        subpath = os.path.join(stats_path, name)
        mean_path = os.path.join(subpath, "mean.npy")
        if not os.path.isfile(mean_path):
            missing.append(f"{subpath}/ (mean.npy)")
    if missing:
        raise FileNotFoundError(
            f"Checkpoint stats must use the split layout with subfolders "
            f"global_root/, local_root/, and body/ under '{stats_path}'. "
            f"Missing or incomplete: {', '.join(missing)}."
        )


class MotionRepBase:
    """Base class for motion representations used in generation and conditioning.

    Subclasses define:
    - ``size_dict``: feature blocks and their shapes,
    - ``last_root_feature``: last entry of the root block,
    - ``local_root_size_dict``: local-root feature layout,
    and implement transform-specific methods such as ``__call__``, ``inverse``,
    ``rotate``, ``translate_2d`` and ``create_conditions``.
    """

    def __init__(
        self,
        skeleton,
        fps,
        stats_path: Optional[str] = None,
    ):
        """Initialize feature slicing metadata and optional normalization stats."""

        self.skeleton = skeleton
        self.fps = fps
        self.nbjoints = skeleton.nbjoints

        self.feature_names = list(self.size_dict.keys())
        self.ps = list(self.size_dict.values())
        self.nfeats_dict = {key: val.numel() for key, val in self.size_dict.items()}
        feats_cumsum = np.cumsum([0] + list(self.nfeats_dict.values())).tolist()
        self.slice_dict = {key: slice(feats_cumsum[i], feats_cumsum[i + 1]) for i, key in enumerate(self.feature_names)}

        self.motion_rep_dim = sum(self.nfeats_dict.values())
        self.root_slice = slice(0, self.slice_dict[self.last_root_feature].stop)
        self.body_slice = slice(self.root_slice.stop, self.motion_rep_dim)
        self.body_dim = self.body_slice.stop - self.body_slice.start
        self.global_root_dim = self.root_slice.stop
        self.local_root_dim = sum(val.numel() for val in self.local_root_size_dict.values())

        if stats_path:
            _require_split_stats_layout(stats_path)
            self.global_root_stats = Stats(os.path.join(stats_path, "global_root"))
            self.local_root_stats = Stats(os.path.join(stats_path, "local_root"))
            self.body_stats = Stats(os.path.join(stats_path, "body"))
            # self.stats is intentionally not set; normalize/unnormalize apply per-part stats below

    def get_root_pos(self, features: torch.Tensor, fallback_to_smooth: bool = True):
        """Extract root positions from a feature tensor.

        Supports both ``root_pos`` and ``smooth_root_pos`` representations.
        """
        if "root_pos" in self.slice_dict:
            return features[..., self.slice_dict["root_pos"]]

        if "smooth_root_pos" not in self.slice_dict:
            raise TypeError("This motion rep should have either a root_pos or smooth_root_pos field")

        if fallback_to_smooth:
            return features[..., self.slice_dict["smooth_root_pos"]]

        # else compute the root pos from the smooth root and the local joint offsets
        smooth_root_pos = features[..., self.slice_dict["smooth_root_pos"]].clone()
        local_joints_positions_flatten = features[..., self.slice_dict["local_joints_positions"]]
        hips_offset = local_joints_positions_flatten[..., self.skeleton.root_idx : self.skeleton.root_idx + 3]
        root_pos = torch.stack(
            [
                smooth_root_pos[..., 0] + hips_offset[..., 0],
                smooth_root_pos[..., 1],
                smooth_root_pos[..., 2] + hips_offset[..., 2],
            ],
            axis=-1,
        )
        return root_pos

    @ensure_batched(root_features=3, lengths=1)
    def global_root_to_local_root(
        self,
        root_features: torch.Tensor,
        normalized: bool,
        lengths: Optional[torch.Tensor],
    ):
        """Convert global root features to local-root motion features.

        Args:
            root_features: Root feature tensor containing root position and
                global heading, shaped ``[B, T, D_root]``.
            normalized: Whether ``root_features`` are normalized.
            lengths: Optional valid lengths per sequence.

        Returns:
            Tensor ``[B, T, 4]`` with local root rotational velocity, planar
            velocity, and global root height.
        """
        if normalized:
            root_features = self.global_root_stats.unnormalize(root_features)

        [root_pos, global_root_heading] = einops.unpack(root_features, self.ps[:2], "batch time *")
        cos, sin = global_root_heading.unbind(-1)
        heading_angle = torch.arctan2(sin, cos)

        local_root_rot_vel = compute_vel_angle(heading_angle, self.fps, lengths=lengths)
        local_root_vel = compute_vel_xyz(
            root_pos[..., None, :],
            self.fps,
            lengths=lengths,
        )[..., 0, [0, 2]]
        global_root_y = root_pos[..., 1]
        local_root_motion = torch.cat(
            [
                local_root_rot_vel[..., None],
                local_root_vel,
                global_root_y[..., None],
            ],
            axis=-1,
        )

        if normalized:
            local_root_motion = self.local_root_stats.normalize(local_root_motion)
        return local_root_motion

    def get_root_heading_angle(self, features: torch.Tensor) -> torch.Tensor:
        """Compute the root heading angle from cosine/sine heading features."""
        global_root_heading = features[..., self.slice_dict["global_root_heading"]]
        cos, sin = global_root_heading.unbind(-1)
        return torch.arctan2(sin, cos)

    @ensure_batched(features=3)
    def rotate_to(
        self,
        features: torch.Tensor,
        target_angle: torch.Tensor,
        return_delta_angle=False,
    ):
        """Rotate each sequence so the frame-0 heading matches ``target_angle``."""
        current_first_angle = self.get_root_heading_angle(features)[:, 0]
        delta_angle = target_angle - current_first_angle
        rotated_features = self.rotate(features, delta_angle)
        if return_delta_angle:
            return rotated_features, delta_angle
        return rotated_features

    @ensure_batched(features=3)
    def rotate_to_zero(
        self,
        features: torch.Tensor,
        return_delta_angle=False,
    ):
        """Rotate each sequence so the frame-0 heading becomes zero."""
        target_angle = torch.zeros(len(features), device=features.device)
        return self.rotate_to(features, target_angle, return_delta_angle=return_delta_angle)

    @ensure_batched(features=3)
    def randomize_first_heading(
        self,
        features: torch.Tensor,
        return_delta_angle=False,
    ) -> torch.Tensor:
        """Rotate each sequence to a random frame-0 heading."""
        # create the random target on the same device as the features
        target_heading_angle = torch.rand(features.shape[0], device=features.device) * 2 * np.pi
        return self.rotate_to(
            features,
            target_heading_angle,
            return_delta_angle=return_delta_angle,
        )

    @ensure_batched(features=3, target_2d_pos=2)
    def translate_2d_to(
        self,
        features: torch.Tensor,
        target_2d_pos: torch.Tensor,
        return_delta_pos: bool = False,
    ) -> torch.Tensor:
        """Translate each sequence so the frame-0 root ``(x, z)`` matches a target."""
        root_pos = self.get_root_pos(features)
        current_first_2d_pos = root_pos[:, 0, [0, 2]].clone()
        delta_2d_pos = target_2d_pos - current_first_2d_pos
        translated_features = self.translate_2d(features, delta_2d_pos)
        if return_delta_pos:
            return translated_features, delta_2d_pos
        return translated_features

    @ensure_batched(features=3)
    def translate_2d_to_zero(
        self,
        features: torch.Tensor,
        return_delta_pos: bool = False,
    ) -> torch.Tensor:
        """Translate each sequence so the frame-0 root ``(x, z)`` is at the origin."""
        target_2d_pos = torch.zeros(len(features), 2, device=features.device)
        return self.translate_2d_to(features, target_2d_pos, return_delta_pos=return_delta_pos)

    @ensure_batched(features=3)
    def canonicalize(self, features: torch.Tensor):
        """Canonicalize heading and planar position at frame 0."""
        rotated_features = self.rotate_to_zero(features)
        return self.translate_2d_to_zero(rotated_features)

    def normalize(self, features):
        """Normalize features using per-part stats (global_root, local_root, body)."""
        gr = slice(0, self.global_root_dim)
        lr = slice(self.global_root_dim, self.global_root_dim + self.local_root_dim)
        out = torch.empty_like(features, device=features.device, dtype=features.dtype)
        out[..., gr] = self.global_root_stats.normalize(features[..., gr])
        out[..., lr] = self.local_root_stats.normalize(features[..., lr])
        out[..., self.body_slice] = self.body_stats.normalize(features[..., self.body_slice])
        return out

    def unnormalize(self, features):
        """Undo feature normalization using per-part stats."""
        gr = slice(0, self.global_root_dim)
        lr = slice(self.global_root_dim, self.global_root_dim + self.local_root_dim)
        out = torch.empty_like(features, device=features.device, dtype=features.dtype)
        out[..., gr] = self.global_root_stats.unnormalize(features[..., gr])
        out[..., lr] = self.local_root_stats.unnormalize(features[..., lr])
        out[..., self.body_slice] = self.body_stats.unnormalize(features[..., self.body_slice])
        return out

    def create_conditions_from_constraints(
        self,
        constraints_lst: list,
|
| 255 |
+
length: int,
|
| 256 |
+
to_normalize: bool,
|
| 257 |
+
device: str,
|
| 258 |
+
):
|
| 259 |
+
"""Create a conditioning tensor and mask from constraint objects."""
|
| 260 |
+
index_dict, data_dict = build_condition_dicts(constraints_lst)
|
| 261 |
+
return self.create_conditions(index_dict, data_dict, length, to_normalize, device)
|
| 262 |
+
|
| 263 |
+
def create_conditions_from_constraints_batched(
|
| 264 |
+
self,
|
| 265 |
+
constraints_lst: list | list[list],
|
| 266 |
+
lengths: torch.Tensor,
|
| 267 |
+
to_normalize: bool,
|
| 268 |
+
device: str,
|
| 269 |
+
):
|
| 270 |
+
"""Batched version of ``create_conditions_from_constraints``.
|
| 271 |
+
|
| 272 |
+
Supports either one shared constraint list for all batch elements, or a per-sample list of
|
| 273 |
+
constraint lists.
|
| 274 |
+
"""
|
| 275 |
+
num_samples = len(lengths)
|
| 276 |
+
if not constraints_lst or not isinstance(constraints_lst[0], list):
|
| 277 |
+
# If no constraints, or constraints are shared across the batch,
|
| 278 |
+
# build once and repeat.
|
| 279 |
+
observed_motion, motion_mask = self.create_conditions_from_constraints(
|
| 280 |
+
constraints_lst, int(lengths.max()), to_normalize, device
|
| 281 |
+
)
|
| 282 |
+
observed_motion = repeat(observed_motion, "t d -> b t d", b=num_samples)
|
| 283 |
+
motion_mask = repeat(motion_mask, "t d -> b t d", b=num_samples)
|
| 284 |
+
return observed_motion, motion_mask
|
| 285 |
+
|
| 286 |
+
length = int(lengths.max())
|
| 287 |
+
observed_motion_lst = []
|
| 288 |
+
motion_mask_lst = []
|
| 289 |
+
for constraints_lst_el in constraints_lst:
|
| 290 |
+
observed_motion, motion_mask = self.create_conditions_from_constraints(
|
| 291 |
+
constraints_lst_el,
|
| 292 |
+
length,
|
| 293 |
+
to_normalize,
|
| 294 |
+
device,
|
| 295 |
+
)
|
| 296 |
+
observed_motion_lst.append(observed_motion)
|
| 297 |
+
motion_mask_lst.append(motion_mask)
|
| 298 |
+
observed_motion = torch.stack(observed_motion_lst, axis=0)
|
| 299 |
+
motion_mask = torch.stack(motion_mask_lst, axis=0)
|
| 300 |
+
return observed_motion, motion_mask
|
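The canonicalization and conditioning helpers above compose into a short preprocessing path. The sketch below illustrates the intended call pattern only; it assumes `rep` is an already-constructed motion-rep instance with loaded stats and that `constraints` is a list of constraint objects understood by `build_condition_dicts`, so it is not a standalone script.

import torch

# features: [B, T, D] unnormalized motion features produced by rep(...).
# Canonicalize: frame-0 heading to zero, then frame-0 root (x, z) to origin.
features = rep.canonicalize(features)

# Optional augmentation: give each sequence a random initial heading.
features, delta = rep.randomize_first_heading(features, return_delta_angle=True)

# Build sparse observation targets and a mask from one shared constraint
# list; the helper builds a single [T, D] result and repeats it per sample.
lengths = torch.tensor([120, 90, 150])
observed_motion, motion_mask = rep.create_conditions_from_constraints_batched(
    constraints, lengths, to_normalize=True, device="cpu"
)
# observed_motion: [B, T_max, D] targets; motion_mask: [B, T_max, D] bool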
kimodo/motion_rep/reps/kimodo_motionrep.py ADDED
@@ -0,0 +1,301 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from typing import Optional

import einops
import torch
from torch import Tensor

from kimodo.tools import to_numpy

from ...geometry import cont6d_to_matrix, matrix_to_cont6d
from ...skeleton.kinematics import fk
from ...skeleton.transforms import global_rots_to_local_rots
from ...tools import ensure_batched
from ..conditioning import get_unique_index_and_data
from ..feature_utils import RotateFeatures, compute_heading_angle, compute_vel_xyz
from ..feet import foot_detect_from_pos_and_vel
from ..smooth_root import get_smooth_root_pos
from .base import MotionRepBase


class KimodoMotionRep(MotionRepBase):
    """Global root / global joints rotations representation, relative to a smooth root."""

    def __init__(
        self,
        skeleton,
        fps,
        stats_path: Optional[str] = None,
    ):
        nbjoints = skeleton.nbjoints

        self.size_dict = {
            "smooth_root_pos": torch.Size([3]),
            "global_root_heading": torch.Size([2]),
            "local_joints_positions": torch.Size([nbjoints, 3]),
            "global_rot_data": torch.Size([nbjoints, 6]),
            "velocities": torch.Size([nbjoints, 3]),
            "foot_contacts": torch.Size([4]),
        }
        self.last_root_feature = "global_root_heading"
        self.local_root_size_dict = {
            "local_root_rot_vel": torch.Size([1]),
            "local_root_vel": torch.Size([2]),
            "global_root_y": torch.Size([1]),
        }
        super().__init__(skeleton, fps, stats_path)

    @ensure_batched(local_joint_rots=5, root_positions=3, lengths=1)
    def __call__(
        self,
        local_joint_rots: torch.Tensor,
        root_positions: torch.Tensor,
        to_normalize: bool,
        lengths: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Convert local rotations and root trajectory into smooth-root features.

        Args:
            local_joint_rots: Local joint rotation matrices ``[B, T, J, 3, 3]``.
            root_positions: Root positions ``[B, T, 3]``.
            to_normalize: Whether to normalize output features.
            lengths: Optional valid lengths for variable-length batches.

        Returns:
            Motion features with shape ``[B, T, motion_rep_dim]``.
        """
        device = local_joint_rots.device
        if lengths is None:
            assert local_joint_rots.shape[0] == 1, "If lengths is not provided, the input should not be batched."
            lengths = torch.tensor([local_joint_rots.shape[1]], device=device)

        (
            global_joints_rots,
            global_joints_positions,
            local_joints_positions_origin_is_pelvis,
        ) = fk(local_joint_rots, root_positions, self.skeleton)

        root_heading_angle = compute_heading_angle(global_joints_positions, self.skeleton)
        global_root_heading = torch.stack([torch.cos(root_heading_angle), torch.sin(root_heading_angle)], dim=-1)

        smooth_root_pos = get_smooth_root_pos(root_positions)
        hips_offset = root_positions - smooth_root_pos
        hips_offset[..., 1] = root_positions[..., 1]
        local_joints_positions = local_joints_positions_origin_is_pelvis + hips_offset[:, :, None]

        velocities = compute_vel_xyz(global_joints_positions, self.fps, lengths=lengths)
        foot_contacts = foot_detect_from_pos_and_vel(global_joints_positions, velocities, self.skeleton, 0.15, 0.10)
        global_rot_data = matrix_to_cont6d(global_joints_rots)

        features, _ = einops.pack(
            [
                smooth_root_pos,
                global_root_heading,
                local_joints_positions,
                global_rot_data,
                velocities,
                foot_contacts,
            ],
            "batch time *",
        )

        if to_normalize:
            features = self.normalize(features)
        return features

    @ensure_batched(features=3, angle=1)
    def rotate(self, features: torch.Tensor, angle: torch.Tensor):
        """Rotate root/joint positional and rotational features by heading."""
        # assume it is not normalized
        bs = features.shape[0]
        device = features.device
        [
            smooth_root_pos,
            global_root_heading,
            local_joints_positions,
            global_rot_data,
            velocities,
            foot_contacts,
        ] = einops.unpack(features, self.ps, "batch time *")

        if not isinstance(angle, torch.Tensor):
            angle = torch.tensor(angle, device=device)
        if len(angle.shape) == 0:
            angle = angle.repeat(bs)

        RF = RotateFeatures(angle)
        new_features, _ = einops.pack(
            [
                RF.rotate_positions(smooth_root_pos),
                RF.rotate_2d_positions(global_root_heading),
                RF.rotate_positions(local_joints_positions),
                RF.rotate_6d_rotations(global_rot_data),
                RF.rotate_positions(velocities),
                foot_contacts,
            ],
            "batch time *",
        )
        return new_features

    @ensure_batched(features=3, translation_2d=2)
    def translate_2d(
        self,
        features: torch.Tensor,
        translation_2d: torch.Tensor,
    ) -> torch.Tensor:
        """Translate smooth root planar position by ``(dx, dz)``."""
        # only move on the ground
        # If we need a translate_3D function, we should not forget to move the local_joints_positions as well
        bs = features.shape[0]
        if len(translation_2d.shape) == 1:
            translation_2d = translation_2d.repeat(bs, 1)

        new_features = features.clone()
        new_smooth_root_pos = new_features[:, :, self.slice_dict["smooth_root_pos"]]
        new_smooth_root_pos[:, :, 0] += translation_2d[:, [0]]
        new_smooth_root_pos[:, :, 2] += translation_2d[:, [1]]
        return new_features

    @ensure_batched(features=3)
    def inverse(
        self,
        features: torch.Tensor,
        is_normalized: bool,
        posed_joints_from="rotations",
        return_numpy: bool = False,
    ) -> torch.Tensor:
        """Decode smooth-root features into motion tensors."""
        assert posed_joints_from in [
            "rotations",
            "positions",
        ], "posed_joints_from should be 'rotations' or 'positions'"

        if is_normalized:
            features = self.unnormalize(features)

        [
            smooth_root_pos,
            global_root_heading,
            local_joints_positions,
            global_rot_data,
            velocities,
            foot_contacts,
        ] = einops.unpack(features, self.ps, "batch time *")

        global_rot_mats = cont6d_to_matrix(global_rot_data)
        local_rot_mats = global_rots_to_local_rots(global_rot_mats, self.skeleton)

        posed_joints_from_pos = local_joints_positions.clone()
        posed_joints_from_pos[..., 0] += smooth_root_pos[..., None, 0]
        posed_joints_from_pos[..., 2] += smooth_root_pos[..., None, 2]
        root_positions = posed_joints_from_pos[..., self.skeleton.root_idx, :]
        foot_contacts = foot_contacts > 0.5

        if posed_joints_from == "rotations":
            _, posed_joints, _ = self.skeleton.fk(
                local_rot_mats,
                root_positions,
            )
        else:
            posed_joints = posed_joints_from_pos

        output_tensor_dict = {
            "local_rot_mats": local_rot_mats,
            "global_rot_mats": global_rot_mats,
            "posed_joints": posed_joints,
            "root_positions": root_positions,
            "smooth_root_pos": smooth_root_pos,
            "foot_contacts": foot_contacts,
            "global_root_heading": global_root_heading,
        }
        if return_numpy:
            return to_numpy(output_tensor_dict)
        return output_tensor_dict

    def create_conditions(
        self,
        index_dict: dict[str, list[Tensor]],
        data_dict: dict[str, list[Tensor]],
        length: int,
        to_normalize: bool,
        device: str,
    ):
        """Build sparse conditioning tensors for smooth-root representation."""
        # create empty features and mask to be filled in
        observed_motion = torch.zeros(length, self.motion_rep_dim, device=device)
        motion_mask = torch.zeros(length, self.motion_rep_dim, dtype=bool, device=device)

        def _cat_indices(indices_list: list[Tensor]) -> Tensor:
            indices = torch.cat([torch.tensor(x) if not isinstance(x, Tensor) else x for x in indices_list])
            return indices.to(device=device, dtype=torch.long)

        def _match_obs_dtype(tensor: Tensor) -> Tensor:
            return tensor.to(device=device, dtype=observed_motion.dtype)

        if (fname := "smooth_root_2d") in index_dict and index_dict[fname]:
            indices = _cat_indices(index_dict[fname])
            indices, smooth_root_2d = get_unique_index_and_data(indices, torch.cat(data_dict[fname]))
            smooth_root_2d = _match_obs_dtype(smooth_root_2d)
            f_sliced = observed_motion[:, self.slice_dict["smooth_root_pos"]]
            f_sliced[indices, 0] = smooth_root_2d[:, 0]
            f_sliced[indices, 2] = smooth_root_2d[:, 1]
            m_sliced = motion_mask[:, self.slice_dict["smooth_root_pos"]]
            m_sliced[indices, 0] = True
            m_sliced[indices, 2] = True

        if (fname := "root_y_pos") in index_dict and index_dict[fname]:
            indices = _cat_indices(index_dict[fname])
            indices, root_pos_Y = get_unique_index_and_data(indices, torch.cat(data_dict[fname]))
            root_pos_Y = _match_obs_dtype(root_pos_Y)
            f_sliced = observed_motion[:, self.slice_dict["smooth_root_pos"]]
            f_sliced[indices, 1] = root_pos_Y
            m_sliced = motion_mask[:, self.slice_dict["smooth_root_pos"]]
            m_sliced[indices, 1] = True

        if (fname := "global_root_heading") in index_dict and index_dict[fname]:
            indices = _cat_indices(index_dict[fname])
            indices, global_root_heading = get_unique_index_and_data(indices, torch.cat(data_dict[fname]))
            global_root_heading = _match_obs_dtype(global_root_heading)
            f_sliced = observed_motion[:, self.slice_dict[fname]]
            f_sliced[indices] = global_root_heading
            m_sliced = motion_mask[:, self.slice_dict[fname]]
            m_sliced[indices] = True

        if (fname := "global_joints_rots") in index_dict and index_dict[fname]:
            indices_lst = _cat_indices(index_dict[fname])
            indices_lst, global_joints_rots = get_unique_index_and_data(indices_lst, torch.cat(data_dict[fname]))
            global_joints_rots = _match_obs_dtype(global_joints_rots)
            global_rot_data = matrix_to_cont6d(global_joints_rots)
            f_sliced = observed_motion[:, self.slice_dict["global_rot_data"]]
            masking = torch.zeros(len(f_sliced) * self.nbjoints, 6, device=device, dtype=bool)
            masking[indices_lst.T[0] * self.nbjoints + indices_lst.T[1]] = True
            masking = masking.reshape(len(f_sliced), self.nbjoints * 6)
            f_sliced[masking] = global_rot_data.flatten()
            m_sliced = motion_mask[:, self.slice_dict["global_rot_data"]]
            m_sliced[masking] = True

        if (fname := "global_joints_positions") in index_dict and index_dict[fname]:
            indices_lst = _cat_indices(index_dict[fname])
            indices_lst, global_joints_positions = get_unique_index_and_data(indices_lst, torch.cat(data_dict[fname]))
            global_joints_positions = _match_obs_dtype(global_joints_positions)
            T_indices = indices_lst[:, 0].contiguous()
            _test = motion_mask[T_indices, self.slice_dict["smooth_root_pos"]]
            if not _test[:, [0, 2]].all():
                raise ValueError("For constraining global positions, the smooth root should also be constrained.")
            smooth_root_pos = observed_motion[T_indices, self.slice_dict["smooth_root_pos"]].clone()
            local_reference = smooth_root_pos.clone()
            local_reference[..., 1] = 0.0
            local_joints_positions = global_joints_positions - local_reference
            f_sliced = observed_motion[:, self.slice_dict["local_joints_positions"]]
            masking = torch.zeros(len(f_sliced) * self.nbjoints, 3, device=device, dtype=bool)
            masking[indices_lst.T[0] * self.nbjoints + indices_lst.T[1]] = True
            masking = masking.reshape(len(f_sliced), self.nbjoints * 3)
            f_sliced[masking] = local_joints_positions.flatten()
            m_sliced = motion_mask[:, self.slice_dict["local_joints_positions"]]
            m_sliced[masking] = True

        if to_normalize:
            observed_motion = self.normalize(observed_motion)
        return observed_motion, motion_mask
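A round-trip sketch for the representation above. It is hedged on a `skeleton` object loaded elsewhere (only `nbjoints`, `root_idx`, and `fk` are used); the rest-pose inputs are illustrative, not from the commit.

import torch
from kimodo.motion_rep.reps.kimodo_motionrep import KimodoMotionRep

rep = KimodoMotionRep(skeleton, fps=30)  # skeleton assumed loaded elsewhere

B, T, J = 1, 120, skeleton.nbjoints
local_joint_rots = torch.eye(3).expand(B, T, J, 3, 3).contiguous()  # rest pose
root_positions = torch.zeros(B, T, 3)

# Encode: FK -> heading -> smooth root -> packed [B, T, motion_rep_dim].
features = rep(local_joint_rots, root_positions, to_normalize=False)

# Decode: posed joints can come from the 6D rotations or the stored positions.
out = rep.inverse(features, is_normalized=False, posed_joints_from="rotations")
posed_joints = out["posed_joints"]  # [B, T, J, 3]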
kimodo/motion_rep/reps/tmr_motionrep.py ADDED
@@ -0,0 +1,222 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""TMR motion representation: global root, global joints, velocities, and foot contacts."""

from typing import Optional

import einops
import torch

from ...skeleton.kinematics import fk
from ...tools import ensure_batched, to_numpy
from ..feature_utils import RotateFeatures, compute_heading_angle, compute_vel_xyz
from ..feet import foot_detect_from_pos_and_vel
from .base import MotionRepBase


class TMRMotionRep(MotionRepBase):
    """Motion representation with global root and global joint positions.

    Feature layout:
    - root position ``(x, y, z)``
    - root heading as ``(cos(theta), sin(theta))``
    - local joint positions (root removed, ground-referenced)
    - global joint velocities
    - binary foot contacts
    """

    def __init__(
        self,
        skeleton,
        fps,
        stats_path: Optional[str] = None,
    ):
        nbjoints = skeleton.nbjoints

        self.size_dict = {
            "root_pos": torch.Size([3]),
            "global_root_heading": torch.Size([2]),
            "local_joints_positions": torch.Size([nbjoints - 1, 3]),
            "velocities": torch.Size([nbjoints, 3]),
            "foot_contacts": torch.Size([4]),
        }
        self.last_root_feature = "global_root_heading"
        self.local_root_size_dict = {
            "local_root_rot_vel": torch.Size([1]),
            "local_root_vel": torch.Size([2]),
            "global_root_y": torch.Size([1]),
        }
        super().__init__(skeleton, fps, stats_path)

    @ensure_batched(local_joint_rots=5, root_positions=3, posed_joints=4, lengths=1)
    def __call__(
        self,
        local_joint_rots: Optional[torch.Tensor] = None,
        root_positions: Optional[torch.Tensor] = None,
        posed_joints: Optional[torch.Tensor] = None,
        *,
        to_normalize: bool,
        lengths: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Convert motion inputs to this feature representation.

        Args:
            local_joint_rots: Local joint rotation matrices ``[B, T, J, 3, 3]``.
                Required when ``posed_joints`` is not provided.
            root_positions: Root translations ``[B, T, 3]``. Required when
                ``posed_joints`` is not provided.
            posed_joints: Optional precomputed global joint positions
                ``[B, T, J, 3]``. If passed, FK is skipped.
            to_normalize: Whether to normalize output features.
            lengths: Optional valid lengths for variable-length batches.

        Returns:
            Motion features with shape ``[B, T, motion_rep_dim]``.
        """
        if posed_joints is not None:
            device = posed_joints.device
            nbatch, nbframes, nbjoints = posed_joints.shape[:3]
        else:
            device = local_joint_rots.device
            nbatch, nbframes, nbjoints = local_joint_rots.shape[:3]

        if lengths is None:
            assert nbatch == 1, "If lengths is not provided, the input should not be batched."
            lengths = torch.tensor([nbframes], device=device)

        if posed_joints is None:
            _, global_positions, local_joints_positions_origin_is_pelvis = fk(
                local_joint_rots, root_positions, self.skeleton
            )
        else:
            global_positions = posed_joints
            root_positions = posed_joints[:, :, 0]
            local_joints_positions_origin_is_pelvis = posed_joints - root_positions[:, :, None]

        root_heading_angle = compute_heading_angle(global_positions, self.skeleton)
        global_root_heading = torch.stack([torch.cos(root_heading_angle), torch.sin(root_heading_angle)], dim=-1)

        ground_offset = 0 * root_positions
        ground_offset[..., 1] = root_positions[..., 1]
        local_joints_positions = local_joints_positions_origin_is_pelvis[:, :, 1:] + ground_offset[:, :, None]
        velocities = compute_vel_xyz(global_positions, self.fps, lengths=lengths)
        foot_contacts = foot_detect_from_pos_and_vel(global_positions, velocities, self.skeleton, 0.15, 0.10)

        features, _ = einops.pack(
            [
                root_positions,
                global_root_heading,
                local_joints_positions,
                velocities,
                foot_contacts,
            ],
            "batch time *",
        )

        if to_normalize:
            features = self.normalize(features)
        return features

    @ensure_batched(features=3, angle=1)
    def rotate(self, features: torch.Tensor, angle: torch.Tensor):
        """Rotate all spatial features by a heading delta (radians)."""
        # rotate by the angle, i.e. add the delta to the current heading
        # assume it is not normalized
        bs = features.shape[0]
        device = features.device
        [
            root_pos,
            global_root_heading,
            local_joints_positions,
            velocities,
            foot_contacts,
        ] = einops.unpack(features, self.ps, "batch time *")

        if not isinstance(angle, torch.Tensor):
            angle = torch.tensor(angle, device=device)
        if len(angle.shape) == 0:
            angle = angle.repeat(bs)

        RF = RotateFeatures(angle)
        new_features, _ = einops.pack(
            [
                RF.rotate_positions(root_pos),
                RF.rotate_2d_positions(global_root_heading),
                RF.rotate_positions(local_joints_positions),
                RF.rotate_positions(velocities),
                foot_contacts,
            ],
            "batch time *",
        )
        return new_features

    @ensure_batched(features=3, translation_2d=2)
    def translate_2d(
        self,
        features: torch.Tensor,
        translation_2d: torch.Tensor,
    ) -> torch.Tensor:
        """Translate root planar position by ``(dx, dz)``."""
        # only move on the ground
        # For 3D, we should not forget to move the local_joints_positions as well
        bs = features.shape[0]
        if len(translation_2d.shape) == 1:
            translation_2d = translation_2d.repeat(bs, 1)

        new_features = features.clone()
        new_root_pos = new_features[:, :, self.slice_dict["root_pos"]]
        new_root_pos[:, :, 0] += translation_2d[:, [0]]
        new_root_pos[:, :, 2] += translation_2d[:, [1]]
        return new_features

    @ensure_batched(features=3)
    def inverse(
        self,
        features: torch.Tensor,
        is_normalized: bool,
        posed_joints_from="positions",
        return_numpy: bool = False,
    ) -> torch.Tensor:
        """Decode features back to a motion dictionary.

        Args:
            features: Feature tensor ``[B, T, D]``.
            is_normalized: Whether input features are normalized.
            posed_joints_from: Must be ``"positions"`` for this representation.
            return_numpy: Whether to convert tensors to numpy arrays.

        Returns:
            Dictionary containing reconstructed positions and auxiliary data.
        """
        assert posed_joints_from == "positions"
        if is_normalized:
            features = self.unnormalize(features)

        [
            root_positions,
            global_root_heading,
            local_joints_positions,
            velocities,
            foot_contacts,
        ] = einops.unpack(features, self.ps, "batch time *")

        dummy_root = 0 * local_joints_positions[:, :, [0]]
        posed_joints_from_pos = torch.cat([dummy_root, local_joints_positions], dim=2)
        posed_joints_from_pos[..., 0] += root_positions[..., None, 0]
        posed_joints_from_pos[..., 2] += root_positions[..., None, 2]
        root_positions = posed_joints_from_pos[..., self.skeleton.root_idx, :]
        foot_contacts = foot_contacts > 0.5
        posed_joints = posed_joints_from_pos

        output_tensor_dict = {
            "local_rot_mats": None,
            "global_rot_mats": None,
            "posed_joints": posed_joints,
            "root_positions": root_positions,
            "foot_contacts": foot_contacts,
            "global_root_heading": global_root_heading,
        }
        if return_numpy:
            return to_numpy(output_tensor_dict)
        return output_tensor_dict
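The ``posed_joints`` fast path above skips FK entirely, which is convenient when only joint positions are available (e.g. for TMR-style evaluation). A hedged sketch, again assuming a `skeleton` object obtained elsewhere and illustrative random inputs:

import torch
from kimodo.motion_rep.reps.tmr_motionrep import TMRMotionRep

rep = TMRMotionRep(skeleton, fps=30)  # skeleton assumed loaded elsewhere

posed_joints = torch.randn(1, 120, skeleton.nbjoints, 3)  # [B, T, J, 3]
features = rep(posed_joints=posed_joints, to_normalize=False)  # FK skipped

# This representation decodes from positions only; no rotations are stored,
# so "local_rot_mats"/"global_rot_mats" come back as None.
out = rep.inverse(features, is_normalized=False, posed_joints_from="positions")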
kimodo/motion_rep/smooth_root.py ADDED
@@ -0,0 +1,234 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Smooth root trajectory: ADMM-based smoother with margin constraints and get_smooth_root_pos helper."""

import math

import numpy as np
import torch
from scipy import sparse
from scipy.sparse.linalg import splu

from kimodo.tools import ensure_batched


class TrajectorySmoother:
    """Modify trajectories to hit target values while respecting soft constraints.

    This smoother keeps the trajectory close to the original positions while minimizing
    accelerations. Targets are enforced at specified frames via soft constraints.
    """

    def __init__(
        self,
        margins,
        pos_weight=0.0,
        loop=False,
        admm_iters=100,
        alpha_overrelax=1.0,
        circle_project=False,
    ):
        """Initialize the TrajectorySmoother.

        Args:
            margins: Array of margin values for each frame.
                margins[i] < 0: unconstrained
                margins[i] == 0: pinned on this frame
                margins[i] > 0: can deviate within the margin
            pos_weight: Weight for position preservation
            loop: Whether the trajectory should loop
            admm_iters: Number of ADMM iterations
            alpha_overrelax: ADMM over-relaxation coefficient
            circle_project: If True, project each iterate to the unit sphere
        """
        self.pos_weight = pos_weight
        self.admm_iters = admm_iters
        self.alpha_overrelax = alpha_overrelax
        self.circle_project = circle_project
        N = len(margins)

        # Store margin information as numpy arrays
        self.margin_vals = margins

        # Build acceleration matrix A
        a_data = []
        a_rows = []
        a_cols = []

        for i in range(1, N - 1):
            scale = 1.0
            a_data.extend([-scale, 2.0 * scale, -scale])
            a_rows.extend([i, i, i])
            a_cols.extend([i - 1, i, i + 1])

        if loop:
            # Add periodic accelerations
            scale = 1.0
            a_data.extend([-scale, 2.0 * scale, -scale])
            a_rows.extend([0, 0, 0])
            a_cols.extend([N - 1, 0, 1])

            scale = 1.0
            a_data.extend([-scale, 2.0 * scale, -scale])
            a_rows.extend([N - 1, N - 1, N - 1])
            a_cols.extend([N - 2, N - 1, 0])

        A = sparse.csr_matrix((a_data, (a_rows, a_cols)), shape=(N, N))

        # Build identity matrix
        identity_matrix = sparse.eye(N)

        # Build system matrix M
        M = pos_weight * identity_matrix + A.T @ A

        # Calculate ADMM step size
        diag_max = max(abs(M.diagonal()))
        self.admm_stepsize = 0.25 * np.sqrt(diag_max)

        M = M + self.admm_stepsize * identity_matrix
        self.system_lu = splu(M.tocsc())

    def smooth(self, targets, x0):
        """Interpolate between reference positions while satisfying constraints.

        Args:
            targets: Target positions for constrained frames (numpy array)
            x0: Initial guess defining the original shape (numpy array)

        Returns:
            Interpolated positions (numpy array)
        """
        x_target = targets.copy()
        x = x0.copy()
        z = np.zeros_like(x)
        u = np.zeros_like(x)

        for _ in range(self.admm_iters):
            self.z_update(z, x, x_target, u)
            self.u_update(u, x, z)
            self.x_update(x, z, u, x_target)

        return x

    def x_update(self, x, z, u, x_t):
        """Update x in the ADMM iteration."""
        # x = (wp * I + A^T A + p I)^-1 (wp * x_orig + p (z - u))
        r = self.pos_weight * x_t + self.admm_stepsize * (z - u)
        x[:] = self.system_lu.solve(r)

    def z_update(self, z, x, z_t, u):
        """Update z in the ADMM iteration using vectorized operations."""
        # Compute the difference from target for all margin locations at once
        z[:] = x + u - z_t

        # Check if we need to project back to margin
        z_diff_norms = np.linalg.norm(z, axis=1)
        mask = z_diff_norms > self.margin_vals
        if np.any(mask):
            scale_factors = self.margin_vals[mask] / z_diff_norms[mask]
            z[mask] *= scale_factors[:, np.newaxis]

        # Add back the target
        z[:] += z_t

        if self.circle_project:
            z[:] = z / (np.linalg.norm(z, axis=1, keepdims=True) + 1.0e-6)

    def u_update(self, u, x, z):
        """Update u in the ADMM iteration using vectorized operations."""
        u[:] += self.alpha_overrelax * (x - z)


def smooth_signal(x, margins, pos_weight=0, alpha_overrelax=1.8, admm_iters=500, circle_project=False):
    """Multigrid trajectory smoothing with margin constraints.

    Args:
        x: Input trajectory ``[T, D]`` as a NumPy array.
        margins: Allowed radius around each target frame ``[T]``.
        pos_weight: Weight for staying close to the original signal.
        alpha_overrelax: ADMM over-relaxation coefficient.
        admm_iters: ADMM iterations per multigrid level.
        circle_project: If ``True``, project each vector to the unit sphere.

    Returns:
        Smoothed trajectory of shape ``[T, D]``.
    """
    x_smoothed = x.copy()
    x_smoothed[:] = x.mean(axis=0, keepdims=True)

    # smooth the signal, multigrid style by starting out coarse,
    # doubling the resolution and repeating until we're at the full
    # resolution, using the previous result as the initial guess.
    levels = int(math.floor(math.log2(len(x))))
    levels = max(levels - 4, 1)

    stepsize = 2**levels
    while True:
        # smooth signals at this level:
        num_steps = len(x_smoothed[::stepsize])
        smoother = TrajectorySmoother(
            margins=margins[::stepsize],
            pos_weight=pos_weight,
            alpha_overrelax=alpha_overrelax,
            admm_iters=admm_iters,
            circle_project=circle_project,
        )
        x_smoothed[::stepsize] = smoother.smooth(x[::stepsize], x_smoothed[::stepsize])

        # interpolate to next level:
        next_stepsize = stepsize // 2
        num_interleaved = len(x_smoothed[next_stepsize::stepsize])
        if num_interleaved == num_steps:
            # linearly extrapolate the last value if we have to:
            x_smoothed[next_stepsize::stepsize][-1] = (
                x_smoothed[::stepsize][-1] + (x_smoothed[::stepsize][-1] - x_smoothed[::stepsize][-2]) / 2
            )
            num_interleaved = num_interleaved - 1

        # linearly interpolate the remaining values:
        x_smoothed[next_stepsize::stepsize][:num_interleaved] = (
            x_smoothed[::stepsize][:-1] + x_smoothed[::stepsize][1:]
        ) / 2

        if stepsize == 1:
            break

        stepsize //= 2

    return x_smoothed


@ensure_batched(hip_translations=3)
def get_smooth_root_pos(hip_translations):
    """Smooth root trajectory in the ground plane while preserving height.

    Args:
        hip_translations: Root translations ``[B, T, 3]``.

    Returns:
        Smoothed root translations ``[B, T, 3]`` where ``x/z`` are smoothed and
        ``y`` remains unchanged.
    """
    root_translations_xz = hip_translations[..., [0, 2]]
    root_translations_y = hip_translations[..., [1]]

    batch_size, nframes = root_translations_xz.shape[:2]
    margins = np.full(root_translations_xz.shape[1], 0.06)

    root_translations_smoothed_xz = []
    for batch in range(batch_size):
        root_translations_smoothed_xz.append(
            smooth_signal(root_translations_xz[batch].detach().cpu().numpy(), margins)[None]
        )

    root_translations_smoothed_xz = torch.tensor(np.concatenate(root_translations_smoothed_xz))

    root_translations = torch.cat(
        [
            root_translations_smoothed_xz.to(root_translations_y.device),
            root_translations_y,
        ],
        dim=-1,
    )[..., [0, 2, 1]]

    return root_translations
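Since the smoother depends only on NumPy/SciPy, it can be exercised in isolation. A minimal check that per-frame deviations stay near the margin; the constraints are soft, so the bound is only approximate under finite ADMM iterations, and the trajectory values here are made up for illustration:

import numpy as np
from kimodo.motion_rep.smooth_root import smooth_signal

rng = np.random.default_rng(0)
T = 256
t = np.linspace(0.0, 1.0, T)
# Forward walk along x with sideways jitter in z.
xz = np.stack([3.0 * t, 0.05 * rng.standard_normal(T)], axis=1)

margins = np.full(T, 0.06)  # same 6 cm radius used by get_smooth_root_pos
xz_smooth = smooth_signal(xz, margins)
# Per-frame deviation from the noisy input, roughly bounded by the margin.
print(np.linalg.norm(xz_smooth - xz, axis=1).max())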
kimodo/motion_rep/stats.py ADDED
@@ -0,0 +1,123 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Feature normalization statistics (mean/std) for motion representations."""

import logging
import os
from typing import Optional

import numpy as np
import torch

log = logging.getLogger(__name__)


class Stats(torch.nn.Module):
    """Utility module for feature normalization statistics.

    Normalization follows:
    ``(data - mean) / sqrt(std**2 + eps)``
    """

    def __init__(
        self,
        folder: Optional[str] = None,
        load: bool = True,
        eps=1e-05,
    ):
        super().__init__()
        self.folder = folder
        self.eps = eps
        if folder is not None and load:
            self.load()

    def sliced(self, indices):
        """Return a new ``Stats`` object containing selected feature indices."""
        new_stats = Stats(folder=self.folder, load=False, eps=self.eps)
        new_stats.register_from_tensors(
            self.mean[..., indices].clone(),
            self.std[..., indices].clone(),
        )
        return new_stats

    def load(self):
        """Load ``mean.npy`` and ``std.npy`` from ``self.folder``."""
        mean_path = os.path.join(self.folder, "mean.npy")
        std_path = os.path.join(self.folder, "std.npy")
        if not os.path.exists(mean_path) or not os.path.exists(std_path):
            raise FileNotFoundError(
                f"Missing stats files in '{self.folder}'. Expected:\n"
                f"  - {mean_path}\n"
                f"  - {std_path}\n\n"
                "Make sure the checkpoint/stats have been downloaded and are mounted into the container.\n"
                "If you're using Docker Compose, run it from the repo root so `./:/workspace` mounts the correct directory."
            )

        mean = torch.from_numpy(np.load(mean_path))
        std = torch.from_numpy(np.load(std_path))
        self.register_from_tensors(mean, std)

    def register_from_tensors(self, mean: torch.Tensor, std: torch.Tensor):
        """Register mean/std tensors as non-persistent buffers."""
        self.register_buffer("mean", mean, persistent=False)
        self.register_buffer("std", std, persistent=False)

    def normalize(self, data: torch.Tensor) -> torch.Tensor:
        """Normalize data using the stored statistics."""
        mean = self.mean.to(device=data.device, dtype=data.dtype)
        std = self.std.to(device=data.device, dtype=data.dtype)
        # adjust std with eps
        return (data - mean) / torch.sqrt(std**2 + self.eps)

    def unnormalize(self, data: torch.Tensor) -> torch.Tensor:
        """Undo normalization using the stored statistics."""
        mean = self.mean.to(device=data.device, dtype=data.dtype)
        std = self.std.to(device=data.device, dtype=data.dtype)
        # adjust std with eps
        return data * torch.sqrt(std**2 + self.eps) + mean

    def is_loaded(self):
        """Return whether statistics are currently available."""
        return hasattr(self, "mean")

    def get_dim(self):
        """Return feature dimensionality."""
        return self.mean.shape[0]

    def save(
        self,
        folder: Optional[str] = None,
        mean: Optional[torch.Tensor] = None,
        std: Optional[torch.Tensor] = None,
    ):
        """Save statistics to ``folder`` as ``mean.npy`` and ``std.npy``."""
        if folder is None:
            folder = self.folder
        if folder is None:
            raise ValueError("No folder to save stats")

        if mean is None and std is None:
            try:
                mean = self.mean.cpu().numpy()
                std = self.std.cpu().numpy()
            except AttributeError:
                raise ValueError("Stats were not loaded")

        # don't override stats folder
        os.makedirs(folder, exist_ok=False)

        np.save(os.path.join(folder, "mean.npy"), mean)
        np.save(os.path.join(folder, "std.npy"), std)

    def __eq__(self, other):
        return (self.mean.cpu() == other.mean.cpu()).all() and (self.std.cpu() == other.std.cpu()).all()

    # should define a hash value for pytorch, as we defined __eq__
    def __hash__(self):
        # Convert mean and std to bytes for a consistent hash value
        mean_hash = hash(self.mean.detach().cpu().numpy().tobytes())
        std_hash = hash(self.std.detach().cpu().numpy().tobytes())
        return hash((mean_hash, std_hash))

    def __repr__(self):
        return f'Stats(folder="{self.folder}")'
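A self-contained example of the normalize/unnormalize round trip, registering statistics by hand instead of loading ``mean.npy``/``std.npy`` (the tensor values are illustrative):

import torch
from kimodo.motion_rep.stats import Stats

stats = Stats()  # no folder: register tensors manually
stats.register_from_tensors(
    mean=torch.tensor([0.0, 1.0, -2.0]),
    std=torch.tensor([1.0, 0.5, 2.0]),
)

x = torch.randn(4, 10, 3)    # [B, T, D] features
x_norm = stats.normalize(x)  # (x - mean) / sqrt(std**2 + eps)
assert torch.allclose(stats.unnormalize(x_norm), x, atol=1e-5)

head_stats = stats.sliced([0, 1])  # Stats restricted to the first two dims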
kimodo/pipeline/__init__.py ADDED
@@ -0,0 +1,28 @@
"""Pipeline utilities for prompt/script to Kimodo generation flows."""

from .blend_quality import (
    BlendGuardrailConfig,
    TransitionSettings,
    apply_transition_guardrails,
    harmonize_scene_transitions,
)
from .script_to_kimodo import (
    CharacterKimodoPlan,
    build_character_plan,
    generator_request_to_plans,
    run_multi_character_generation,
)
from .scheduler_runtime import SceneScheduleResult, run_scheduled_scene

__all__ = [
    "CharacterKimodoPlan",
    "BlendGuardrailConfig",
    "TransitionSettings",
    "apply_transition_guardrails",
    "harmonize_scene_transitions",
    "build_character_plan",
    "generator_request_to_plans",
    "run_multi_character_generation",
    "SceneScheduleResult",
    "run_scheduled_scene",
]
kimodo/pipeline/blend_quality.py ADDED
@@ -0,0 +1,116 @@
"""Card 7 blend quality guardrails for transition blending safety and consistency."""

from __future__ import annotations

from dataclasses import dataclass


@dataclass(frozen=True)
class TransitionSettings:
    """Transition settings passed to Kimodo generation."""

    num_transition_frames: int
    share_transition: bool
    percentage_transition_override: float


@dataclass(frozen=True)
class BlendGuardrailConfig:
    """Runtime safety bounds for transition blending."""

    min_transition_frames: int = 1
    max_transition_frames: int = 12
    min_segment_frames_for_share: int = 12
    max_transition_ratio: float = 0.30
    max_shared_window_frames: int = 24
    harmonize_window: int = 2


def _clamp(value: float, low: float, high: float) -> float:
    return max(low, min(high, value))


def apply_transition_guardrails(
    segment_frames: list[int],
    policies: list[str],
    requested: TransitionSettings,
    *,
    config: BlendGuardrailConfig = BlendGuardrailConfig(),
) -> TransitionSettings:
    """Clamp transition settings to safe ranges for short/long segments.

    Guardrails avoid transition windows that dominate short segments and reduce blending artifacts
    for scripted interactions.
    """
    if len(segment_frames) < 2:
        safe_frames = int(
            _clamp(requested.num_transition_frames, config.min_transition_frames, config.max_transition_frames)
        )
        return TransitionSettings(
            num_transition_frames=safe_frames,
            share_transition=False,
            percentage_transition_override=0.0,
        )

    min_prev = min(segment_frames[:-1])
    min_next = min(segment_frames[1:])
    # Keep at least one non-transition frame in the shortest pair.
    shortest_pair_budget = max(config.min_transition_frames, min(min_prev, min_next) - 1)

    safe_frames = int(
        _clamp(
            requested.num_transition_frames,
            config.min_transition_frames,
            min(config.max_transition_frames, shortest_pair_budget),
        )
    )

    has_cut = "cut" in policies
    can_share = (
        requested.share_transition
        and not has_cut
        and min_prev >= config.min_segment_frames_for_share
        and min_next >= config.min_segment_frames_for_share
    )

    if not can_share:
        return TransitionSettings(
            num_transition_frames=safe_frames,
            share_transition=False,
            percentage_transition_override=0.0,
        )

    safe_pct = _clamp(requested.percentage_transition_override, 0.0, config.max_transition_ratio)

    # Cap shared overlap by configured hard ceiling and shortest-pair budget.
    max_pct_from_shared_window = max(0.0, (config.max_shared_window_frames - safe_frames) / max(1, min_prev))
    max_pct_from_shortest_pair = max(0.0, (shortest_pair_budget - safe_frames) / max(1, min_prev))
    safe_pct = min(safe_pct, max_pct_from_shared_window, max_pct_from_shortest_pair)

    return TransitionSettings(
        num_transition_frames=safe_frames,
        share_transition=True,
        percentage_transition_override=float(safe_pct),
    )


def harmonize_scene_transitions(
    settings_by_character: dict[str, TransitionSettings],
    *,
    config: BlendGuardrailConfig = BlendGuardrailConfig(),
) -> dict[str, TransitionSettings]:
    """Nudge transition-frame counts toward a scene median for multi-character consistency."""
    if len(settings_by_character) < 2:
        return settings_by_character

    frame_values = sorted(setting.num_transition_frames for setting in settings_by_character.values())
    median = frame_values[len(frame_values) // 2]
    low = max(config.min_transition_frames, median - config.harmonize_window)
    high = min(config.max_transition_frames, median + config.harmonize_window)

    harmonized: dict[str, TransitionSettings] = {}
    for character_id, setting in settings_by_character.items():
        harmonized[character_id] = TransitionSettings(
            num_transition_frames=int(_clamp(setting.num_transition_frames, low, high)),
            share_transition=setting.share_transition,
            percentage_transition_override=setting.percentage_transition_override,
        )
    return harmonized
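The guardrails are pure functions, so they can be checked directly. In this illustrative call, an over-long transition request against a short first segment is clamped to the shortest-pair budget, and sharing is refused because the 8-frame segment falls below ``min_segment_frames_for_share``:

from kimodo.pipeline.blend_quality import (
    TransitionSettings,
    apply_transition_guardrails,
    harmonize_scene_transitions,
)

requested = TransitionSettings(
    num_transition_frames=20,  # longer than the 8-frame segment allows
    share_transition=True,
    percentage_transition_override=0.5,
)
safe = apply_transition_guardrails(
    segment_frames=[8, 40, 60],
    policies=["blend", "blend"],
    requested=requested,
)
# -> num_transition_frames=7 (shortest pair minus one), share_transition=False

scene = {"alice": safe, "bob": TransitionSettings(12, False, 0.0)}
print(harmonize_scene_transitions(scene))  # counts pulled toward the median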
kimodo/pipeline/scheduler_runtime.py ADDED
@@ -0,0 +1,139 @@
"""Card 8 runtime orchestration: deterministic multi-character scheduling."""

from __future__ import annotations

import logging
from dataclasses import dataclass
from typing import Any, Optional

from kimodo.pipeline.script_to_kimodo import run_multi_character_generation
from kimodo.schemas import GeneratorRequest
from kimodo.scheduler import (
    CharacterState,
    CharacterSegmentState,
    ConflictResolutionPolicy,
    DeterministicLoop,
)

LOGGER = logging.getLogger(__name__)


@dataclass(frozen=True)
class SceneScheduleResult:
    """Structured result for scheduled scene execution."""

    outputs: dict[str, dict[str, Any]]
    errors: dict[str, str]
    plans: dict[str, Any]
    state_hashes: list[str]
    interactions: list[tuple[int, str, str]]
    completed_segments: dict[str, int]


def _activate_next_segment(loop: DeterministicLoop, character_id: str, plan: Any, segment_index: int) -> None:
    """Set active segment in loop state for one character."""
    slot = loop.characters[character_id]
    slot.segment_state = CharacterSegmentState(
        character_id=character_id,
        segment_index=segment_index,
        frames_elapsed=0,
        total_frames=plan.num_frames[segment_index],
    )
    policy = plan.segment_transition_policies[segment_index]
    # Interaction target is encoded in planner request segments; set later in per-tick update.
    slot.current_state = CharacterState.BUSY if policy != "cut" else CharacterState.TRANSITIONING


def run_scheduled_scene(
    model: Any,
    request: GeneratorRequest,
    *,
    fps: float,
    seed: int = 42,
    conflict_policy: ConflictResolutionPolicy = ConflictResolutionPolicy.COOLDOWN,
    diffusion_steps: int = 100,
    cfg_weight: Optional[list[float]] = None,
    cfg_type: Optional[str] = None,
    post_processing: bool = True,
    root_margin: float = 0.04,
    constraint_resolver: Optional[Any] = None,
    continue_on_error: bool = False,
) -> SceneScheduleResult:
    """Run generation then deterministic timeline scheduling for all characters in a scene."""
    LOGGER.info("card8.run_scheduled_scene.start scene_id=%s chars=%s", request.scene_id, len(request.characters))

    outputs, errors, plans = run_multi_character_generation(
        model,
        request,
        fps=fps,
        diffusion_steps=diffusion_steps,
        cfg_weight=cfg_weight,
        cfg_type=cfg_type,
        post_processing=post_processing,
        root_margin=root_margin,
        constraint_resolver=constraint_resolver,
        continue_on_error=continue_on_error,
    )

    loop = DeterministicLoop(
        fps=int(fps),
        seed=seed,
        conflict_policy=conflict_policy,
    )

    for priority, character in enumerate(request.characters):
        loop.register_character(character.character_id, character.skeleton_type, priority=priority)

    segment_indices = {character.character_id: 0 for character in request.characters}
    completed_segments = {character.character_id: 0 for character in request.characters}

    for character in request.characters:
        plan = plans.get(character.character_id)
        if plan is None:
            continue
        if not plan.num_frames:
            continue
        _activate_next_segment(loop, character.character_id, plan, segment_index=0)
        first_segment = character.segments[0]
|
| 98 |
+
loop.characters[character.character_id].interaction_target = first_segment.interaction_target
|
| 99 |
+
|
| 100 |
+
total_scene_frames = max((plan.total_frames for plan in plans.values()), default=0)
|
| 101 |
+
state_hashes: list[str] = []
|
| 102 |
+
interactions: list[tuple[int, str, str]] = []
|
| 103 |
+
|
| 104 |
+
for _ in range(total_scene_frames):
|
| 105 |
+
tick = loop.advance_tick({})
|
| 106 |
+
state_hashes.append(loop.get_state_hash())
|
| 107 |
+
|
| 108 |
+
for winner, loser in tick.interactions:
|
| 109 |
+
interactions.append((tick.tick_number, winner, loser))
|
| 110 |
+
|
| 111 |
+
for character_id in tick.completed_segments:
|
| 112 |
+
plan = plans.get(character_id)
|
| 113 |
+
if plan is None:
|
| 114 |
+
continue
|
| 115 |
+
completed_segments[character_id] += 1
|
| 116 |
+
next_index = segment_indices[character_id] + 1
|
| 117 |
+
if next_index < len(plan.num_frames):
|
| 118 |
+
segment_indices[character_id] = next_index
|
| 119 |
+
_activate_next_segment(loop, character_id, plan, next_index)
|
| 120 |
+
source_char = next(c for c in request.characters if c.character_id == character_id)
|
| 121 |
+
loop.characters[character_id].interaction_target = source_char.segments[next_index].interaction_target
|
| 122 |
+
else:
|
| 123 |
+
loop.characters[character_id].segment_state = None
|
| 124 |
+
loop.characters[character_id].interaction_target = None
|
| 125 |
+
|
| 126 |
+
LOGGER.info(
|
| 127 |
+
"card8.run_scheduled_scene.exit scene_id=%s hashes=%s interactions=%s",
|
| 128 |
+
request.scene_id,
|
| 129 |
+
len(state_hashes),
|
| 130 |
+
len(interactions),
|
| 131 |
+
)
|
| 132 |
+
return SceneScheduleResult(
|
| 133 |
+
outputs=outputs,
|
| 134 |
+
errors=errors,
|
| 135 |
+
plans=plans,
|
| 136 |
+
state_hashes=state_hashes,
|
| 137 |
+
interactions=interactions,
|
| 138 |
+
completed_segments=completed_segments,
|
| 139 |
+
)
|
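For context, a hedged driver sketch for run_scheduled_scene. The GeneratorRequest construction is left elided because its full schema is not part of this diff, and the demo_scene helper name is hypothetical; only load_model (exported from the package root) and the SceneScheduleResult fields above are taken from the source:

    # Hypothetical sketch: generate all characters, then replay the deterministic timeline.
    from kimodo import load_model
    from kimodo.pipeline.scheduler_runtime import run_scheduled_scene


    def demo_scene(model, request):
        """Run one scene end-to-end and print a scheduling summary."""
        result = run_scheduled_scene(model, request, fps=30.0, seed=42)
        # state_hashes has one entry per tick; interactions records (tick, winner, loser).
        print(len(result.state_hashes), "ticks;", len(result.interactions), "conflict resolutions")
        for character_id, count in result.completed_segments.items():
            print(character_id, "completed", count, "segment(s)")
        return result

A caller would load a model (the load_model signature is not shown in this diff) and pass a fully populated GeneratorRequest; because the loop is seeded and the conflict policy is explicit, repeated runs with the same inputs should produce identical state_hashes sequences, which is what makes the scheduling deterministic.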