| import os |
| import torch |
| import numpy as np |
| from models.llama_model import LLaMAHF, LLaMAHFConfig |
| import models.tae as tae |
| import options.option_transformer as option_trans |
| import warnings |
|
|
| import smplx |
| from utils import bvh, quat |
| from utils.face_z_align_util import rotation_6d_to_matrix, matrix_to_axis_angle, axis_angle_to_quaternion |
|
|
|
|
# Silence every Python warning for the rest of the run (blanket filter).
warnings.filterwarnings('ignore')

# Prefer the GPU, but fall back to the CPU so the script can still start on
# hosts without CUDA instead of failing at the first `.to(comp_device)`.
comp_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Parsed command-line options; `args.seed` is read here, the remaining
# options are consumed further down the script.
args = option_trans.get_args_parser()
torch.manual_seed(args.seed)

from sentence_transformers import SentenceTransformer

# Sentence encoder handed to the sampler as its `tokenizer`. It is used for
# inference only: switch to eval mode and freeze every parameter.
t5_model = SentenceTransformer('sentencet5-xxl/')
t5_model.eval()
for p in t5_model.parameters():
    p.requires_grad = False
|
|
# Joints encoded in the motion features, and joints written to the BVH file.
_MOTION_NUM_JOINTS = 22
_SMPL_NUM_JOINTS = 24
# Highest feature column read by the converter: 8 root columns followed by
# 12 columns per joint (3 + 3 + 6), i.e. 272 for 22 joints.
_MOTION_MIN_FEATURES = 8 + 12 * _MOTION_NUM_JOINTS

_SMPL_JOINT_NAMES = [
    "Pelvis", "Left_hip", "Right_hip", "Spine1", "Left_knee", "Right_knee", "Spine2",
    "Left_ankle", "Right_ankle", "Spine3", "Left_foot", "Right_foot", "Neck",
    "Left_collar", "Right_collar", "Head", "Left_shoulder", "Right_shoulder",
    "Left_elbow", "Right_elbow", "Left_wrist", "Right_wrist", "Left_hand", "Right_hand",
]


def _as_frame_matrix(motion_data):
    """Return *motion_data* as a validated 2-D ``(frames, features)`` array."""
    if isinstance(motion_data, torch.Tensor):
        motion_data = motion_data.detach().cpu().numpy()

    if motion_data.ndim == 3 and motion_data.shape[0] == 1:
        motion_data = motion_data.squeeze(0)
    elif motion_data.ndim != 2:
        raise ValueError(f"Input motion data must be 2D or 3D with a batch size of 1, but got shape {motion_data.shape}")

    nfrm, nfeat = motion_data.shape
    if nfrm == 0:
        raise ValueError("Input motion data contains no frames")
    if nfeat < _MOTION_MIN_FEATURES:
        raise ValueError(
            f"Input motion data must have at least {_MOTION_MIN_FEATURES} features per frame, but got {nfeat}"
        )
    return motion_data


def _recover_smpl_pose(motion):
    """Convert a ``(frames, features)`` motion array to axis-angle joints and root translation."""
    njoint = _MOTION_NUM_JOINTS
    nfrm = motion.shape[0]

    # Per-joint rotations are stored as 6D vectors in the last 6 * njoint
    # columns that this converter reads.
    rotations_matrix = rotation_6d_to_matrix(
        torch.from_numpy(motion[:, 8 + 6 * njoint: 8 + 12 * njoint]).reshape(nfrm, -1, 6)
    ).numpy()

    # Columns 2:8 hold a per-frame heading *difference* (6D rotation);
    # chain the differences to get the accumulated heading for every frame.
    heading_delta = rotation_6d_to_matrix(torch.from_numpy(motion[:, 2:8])).numpy()
    heading = np.zeros_like(heading_delta)
    heading[0] = heading_delta[0]
    for i in range(1, nfrm):
        heading[i] = np.matmul(heading_delta[i], heading[i - 1])
    # The transpose of a rotation matrix is its inverse.
    inv_heading = np.transpose(heading, (0, 2, 1))

    # Apply the inverse accumulated heading to the root joint (index 0).
    rotations_matrix[:, 0, ...] = np.matmul(inv_heading, rotations_matrix[:, 0, ...])

    # Columns 0:2 are planar root velocities; they go on the x and z axes.
    velocities = np.zeros((nfrm, 3))
    velocities[:, 0] = motion[:, 0]
    velocities[:, 2] = motion[:, 1]
    # Frame i's velocity is rotated with the inverse heading of frame i - 1;
    # frame 0 is left as stored.
    velocities[1:, :] = np.matmul(inv_heading[:-1], velocities[1:, :, None]).squeeze(-1)
    root_translation = np.cumsum(velocities, axis=0)
    # Height is not integrated: it is read from the y component of the first
    # joint in the position block (columns 8 : 8 + 3 * njoint).
    root_translation[:, 1] = motion[:, 8: 8 + 3 * njoint].reshape(nfrm, -1, 3)[:, 0, 1]

    axis_angle = matrix_to_axis_angle(torch.from_numpy(rotations_matrix)).numpy()
    # The motion has 22 joints but the skeleton has 24: the last two joints
    # (the hands in the joint-name table) get zero rotation.
    padding = np.zeros((nfrm, _SMPL_NUM_JOINTS - njoint, 3))
    rots = np.concatenate([axis_angle.reshape(nfrm, njoint, 3), padding], axis=1)
    return rots, root_translation


def _load_smpl_skeleton(model_path):
    """Return ``(parents, offsets)`` of the neutral SMPL rest pose found under *model_path*."""
    model = smplx.create(model_path=model_path, model_type="smpl", gender="NEUTRAL")
    parents = model.parents.detach().cpu().numpy()
    rest_pose = model().joints.detach().cpu().numpy().squeeze()[:_SMPL_NUM_JOINTS, :]
    # Bone offset = joint position relative to its parent. The root entry is
    # computed against whatever parents[0] indexes, so it is reset to zero.
    offsets = rest_pose - rest_pose[parents]
    offsets[0] = np.array([0, 0, 0])
    return parents, offsets


def save_motion_as_bvh(motion_data, output_path, fps=30, model_path="body_models/human_model_files"):
    """
    Saves a motion tensor in the 272-dimensional format to a BVH file.
    This version is adapted from the official repository script for robustness.

    The conversion is best-effort: any error is printed together with its
    traceback and reported through the return value instead of propagating.

    Args:
        motion_data: ``torch.Tensor`` or NumPy array shaped ``(frames, features)``
            or ``(1, frames, features)`` with at least 272 features per frame.
            The values are used as given (no de-normalisation happens here).
        output_path: Destination ``.bvh`` path. A missing parent directory
            is created.
        fps: Playback rate; the BVH frame time is ``1.0 / fps``. Must be positive.
        model_path: Directory passed to ``smplx.create`` to load the SMPL model.

    Returns:
        ``True`` if the file was written, ``False`` if the conversion failed.
    """
    print(f"--- Starting direct conversion to BVH: {os.path.basename(output_path)} ---")
    try:
        if fps <= 0:
            raise ValueError(f"fps must be positive, but got {fps}")

        motion = _as_frame_matrix(motion_data)
        rots, trans = _recover_smpl_pose(motion)
        parents, offsets = _load_smpl_skeleton(model_path)

        rotations_quat = axis_angle_to_quaternion(torch.from_numpy(rots)).numpy()
        rotations_euler = np.degrees(quat.to_euler(rotations_quat, order="zyx"))

        # Every frame shares the rest-pose offsets; only the root position
        # is replaced by the recovered per-frame translation.
        positions = offsets[None].repeat(len(rots), axis=0)
        positions[:, 0] = trans

        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        bvh.save(output_path, {
            "rotations": rotations_euler,
            "positions": positions,
            "offsets": offsets,
            "parents": parents,
            "names": _SMPL_JOINT_NAMES,
            "order": "zyx",
            "frametime": 1.0 / fps,
        })
        print(f"✅ BVH file saved successfully to {output_path}")
        return True

    except Exception as e:
        # Deliberate catch-all boundary: report the failure and let the
        # caller decide what to do based on the return value.
        print(f"❌ BVH Conversion Failed. Error: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
| |
from visualization.recover_visualize import recover_from_local_position
import visualization.plot_3d_global as plot_3d

# Reject an unsupported export mode before loading checkpoints and sampling,
# rather than after the whole generation has already run.
if args.mode not in ('pos', 'rot'):
    raise ValueError(f'Invalid mode: {args.mode}')

# --- Motion autoencoder (decodes latents back to motion features) ----------
clip_range = [-30, 20]

net = tae.Causal_HumanTAE(
    hidden_size=args.hidden_size,
    down_t=args.down_t,
    stride_t=args.stride_t,
    depth=args.depth,
    dilation_growth_rate=args.dilation_growth_rate,
    activation='relu',
    latent_dim=args.latent_dim,
    clip_range=clip_range,
)

# --- Text-conditioned transformer that samples the motion latents ----------
config = LLaMAHFConfig.from_name('Normal_size')
config.block_size = 78
trans_encoder = LLaMAHF(config, args.num_diffusion_head_layers, args.latent_dim, comp_device)

print('loading checkpoint from {}'.format(args.resume_pth))
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
ckpt = torch.load(args.resume_pth, map_location='cpu')
net.load_state_dict(ckpt['net'], strict=True)
net.eval()
net.to(comp_device)

if args.resume_trans is not None:
    print('loading transformer checkpoint from {}'.format(args.resume_trans))
    # NOTE: same trust caveat as the autoencoder checkpoint above.
    ckpt = torch.load(args.resume_trans, map_location='cpu')
    # Drop a leading "module." component from parameter names so the keys
    # match the bare model.
    new_ckpt_trans = {}
    for key, value in ckpt['trans'].items():
        head, _, tail = key.partition('.')
        new_ckpt_trans[tail if head == 'module' else key] = value
    trans_encoder.load_state_dict(new_ckpt_trans, strict=True)
# Applied whether or not a transformer checkpoint was given: the sampler
# below always needs an eval-mode model living on comp_device.
trans_encoder.eval()
trans_encoder.to(comp_device)

reference_end_latent = np.load('reference_end_latent_t2m_272.npy')
reference_end_latent = torch.from_numpy(reference_end_latent).to(comp_device)

# Statistics used to undo the normalisation of the decoded features.
mean = np.load('humanml3d_272/mean_std/Mean.npy')
std = np.load('humanml3d_272/mean_std/Std.npy')

threshold = 0.1
cfg_scale = 4.0
print(f"Generating motion with CFG scale: {cfg_scale}")
# Pure inference: disable autograd so no graph is recorded while sampling
# and decoding.
with torch.no_grad():
    motion_latents = trans_encoder.sample_for_eval_CFG_inference(
        text=args.text,
        tokenizer=t5_model,
        device=comp_device,
        reference_end_latent=reference_end_latent,
        threshold=threshold,
        cfg=cfg_scale,
    )
    motion_seqs = net.forward_decoder(motion_latents)

motion = motion_seqs.squeeze(0).detach().cpu().numpy()
# Both export modes work on de-normalised features.
motion = motion * std + mean

os.makedirs('demo_output', exist_ok=True)

# The prompt doubles as the output file name. Replace path separators so a
# prompt such as "walk/run" stays a single file inside demo_output instead
# of pointing into a missing subdirectory. Other prompts are unchanged.
safe_name = args.text
for sep in filter(None, (os.sep, os.altsep)):
    safe_name = safe_name.replace(sep, '_')

if args.mode == 'pos':
    # Joint positions -> rendered MP4.
    pred_xyz = recover_from_local_position(motion, 22)
    xyz = pred_xyz.reshape(1, -1, 22, 3)
    video_path = f'demo_output/{safe_name}.mp4'
    plot_3d.draw_to_batch(xyz, [args.text], [video_path], fps=30)
    print(f"Visualized result is saved in {video_path}")
else:
    # args.mode == 'rot' (validated at the top): joint rotations -> BVH file.
    output_bvh_path = os.path.join('demo_output', f'{safe_name}.bvh')
    save_motion_as_bvh(motion, output_bvh_path, fps=30)
|
|
|
|