Initial upload of MotionStreamer code, excluding large extracted data and output folders.

0e267a7 verified 6 months ago

8.27 kB

	import os
	import torch
	import numpy as np
	from models.llama_model import LLaMAHF, LLaMAHFConfig
	import models.tae as tae
	import options.option_transformer as option_trans
	import warnings

	import smplx
	from utils import bvh, quat
	from utils.face_z_align_util import rotation_6d_to_matrix, matrix_to_axis_angle, axis_angle_to_quaternion


	# Silence every Python warning for the remainder of the run.
	warnings.filterwarnings('ignore')

	# Device that the models and tensors created below are moved to.
	comp_device = torch.device('cuda')

	##### ---- Exp dirs ---- #####
	# Load the run options and seed torch's random number generator with them.
	args = option_trans.get_args_parser()
	torch.manual_seed(args.seed)

	# Text encoder: loaded from the local 'sentencet5-xxl/' directory, switched
	# to eval mode and frozen (no parameter tracks gradients).
	from sentence_transformers import SentenceTransformer
	t5_model = SentenceTransformer('sentencet5-xxl/')
	t5_model.eval()
	t5_model.requires_grad_(False)

	def save_motion_as_bvh(motion_data, output_path, fps=30):
	    """Save a motion sequence in the 272-dimensional format as a BVH file.

	    The conversion is best-effort: any exception raised on the way is caught,
	    reported on stdout (with the traceback on stderr) and the function then
	    returns normally without having written a complete file.

	    Columns read from each frame (J = 22 joints):
	      * ``[0, 2)``          root velocity, used as the x and z components
	      * ``[2, 8)``          6D rotation describing the per-frame heading change
	      * ``[8, 8+3J)``       per-joint positions; only joint 0's y is used (height)
	      * ``[8+6J, 8+12J)``   per-joint rotations in 6D form

	    Args:
	        motion_data: ``torch.Tensor`` or array-like, either 2D
	            ``(frames, features)`` or 3D with a leading batch axis of size 1.
	            No de-normalization is applied here; pass de-normalized values.
	        output_path: Destination path of the BVH file.
	        fps: Frames per second; the BVH frame time is ``1.0 / fps``.

	    Returns:
	        None.
	    """
	    print(f"--- Starting direct conversion to BVH: {os.path.basename(output_path)} ---")
	    try:
	        # --- 1. Ensure data is a 2D NumPy array ---
	        if isinstance(motion_data, torch.Tensor):
	            motion_data = motion_data.detach().cpu().numpy()
	        motion_data = np.asarray(motion_data)

	        # Only drop the leading axis when it really is a batch of one.
	        if motion_data.ndim == 3 and motion_data.shape[0] == 1:
	            motion_data = motion_data.squeeze(0)
	        elif motion_data.ndim != 2:
	            raise ValueError(f"Input motion data must be 2D or 3D with a batch size of 1, but got shape {motion_data.shape}")

	        njoint = 22
	        min_features = 8 + 12 * njoint
	        nfrm, nfeat = motion_data.shape
	        # Reject inputs the slicing below cannot handle, with a clear message
	        # instead of an obscure reshape/index error further down.
	        if nfrm == 0:
	            raise ValueError(f"Input motion data contains no frames (shape {motion_data.shape})")
	        if nfeat < min_features:
	            raise ValueError(f"Input motion data must have at least {min_features} features per frame, but got shape {motion_data.shape}")

	        # --- 2. Recover 85-dim SMPL format from 272-dim format ---
	        poses_85dim = _recover_smpl_poses(motion_data, njoint)

	        # --- 3 & 4. Convert 85-dim SMPL to BVH data and save it ---
	        _write_smpl_poses_as_bvh(poses_85dim, output_path, fps)
	        print(f"✅ BVH file saved successfully to {output_path}")

	    except Exception as e:
	        # Deliberate best-effort: report the failure and keep the script alive.
	        print(f"❌ BVH Conversion Failed. Error: {e}")
	        import traceback
	        traceback.print_exc()


	def _recover_smpl_poses(motion_data, njoint):
	    """Rebuild 85-column SMPL pose rows from a (frames, features) motion array."""
	    nfrm = motion_data.shape[0]

	    # Per-joint rotations: 6D vectors in columns [8+6J, 8+12J) -> 3x3 matrices.
	    joint_rot_6d = torch.from_numpy(motion_data[:, 8 + 6 * njoint : 8 + 12 * njoint])
	    rotations_matrix = rotation_6d_to_matrix(joint_rot_6d.reshape(nfrm, -1, 6)).numpy()

	    # Accumulate the per-frame heading changes into one heading per frame.
	    global_heading_diff_rot = rotation_6d_to_matrix(torch.from_numpy(motion_data[:, 2:8])).numpy()
	    global_heading_rot = np.zeros_like(global_heading_diff_rot)
	    global_heading_rot[0] = global_heading_diff_rot[0]
	    for i in range(1, nfrm):
	        global_heading_rot[i] = np.matmul(global_heading_diff_rot[i], global_heading_rot[i - 1])

	    # The transpose of a rotation matrix is its inverse.
	    inv_global_heading_rot = np.transpose(global_heading_rot, (0, 2, 1))

	    # Apply the inverse heading to the root joint's rotation.
	    rotations_matrix[:, 0, ...] = np.matmul(inv_global_heading_rot, rotations_matrix[:, 0, ...])

	    # Root translation: rotate each frame's planar velocity by the previous
	    # frame's inverse heading, integrate over time, then overwrite y with the
	    # stored root height.
	    velocities_root_xy = motion_data[:, :2]
	    positions_no_heading = motion_data[:, 8 : 8 + 3 * njoint].reshape(nfrm, -1, 3)
	    height = positions_no_heading[:, 0, 1]

	    velocities_root_xyz = np.zeros((nfrm, 3))
	    velocities_root_xyz[:, 0] = velocities_root_xy[:, 0]
	    velocities_root_xyz[:, 2] = velocities_root_xy[:, 1]
	    velocities_root_xyz[1:, :] = np.matmul(inv_global_heading_rot[:-1], velocities_root_xyz[1:, :, None]).squeeze(-1)
	    root_translation = np.cumsum(velocities_root_xyz, axis=0)
	    root_translation[:, 1] = height

	    # Row layout: 72 axis-angle values (the joints beyond `njoint` are zero),
	    # 3 translation values, 10 trailing zeros.
	    axis_angle = matrix_to_axis_angle(torch.from_numpy(rotations_matrix)).numpy()
	    return np.concatenate(
	        [
	            axis_angle.reshape(nfrm, -1),
	            np.zeros((nfrm, 72 - 3 * njoint)),
	            root_translation,
	            np.zeros((nfrm, 10)),
	        ],
	        axis=-1,
	    )


	def _write_smpl_poses_as_bvh(poses_85dim, output_path, fps):
	    """Write 85-column SMPL pose rows to `output_path` as a 24-joint BVH file."""
	    rots = poses_85dim[:, :72].reshape(-1, 24, 3)
	    trans = poses_85dim[:, 72:75]

	    # Get skeleton from SMPL model: joint offsets are rest-pose positions
	    # relative to each joint's parent, with the root offset forced to zero.
	    model = smplx.create(model_path="body_models/human_model_files", model_type="smpl", gender="NEUTRAL")
	    parents = model.parents.detach().cpu().numpy()
	    rest_pose = model().joints.detach().cpu().numpy().squeeze()[:24, :]
	    offsets = rest_pose - rest_pose[parents]
	    offsets[0] = np.array([0, 0, 0])

	    rotations_quat = axis_angle_to_quaternion(torch.from_numpy(rots)).numpy()
	    rotations_euler = np.degrees(quat.to_euler(rotations_quat, order="zyx"))

	    # Every frame reuses the static offsets; only the root position moves.
	    positions = offsets[None].repeat(len(rots), axis=0)
	    positions[:, 0] = trans

	    joint_names = [
	        "Pelvis", "Left_hip", "Right_hip", "Spine1", "Left_knee", "Right_knee", "Spine2",
	        "Left_ankle", "Right_ankle", "Spine3", "Left_foot", "Right_foot", "Neck",
	        "Left_collar", "Right_collar", "Head", "Left_shoulder", "Right_shoulder",
	        "Left_elbow", "Right_elbow", "Left_wrist", "Right_wrist", "Left_hand", "Right_hand",
	    ]

	    bvh.save(output_path, {
	        "rotations": rotations_euler,
	        "positions": positions,
	        "offsets": offsets,
	        "parents": parents,
	        "names": joint_names,
	        "order": "zyx",
	        "frametime": 1.0 / fps,
	    })


	##### ---- Network ---- #####
	# Fail fast: reject an unsupported output mode before loading checkpoints
	# and running generation, rather than after all of that work is done.
	if args.mode not in ('pos', 'rot'):
	    raise ValueError(f'Invalid mode: {args.mode}')

	clip_range = [-30, 20]

	# Motion autoencoder; only its decoder is used further down.
	net = tae.Causal_HumanTAE(
	    hidden_size=args.hidden_size,
	    down_t=args.down_t,
	    stride_t=args.stride_t,
	    depth=args.depth,
	    dilation_growth_rate=args.dilation_growth_rate,
	    activation='relu',
	    latent_dim=args.latent_dim,
	    clip_range=clip_range
	)

	# Transformer that samples motion latents from the text prompt.
	config = LLaMAHFConfig.from_name('Normal_size')
	config.block_size = 78
	trans_encoder = LLaMAHF(config, args.num_diffusion_head_layers, args.latent_dim, comp_device)

	print('loading checkpoint from {}'.format(args.resume_pth))
	ckpt = torch.load(args.resume_pth, map_location='cpu')
	net.load_state_dict(ckpt['net'], strict=True)
	net.eval()
	net.to(comp_device)

	if args.resume_trans is not None:
	    print('loading transformer checkpoint from {}'.format(args.resume_trans))
	    ckpt = torch.load(args.resume_trans, map_location='cpu')
	    # Drop a leading 'module' component from parameter names (presumably left
	    # by a parallel-training wrapper; confirm against the training script) so
	    # the keys match the bare model under strict loading.
	    new_ckpt_trans = {}
	    for key, weight in ckpt['trans'].items():
	        head, _, tail = key.partition('.')
	        new_ckpt_trans[tail if head == 'module' else key] = weight
	    trans_encoder.load_state_dict(new_ckpt_trans, strict=True)
	trans_encoder.eval()
	trans_encoder.to(comp_device)

	# Latent passed to the sampler as `reference_end_latent`.
	reference_end_latent = np.load('reference_end_latent_t2m_272.npy')
	reference_end_latent = torch.from_numpy(reference_end_latent).to(comp_device)

	# Statistics used below to de-normalize the decoded motion.
	mean = np.load('humanml3d_272/mean_std/Mean.npy')
	std = np.load('humanml3d_272/mean_std/Std.npy')

	# forward inference
	threshold = 0.1
	cfg_scale = 4.0
	print(f"Generating motion with CFG scale: {cfg_scale}")
	motion_latents = trans_encoder.sample_for_eval_CFG_inference(
	    text=args.text,
	    tokenizer=t5_model,
	    device=comp_device,
	    reference_end_latent=reference_end_latent,
	    threshold=threshold,
	    cfg=cfg_scale,
	)

	# forward decode
	motion_seqs = net.forward_decoder(motion_latents)
	from visualization.recover_visualize import recover_from_local_position
	import visualization.plot_3d_global as plot_3d

	# Drop the batch axis and move the result to a NumPy array on the CPU.
	motion = motion_seqs.squeeze(0)
	motion = motion.detach().cpu().numpy()

	# exist_ok avoids the check-then-create race of a separate exists() test.
	os.makedirs('demo_output', exist_ok=True)

	# NOTE(review): args.text is interpolated into the output file names
	# unchanged, so a prompt containing a path separator yields a nested path.
	if args.mode == 'pos':
	    # Option1: recover from joint position
	    pred_xyz = recover_from_local_position(motion * std + mean, 22)
	    xyz = pred_xyz.reshape(1, -1, 22, 3)
	    pose_vis = plot_3d.draw_to_batch(xyz, [args.text], [f'demo_output/{args.text}.mp4'], fps=30)
	    print(f"Visualized result is saved in demo_output/{args.text}.mp4")

	elif args.mode == 'rot':
	    # De-normalize the motion data to its original scale
	    motion = motion * std + mean

	    # Define the output path for the new BVH file
	    output_bvh_path = os.path.join('demo_output', f'{args.text}.bvh')

	    # Save the BVH file directly from the de-normalized features
	    save_motion_as_bvh(motion, output_bvh_path, fps=30)

	else:
	    # Unreachable after the check at the top; kept as a safety net.
	    raise ValueError(f'Invalid mode: {args.mode}')