| import os |
| import torch |
| import numpy as np |
| import warnings |
|
|
| import smplx |
|
|
| from models.llama_model import LLaMAHF, LLaMAHFConfig |
| import models.tae as tae |
| import options.option_transformer as option_trans |
| from utils import bvh, quat |
| from utils.face_z_align_util import rotation_6d_to_matrix, matrix_to_axis_angle, axis_angle_to_quaternion |
|
|
# Silence every warning category for the whole process; this runs at import
# time, so it also affects any code that imports this module.
warnings.filterwarnings(action='ignore')
|
|
|
|
class MockTextEncoder:
    """Stand-in text encoder that maps each input text to a one-hot vector.

    The class exposes a small module-like surface (``to``, ``eval``,
    ``parameters``) whose methods do nothing, plus ``encode``, which returns a
    NumPy array with one row per text and exactly one non-zero entry per row.

    Args:
        dim: Width of every embedding row.
    """

    def __init__(self, dim: int = 768):
        self.dim = dim

    def to(self, device):
        """Ignore ``device`` and return ``self`` so calls can be chained."""
        return self

    def eval(self):
        """Do nothing and return ``self``."""
        return self

    def parameters(self):
        """Return an empty list: the mock holds no trainable parameters."""
        return []

    def encode(self, text):
        """Return one-hot embeddings for ``text``.

        Args:
            text: A single text, or a list/tuple of texts.

        Returns:
            A float32 NumPy array of shape ``(number_of_texts, dim)``. The hot
            index of a row depends only on the text and ``dim``.
        """
        # Imported here so the block is self-contained.
        import hashlib

        texts = list(text) if isinstance(text, (list, tuple)) else [text]
        embeddings = torch.zeros(len(texts), self.dim)
        for row, item in enumerate(texts):
            # The built-in hash() of a str is salted per interpreter process
            # (unless PYTHONHASHSEED is fixed), so it would pick a different
            # index on every run. A SHA-256 digest is stable across runs.
            digest = hashlib.sha256(str(item).encode("utf-8")).digest()
            hot_index = int.from_bytes(digest[:8], "big") % self.dim
            embeddings[row, hot_index] = 1.0
        return embeddings.numpy()
|
|
|
|
| |
def save_motion_as_bvh(motion_data, output_path, fps=30):
    """Convert a motion feature array to joint rotations and write a BVH file.

    ``motion_data`` may be a torch tensor or a NumPy array, either 2D
    ``(frames, features)`` or 3D with a leading dimension of 1. Feature columns
    are read as follows (with 22 joints):

    * ``[0:2]``   -- planar root velocity, placed into components 0 and 2 of a
      3-vector;
    * ``[2:8]``   -- per-frame heading-change rotation in 6D form;
    * ``[8:74]``  -- joint positions, of which only joint 0's component 1 is
      used (as the root height);
    * ``[140:272]`` -- per-joint rotations in 6D form.

    Any exception raised along the way is caught and reported on stdout
    together with a traceback; the function returns ``None`` in every case.

    Args:
        motion_data: Motion features as described above.
        output_path: Destination path passed to ``bvh.save``.
        fps: Frame rate; the BVH frame time is ``1.0 / fps``.
    """
    print(f"--- Starting direct conversion to BVH: {os.path.basename(output_path)} ---")
    try:
        if isinstance(motion_data, torch.Tensor):
            motion_data = motion_data.detach().cpu().numpy()
        if motion_data.ndim == 3 and motion_data.shape[0] == 1:
            motion_data = motion_data.squeeze(0)
        elif motion_data.ndim != 2:
            raise ValueError(f"Input motion data must be 2D, but got shape {motion_data.shape}")

        njoint = 22
        nfrm, _ = motion_data.shape

        # Per-joint rotations: 6D -> 3x3 matrices.
        joint_rot_6d = torch.from_numpy(motion_data[:, 8+6*njoint : 8+12*njoint]).reshape(nfrm, -1, 6)
        rotations_matrix = rotation_6d_to_matrix(joint_rot_6d).numpy()

        # Heading: per-frame deltas are chained by left-multiplying onto the
        # previous frame's accumulated heading.
        heading_delta_6d = torch.from_numpy(motion_data[:, 2:8])
        heading_delta = rotation_6d_to_matrix(heading_delta_6d).numpy()
        heading = np.zeros_like(heading_delta)
        heading[0] = heading_delta[0]
        for frame in range(1, nfrm):
            heading[frame] = np.matmul(heading_delta[frame], heading[frame - 1])

        planar_velocity = motion_data[:, :2]
        joint_positions = motion_data[:, 8 : 8+3*njoint].reshape(nfrm, -1, 3)
        height = joint_positions[:, 0, 1]

        # Transposing the 3x3 matrices gives the inverse heading, which is
        # applied to the root joint's rotation.
        heading_inv = np.transpose(heading, (0, 2, 1))
        rotations_matrix[:, 0, ...] = np.matmul(heading_inv, rotations_matrix[:, 0, ...])

        # Root velocity as a 3-vector; from frame 1 onward it is rotated by the
        # previous frame's inverse heading before being summed into a path.
        root_velocity = np.zeros((nfrm, 3))
        root_velocity[:, 0] = planar_velocity[:, 0]
        root_velocity[:, 2] = planar_velocity[:, 1]
        root_velocity[1:, :] = np.matmul(heading_inv[:-1], root_velocity[1:, :, None]).squeeze(-1)
        root_translation = np.cumsum(root_velocity, axis=0)
        root_translation[:, 1] = height

        # 22 joints of axis-angle fill the first 66 of 72 pose values; the
        # remaining 6 (two more joints) stay zero.
        axis_angle = matrix_to_axis_angle(torch.from_numpy(rotations_matrix)).numpy().reshape(nfrm, -1)
        poses_24_joints = np.zeros((nfrm, 72))
        poses_24_joints[:, :66] = axis_angle

        # Skeleton: parent indices and rest-pose offsets from the SMPL model.
        model = smplx.create(model_path="body_models/human_model_files", model_type="smpl", gender="NEUTRAL")
        parents = model.parents.detach().cpu().numpy()
        rest_pose = model().joints.detach().cpu().numpy().squeeze()[:24,:]
        offsets = rest_pose - rest_pose[parents]
        offsets[0] = np.array([0,0,0])

        pose_tensor = torch.from_numpy(poses_24_joints.reshape(-1, 24, 3))
        rotations_quat = axis_angle_to_quaternion(pose_tensor).numpy()
        rotations_euler = np.degrees(quat.to_euler(rotations_quat, order="zyx"))

        # Only joint 0 receives a per-frame position; the others stay at zero.
        positions = np.zeros_like(rotations_quat[..., :3])
        positions[:, 0] = root_translation

        joint_names = ["Pelvis", "Left_hip", "Right_hip", "Spine1", "Left_knee", "Right_knee", "Spine2", "Left_ankle", "Right_ankle", "Spine3", "Left_foot", "Right_foot", "Neck", "Left_collar", "Right_collar", "Head", "Left_shoulder", "Right_shoulder", "Left_elbow", "Right_elbow", "Left_wrist", "Right_wrist", "Left_hand", "Right_hand"]
        bvh_payload = {
            "rotations": rotations_euler,
            "positions": positions,
            "offsets": offsets,
            "parents": parents,
            "names": joint_names,
            "order": "zyx",
            "frametime": 1.0 / fps,
        }
        bvh.save(output_path, bvh_payload)
        print(f"✅ BVH file saved successfully to {output_path}")
    except Exception as e:
        # Best-effort export: report the failure and return normally.
        print(f"❌ BVH Conversion Failed. Error: {e}")
        import traceback
        traceback.print_exc()
|
|
|
|
| def _to_prompt_tensor(embedding: np.ndarray, device: torch.device) -> torch.Tensor: |
| tensor = torch.from_numpy(embedding).float() if isinstance(embedding, np.ndarray) else embedding.float() |
| if tensor.dim() == 1: |
| tensor = tensor.unsqueeze(0) |
| return tensor.to(device) |
|
|
|
|
def _set_prompt(trans: LLaMAHF, prompt_feat: torch.Tensor) -> None:
    """Install ``prompt_feat`` on ``trans``, discarding any prompt set earlier.

    Calls ``trans.clear_prompt()`` and then ``trans.set_prompt(prompt_feat)``.
    """
    trans.clear_prompt()
    trans.set_prompt(prompt_feat)
|
|
|
|
def _states_for_prompt(trans: LLaMAHF, latents: torch.Tensor, prompt_feat: torch.Tensor) -> torch.Tensor:
    """Run ``trans`` on ``latents`` under ``prompt_feat``, dropping the last step.

    The prompt is installed via ``_set_prompt`` and is left on ``trans`` when
    this function returns. The model is called as ``trans(latents,
    feature=None)`` and the result is returned with the final index along
    dimension 1 removed.
    """
    _set_prompt(trans, prompt_feat)
    states = trans(latents, feature=None)
    all_but_last = states[:, :-1, :]
    return all_but_last
|
|
|
|
def _predict_sequence(
    trans: LLaMAHF,
    cond_seq: torch.Tensor,
    uncond_seq: torch.Tensor,
    cfg_scale: float,
    temperature: float,
) -> torch.Tensor:
    """Sample one output vector per position from ``trans.diff_loss``.

    The conditional and unconditional state sequences are stacked along the
    batch axis (conditional first), flattened to one row per position and
    passed to ``trans.diff_loss.sample``. Only the leading, conditional part of
    the sampler output is kept.

    Args:
        trans: Model providing ``diff_loss`` and ``config.n_embd``.
        cond_seq: 3-D states obtained with the conditional prompt.
        uncond_seq: States obtained with the unconditional prompt; stacked
            below ``cond_seq``, so it is expected to have the same shape.
        cfg_scale: Guidance scale forwarded to the sampler as ``cfg``.
        temperature: Sampling temperature forwarded to the sampler.

    Returns:
        Tensor of shape ``(batch, seq_len, trans.diff_loss.in_channels)``. When
        the input has zero positions, ``seq_len`` is 1.
    """
    batch, seq_len, _ = cond_seq.shape
    if seq_len == 0:
        # Nothing to condition on yet: fall back to a single all-zero state.
        # new_zeros keeps cond_seq's dtype and device, so the fallback matches
        # whatever precision the states normally arrive in.
        cond_seq = cond_seq.new_zeros(batch, 1, trans.config.n_embd)
        uncond_seq = torch.zeros_like(cond_seq)
        seq_len = 1

    stacked = torch.cat([cond_seq, uncond_seq], dim=0)
    flat = stacked.reshape(stacked.size(0) * seq_len, -1)
    trans.diff_loss.set_sequence_layout(stacked.size(0), seq_len)
    sampled = trans.diff_loss.sample(flat, temperature=temperature, cfg=cfg_scale)

    if cfg_scale != 1.0:
        # Keep the first half of the rows (the conditional rows were stacked
        # first; presumably the sampler preserves row order -- confirm).
        cond_flat, _ = sampled.chunk(2, dim=0)
    else:
        cond_flat = sampled[: batch * seq_len, :]

    # reshape, unlike view, also accepts a non-contiguous sampler output.
    return cond_flat.reshape(batch, seq_len, trans.diff_loss.in_channels)
|
|
|
|
def _sample_next_token(
    trans: LLaMAHF,
    current_seq: torch.Tensor,
    latent_dim: int,
    cond_prompt: torch.Tensor,
    uncond_prompt: torch.Tensor,
    temperature: float,
    cfg_scale: float,
    device: torch.device,
) -> torch.Tensor:
    """Return ``current_seq`` with one newly sampled token appended.

    ``current_seq`` is given a batch axis of size 1 and extended with a single
    all-zero slot of width ``latent_dim`` (created on ``device``). States are
    computed once per prompt, passed to ``_predict_sequence``, and the last
    predicted position becomes the new token. The conditional prompt is left
    installed on ``trans``.

    Returns:
        A new tensor with one more row than ``current_seq``; the input is not
        modified.
    """
    zero_slot = torch.zeros(1, 1, latent_dim, device=device)
    latents = torch.cat([current_seq.unsqueeze(0), zero_slot], dim=1)

    cond_states = _states_for_prompt(trans, latents, cond_prompt)
    uncond_states = _states_for_prompt(trans, latents, uncond_prompt)
    # The unconditional pass ran last, so put the conditional prompt back.
    _set_prompt(trans, cond_prompt)

    predictions = _predict_sequence(
        trans=trans,
        cond_seq=cond_states,
        uncond_seq=uncond_states,
        cfg_scale=cfg_scale,
        temperature=temperature,
    )

    # Batch element 0, last position, kept 2-D so it can be appended directly.
    appended_row = predictions[0, -1:, :]
    return torch.cat([current_seq, appended_row], dim=0)
|
|
|
|
def _refine_sequence(
    trans: LLaMAHF,
    sequence: torch.Tensor,
    frozen_prefix: int,
    cond_prompt: torch.Tensor,
    uncond_prompt: torch.Tensor,
    temperature: float,
    cfg_scale: float,
    device: torch.device,
) -> torch.Tensor:
    """Re-sample every token from index ``frozen_prefix`` onward, left to right.

    For each such index, a token is sampled with ``_sample_next_token`` from
    the rows before it (which already include earlier replacements) and
    written over the row at that index.

    Returns:
        ``sequence`` itself; it is modified in place.
    """
    sampler_kwargs = dict(
        trans=trans,
        latent_dim=sequence.size(1),
        cond_prompt=cond_prompt,
        uncond_prompt=uncond_prompt,
        temperature=temperature,
        cfg_scale=cfg_scale,
        device=device,
    )
    for position in range(frozen_prefix, sequence.shape[0]):
        extended = _sample_next_token(current_seq=sequence[:position], **sampler_kwargs)
        # The freshly sampled token is the last row of the extended history.
        sequence[position] = extended[-1]
    return sequence
|
|
|
|
def generate_motion_latents(
    trans: LLaMAHF,
    initial_tokens: torch.Tensor,
    latent_dim: int,
    cond_prompt: torch.Tensor,
    uncond_prompt: torch.Tensor,
    num_new_tokens: int,
    cfg_scale: float,
    temperature: float,
    device: torch.device,
) -> torch.Tensor:
    """Extend ``initial_tokens`` by ``num_new_tokens`` latents, then refine them.

    ``trans`` is switched to eval mode and given the conditional prompt. Tokens
    are appended one at a time with ``_sample_next_token``; afterwards a copy
    of the result goes through ``_refine_sequence`` with the initial tokens as
    the frozen prefix.

    Returns:
        The refined sequence, with ``initial_tokens.shape[0] + num_new_tokens``
        rows. ``initial_tokens`` itself is not modified.
    """
    trans.eval()
    _set_prompt(trans, cond_prompt)

    shared_kwargs = dict(
        trans=trans,
        cond_prompt=cond_prompt,
        uncond_prompt=uncond_prompt,
        temperature=temperature,
        cfg_scale=cfg_scale,
        device=device,
    )

    # First pass: grow the sequence token by token.
    seq = initial_tokens.clone()
    for _ in range(num_new_tokens):
        seq = _sample_next_token(current_seq=seq, latent_dim=latent_dim, **shared_kwargs)

    # Second pass: re-sample everything after the initial tokens, on a copy.
    return _refine_sequence(
        sequence=seq.clone(),
        frozen_prefix=initial_tokens.shape[0],
        **shared_kwargs,
    )
|
|
|
|
if __name__ == '__main__':
    # Demo entry point: build the models, generate motion latents for
    # args.text, decode them to motion features and export a BVH file.
    comp_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args = option_trans.get_args_parser()
    torch.manual_seed(args.seed)

    print("Loading models for MotionStreamer...")
    t5_model = MockTextEncoder()
    t5_model.eval()
    for p in t5_model.parameters():
        p.requires_grad = False

    print("Loading Causal TAE (t2m_babel) checkpoint...")
    tae_net = tae.Causal_HumanTAE(
        hidden_size=1024, down_t=2, stride_t=2, depth=3, dilation_growth_rate=3,
        latent_dim=16, clip_range=[-30, 20]
    )
    # NOTE(security): torch.load unpickles the file, so only load checkpoints
    # from a trusted source.
    tae_ckpt = torch.load('Causal_TAE_t2m_babel/net_last.pth', map_location='cpu')
    tae_net.load_state_dict(tae_ckpt['net'], strict=True)
    tae_net.eval()
    tae_net.to(comp_device)

    config = LLaMAHFConfig.from_name('Normal_size')
    trans_encoder = LLaMAHF(
        config=config,
        num_diffusion_head_layers=args.num_diffusion_head_layers,
        input_token_dim=args.latent_dim,
        device=comp_device,
    )
    # NOTE(review): this script never loads a checkpoint into trans_encoder,
    # so it runs with whatever weights the constructor produces -- confirm
    # that this is intended.
    trans_encoder.eval()
    trans_encoder.to(comp_device)

    print("Loading mean/std from BABEL dataset...")
    mean = np.load('babel_272/t2m_babel_mean_std/Mean.npy')
    std = np.load('babel_272/t2m_babel_mean_std/Std.npy')

    # NOTE(review): the TAE above is built with latent_dim=16 while the
    # transformer uses args.latent_dim -- presumably these must agree; verify.
    latent_dim = args.latent_dim
    motion_history = torch.empty(0, latent_dim, device=comp_device)
    cfg_scale = 10.0
    temperature = 1.3
    unit_length = 4
    target_tokens = 240 // unit_length

    print(f"Generating motion for text: '{args.text}' with CFG scale: {cfg_scale}")
    text_embedding = _to_prompt_tensor(t5_model.encode(args.text), comp_device)
    empty_embedding = _to_prompt_tensor(t5_model.encode(''), comp_device)
    num_new_tokens = max(0, target_tokens - motion_history.shape[0])

    # Generation and decoding are inference only, so both run without
    # autograd bookkeeping.
    with torch.no_grad():
        generated_seq = generate_motion_latents(
            trans=trans_encoder,
            initial_tokens=motion_history,
            latent_dim=latent_dim,
            cond_prompt=text_embedding,
            uncond_prompt=empty_embedding,
            num_new_tokens=num_new_tokens,
            cfg_scale=cfg_scale,
            temperature=temperature,
            device=comp_device,
        )
        motion_latents = generated_seq.unsqueeze(0)

        print("Decoding latents to full motion...")
        motion_seqs = tae_net.forward_decoder(motion_latents)

    motion = motion_seqs.detach().cpu().numpy()
    motion_denormalized = motion * std + mean

    output_dir = 'demo_output_streamer'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # The prompt text becomes part of the filename: spaces turn into
    # underscores, and path separators are replaced too so the text cannot
    # point into a directory that does not exist.
    safe_text = args.text.replace(" ", "_")
    for separator in (os.sep, os.altsep):
        if separator:
            safe_text = safe_text.replace(separator, "_")

    output_bvh_path = os.path.join(output_dir, f'{safe_text}_cfg{cfg_scale}.bvh')
    save_motion_as_bvh(motion_denormalized, output_bvh_path, fps=30)
|
|