| import os |
| import sys |
| from pathlib import Path |
| from typing import Optional |
|
|
| ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) |
| sys.path.append(ROOT_DIR) |
|
|
| import torch |
| import torch.nn as nn |
| import numpy as np |
| from PIL import Image |
| import imageio |
| import json |
| from diffsynth import WanVideoAstraPipeline, ModelManager |
| import argparse |
| from torchvision.transforms import v2 |
| from einops import rearrange |
| from scipy.spatial.transform import Rotation as R |
| import random |
| import copy |
| from datetime import datetime |
|
|
| VALID_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg"} |
| class InlineVideoEncoder: |
| """Encode condition images/videos into VAE latents using the loaded pipeline.""" |
|
|
| def __init__(self, pipe: WanVideoAstraPipeline, device="cuda"): |
| self.device = getattr(pipe, "device", device) |
| self.tiler_kwargs = {"tiled": True, "tile_size": (34, 34), "tile_stride": (18, 16)} |
| self.frame_process = v2.Compose([ |
| v2.ToTensor(), |
| v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), |
| ]) |
|
|
| self.pipe = pipe |
|
|
| @staticmethod |
| def _crop_and_resize(image: Image.Image) -> Image.Image: |
| """Resize the image to the working resolution (832x480); no cropping is currently applied.""" |
| target_w, target_h = 832, 480 |
| return v2.functional.resize( |
| image, |
| (round(target_h), round(target_w)), |
| interpolation=v2.InterpolationMode.BILINEAR, |
| ) |
|
|
| def preprocess_frame(self, image: Image.Image) -> torch.Tensor: |
| image = image.convert("RGB") |
| image = self._crop_and_resize(image) |
| return self.frame_process(image) |
|
|
| def load_video_frames(self, video_path: Path) -> Optional[torch.Tensor]: |
| reader = imageio.get_reader(str(video_path)) |
| frames = [] |
| for frame_data in reader: |
| frame = Image.fromarray(frame_data) |
| frames.append(self.preprocess_frame(frame)) |
| reader.close() |
|
|
| if not frames: |
| return None |
|
|
| frames = torch.stack(frames, dim=0) |
| return rearrange(frames, "T C H W -> C T H W") |
|
|
| def encode_frames_to_latents(self, frames: torch.Tensor) -> torch.Tensor: |
| frames = frames.unsqueeze(0).to(self.device, dtype=torch.bfloat16) |
| with torch.no_grad(): |
| latents = self.pipe.encode_video(frames, **self.tiler_kwargs)[0] |
|
|
| if latents.dim() == 5 and latents.shape[0] == 1: |
| latents = latents.squeeze(0) |
| return latents.cpu() |
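|
| # Shape sketch (an illustration, not exact for every input): with 480x832 RGB frames the |
| # Wan VAE used here compresses spatially by 8x, so a [3, T, 480, 832] stack encodes to |
| # roughly [16, T', 60, 104] latents, where T' is the temporally compressed length |
| # (the rest of this script assumes a 4x temporal ratio via time_compression_ratio). |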
| |
| def image_to_frame_stack( |
| image_path: Path, |
| encoder: InlineVideoEncoder, |
| repeat_count: int = 10 |
| ) -> torch.Tensor: |
| """Repeat a single image into a tensor with specified number of frames, shape [C, T, H, W]""" |
| if image_path.suffix.lower() not in VALID_IMAGE_EXTENSIONS: |
| raise ValueError(f"Unsupported image format: {image_path.suffix}") |
|
|
| image = Image.open(str(image_path)) |
| frame = encoder.preprocess_frame(image) |
| frames = torch.stack([frame for _ in range(repeat_count)], dim=0) |
| return rearrange(frames, "T C H W -> C T H W") |
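|
| # Usage sketch (hypothetical path): turn one conditioning image into a 10-frame stack. |
| #   encoder = InlineVideoEncoder(pipe=pipe, device="cuda") |
| #   frames = image_to_frame_stack(Path("example.png"), encoder, repeat_count=10) |
| #   frames.shape  # torch.Size([3, 10, 480, 832]) |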
|
|
|
|
| def load_or_encode_condition( |
| condition_pth_path: Optional[str], |
| condition_video: Optional[str], |
| condition_image: Optional[str], |
| start_frame: int, |
| num_frames: int, |
| device: str, |
| pipe: WanVideoAstraPipeline, |
| ) -> tuple[torch.Tensor, dict]: |
| if condition_pth_path: |
| return load_encoded_video_from_pth(condition_pth_path, start_frame, num_frames) |
|
|
| encoder = InlineVideoEncoder(pipe=pipe, device=device) |
|
|
| if condition_video: |
| video_path = Path(condition_video).expanduser().resolve() |
| if not video_path.exists(): |
| raise FileNotFoundError(f"File not found: {video_path}") |
| frames = encoder.load_video_frames(video_path) |
| if frames is None: |
| raise ValueError(f"no valid frames in {video_path}") |
| elif condition_image: |
| image_path = Path(condition_image).expanduser().resolve() |
| if not image_path.exists(): |
| raise FileNotFoundError(f"File not found: {image_path}") |
| frames = image_to_frame_stack(image_path, encoder, repeat_count=10) |
| else: |
| raise ValueError("condition video or image is needed for video generation.") |
|
|
| latents = encoder.encode_frames_to_latents(frames) |
| encoded_data = {"latents": latents} |
|
|
| if start_frame + num_frames > latents.shape[1]: |
| raise ValueError( |
| f"Not enough frames after encoding: requested {start_frame + num_frames}, available {latents.shape[1]}" |
| ) |
|
|
| condition_latents = latents[:, start_frame:start_frame + num_frames, :, :] |
| return condition_latents, encoded_data |
|
|
|
|
|
|
| def compute_relative_pose_matrix(pose1, pose2): |
| """ |
| Compute relative pose between two consecutive frames, return 3x4 camera matrix [R_rel | t_rel] |
| |
| Args: |
| pose1: Camera pose of frame i, shape (7,) array [tx1, ty1, tz1, qx1, qy1, qz1, qw1] |
| pose2: Camera pose of frame i+1, shape (7,) array [tx2, ty2, tz2, qx2, qy2, qz2, qw2] |
| |
| Returns: |
| relative_matrix: 3x4 relative pose matrix, |
| first 3 columns are rotation matrix R_rel, |
| last column is translation vector t_rel |
| """ |
| |
| t1 = pose1[:3] |
| q1 = pose1[3:] |
| t2 = pose2[:3] |
| q2 = pose2[3:] |
| |
| |
| rot1 = R.from_quat(q1) |
| rot2 = R.from_quat(q2) |
| rot_rel = rot2 * rot1.inv() |
| R_rel = rot_rel.as_matrix() |
| |
| |
| R1_T = rot1.as_matrix().T |
| t_rel = R1_T @ (t2 - t1) |
| |
| |
| relative_matrix = np.hstack([R_rel, t_rel.reshape(3, 1)]) |
| |
| return relative_matrix |
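|
| # Worked example (hypothetical values): two poses related by a pure 1-unit translation |
| # along x with identity rotation (quaternions in scipy's [qx, qy, qz, qw] order): |
| #   pose1 = np.array([0, 0, 0, 0, 0, 0, 1], dtype=np.float32) |
| #   pose2 = np.array([1, 0, 0, 0, 0, 0, 1], dtype=np.float32) |
| #   compute_relative_pose_matrix(pose1, pose2) |
| #   # -> [[1, 0, 0, 1], [0, 1, 0, 0], [0, 0, 1, 0]]  (identity rotation, t_rel = [1, 0, 0]) |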
|
|
| def load_encoded_video_from_pth(pth_path, start_frame=0, num_frames=10): |
| """Load pre-encoded video data from pth file""" |
| print(f"Loading encoded video from {pth_path}") |
| |
| encoded_data = torch.load(pth_path, weights_only=False, map_location="cpu") |
| full_latents = encoded_data['latents'] |
| |
| print(f"Full latents shape: {full_latents.shape}") |
| print(f"Extracting frames {start_frame} to {start_frame + num_frames}") |
| |
| if start_frame + num_frames > full_latents.shape[1]: |
| raise ValueError(f"Not enough frames: requested {start_frame + num_frames}, available {full_latents.shape[1]}") |
| |
| condition_latents = full_latents[:, start_frame:start_frame + num_frames, :, :] |
| print(f"Extracted condition latents shape: {condition_latents.shape}") |
| |
| return condition_latents, encoded_data |
|
|
| def compute_relative_pose(pose_a, pose_b, use_torch=False): |
| """Compute relative pose matrix of camera B with respect to camera A""" |
| assert pose_a.shape == (4, 4), f"Camera A extrinsic matrix should be (4,4), got {pose_a.shape}" |
| assert pose_b.shape == (4, 4), f"Camera B extrinsic matrix should be (4,4), got {pose_b.shape}" |
| |
| if use_torch: |
| if not isinstance(pose_a, torch.Tensor): |
| pose_a = torch.from_numpy(pose_a).float() |
| if not isinstance(pose_b, torch.Tensor): |
| pose_b = torch.from_numpy(pose_b).float() |
| |
| pose_a_inv = torch.inverse(pose_a) |
| relative_pose = torch.matmul(pose_b, pose_a_inv) |
| else: |
| if not isinstance(pose_a, np.ndarray): |
| pose_a = np.array(pose_a, dtype=np.float32) |
| if not isinstance(pose_b, np.ndarray): |
| pose_b = np.array(pose_b, dtype=np.float32) |
| |
| pose_a_inv = np.linalg.inv(pose_a) |
| relative_pose = np.matmul(pose_b, pose_a_inv) |
| |
| return relative_pose |
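|
| # Sanity-check sketch: identical extrinsics give the identity as the relative pose. |
| #   T = np.eye(4, dtype=np.float32) |
| #   np.allclose(compute_relative_pose(T, T), np.eye(4))  # True |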
|
|
|
|
| def replace_dit_model_in_manager(): |
| """Replace DiT model class with MoE version""" |
| from diffsynth.models.wan_video_dit_moe import WanModelMoe |
| from diffsynth.configs.model_config import model_loader_configs |
| |
| for i, config in enumerate(model_loader_configs): |
| keys_hash, keys_hash_with_shape, model_names, model_classes, model_resource = config |
| |
| if 'wan_video_dit' in model_names: |
| new_model_names = [] |
| new_model_classes = [] |
| |
| for name, cls in zip(model_names, model_classes): |
| if name == 'wan_video_dit': |
| new_model_names.append(name) |
| new_model_classes.append(WanModelMoe) |
| print(f"Replaced model class: {name} -> WanModelMoe") |
| else: |
| new_model_names.append(name) |
| new_model_classes.append(cls) |
| |
| model_loader_configs[i] = (keys_hash, keys_hash_with_shape, new_model_names, new_model_classes, model_resource) |
|
|
|
|
| def add_framepack_components(dit_model): |
| """Add FramePack related components""" |
| if not hasattr(dit_model, 'clean_x_embedder'): |
| inner_dim = dit_model.blocks[0].self_attn.q.weight.shape[0] |
| |
| class CleanXEmbedder(nn.Module): |
| def __init__(self, inner_dim): |
| super().__init__() |
| self.proj = nn.Conv3d(16, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2)) |
| self.proj_2x = nn.Conv3d(16, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4)) |
| self.proj_4x = nn.Conv3d(16, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8)) |
| |
| def forward(self, x, scale="1x"): |
| if scale == "1x": |
| x = x.to(self.proj.weight.dtype) |
| return self.proj(x) |
| elif scale == "2x": |
| x = x.to(self.proj_2x.weight.dtype) |
| return self.proj_2x(x) |
| elif scale == "4x": |
| x = x.to(self.proj_4x.weight.dtype) |
| return self.proj_4x(x) |
| else: |
| raise ValueError(f"Unsupported scale: {scale}") |
| |
| dit_model.clean_x_embedder = CleanXEmbedder(inner_dim) |
| model_dtype = next(dit_model.parameters()).dtype |
| dit_model.clean_x_embedder = dit_model.clean_x_embedder.to(dtype=model_dtype) |
| print("Added FramePack clean_x_embedder component") |
|
|
|
|
| def add_moe_components(dit_model, moe_config): |
| """Add MoE related components - corrected version""" |
| if not hasattr(dit_model, 'moe_config'): |
| dit_model.moe_config = moe_config |
| print("Added MoE config to model") |
| dit_model.top_k = moe_config.get("top_k", 1) |
|
|
| |
| dim = dit_model.blocks[0].self_attn.q.weight.shape[0] |
| unified_dim = moe_config.get("unified_dim", 25) |
| num_experts = moe_config.get("num_experts", 4) |
| from diffsynth.models.wan_video_dit_moe import ModalityProcessor, MultiModalMoE |
| dit_model.sekai_processor = ModalityProcessor("sekai", 13, unified_dim) |
| dit_model.nuscenes_processor = ModalityProcessor("nuscenes", 8, unified_dim) |
| dit_model.openx_processor = ModalityProcessor("openx", 13, unified_dim) |
| dit_model.global_router = nn.Linear(unified_dim, num_experts) |
|
|
|
|
| for i, block in enumerate(dit_model.blocks): |
| |
| block.moe = MultiModalMoE( |
| unified_dim=unified_dim, |
| output_dim=dim, |
| num_experts=moe_config.get("num_experts", 4), |
| top_k=moe_config.get("top_k", 2) |
| ) |
| |
| print(f"Block {i} added MoE component (unified_dim: {unified_dim}, experts: {moe_config.get('num_experts', 4)})") |
|
|
|
|
| def generate_sekai_camera_embeddings_sliding( |
| cam_data, |
| start_frame, |
| initial_condition_frames, |
| new_frames, |
| total_generated, |
| use_real_poses=True, |
| direction="left"): |
| """ |
| Generate camera embeddings for Sekai dataset - sliding window version |
| |
| Args: |
| cam_data: Dictionary containing Sekai camera extrinsic parameters, key 'extrinsic' corresponds to an N*4*4 numpy array |
| start_frame: Current generation start frame index |
| initial_condition_frames: Initial condition frame count |
| new_frames: Number of new frames to generate this time |
| total_generated: Total frames already generated |
| use_real_poses: Whether to use real Sekai camera poses |
| direction: Camera movement direction, default "left" |
| |
| Returns: |
| camera_embedding: Torch tensor of shape (M, 3*4 + 1), where M is the total number of generated frames |
| """ |
| time_compression_ratio = 4 |
| |
| |
| |
| framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames |
| |
| if use_real_poses and cam_data is not None and 'extrinsic' in cam_data: |
| print("🔧 Using real Sekai camera data") |
| cam_extrinsic = cam_data['extrinsic'] |
| |
| |
| max_needed_frames = max( |
| start_frame + initial_condition_frames + new_frames, |
| framepack_needed_frames, |
| 30 |
| ) |
| |
| print(f"🔧 Calculating Sekai camera sequence length:") |
| print(f" - Basic requirement: {start_frame + initial_condition_frames + new_frames}") |
| print(f" - FramePack requirement: {framepack_needed_frames}") |
| print(f" - Final generation: {max_needed_frames}") |
| |
| relative_poses = [] |
| for i in range(max_needed_frames): |
| |
| frame_idx = i * time_compression_ratio |
| next_frame_idx = frame_idx + time_compression_ratio |
| |
| if next_frame_idx < len(cam_extrinsic): |
| cam_prev = cam_extrinsic[frame_idx] |
| cam_next = cam_extrinsic[next_frame_idx] |
| relative_pose = compute_relative_pose(cam_prev, cam_next) |
| relative_poses.append(torch.as_tensor(relative_pose[:3, :])) |
| else: |
| |
| print(f"⚠️ Frame {frame_idx} exceeds camera data range, using zero motion") |
| relative_poses.append(torch.zeros(3, 4)) |
| |
| pose_embedding = torch.stack(relative_poses, dim=0) |
| pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)') |
| |
| |
| mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32) |
| |
| condition_end = min(start_frame + initial_condition_frames, max_needed_frames) |
| mask[start_frame:condition_end] = 1.0 |
| |
| camera_embedding = torch.cat([pose_embedding, mask], dim=1) |
| print(f"🔧 Sekai real camera embedding shape: {camera_embedding.shape}") |
| return camera_embedding.to(torch.bfloat16) |
| |
| else: |
| |
| max_needed_frames = max( |
| start_frame + initial_condition_frames + new_frames, |
| framepack_needed_frames, |
| 30) |
| |
| print(f"🔧 Generating Sekai synthetic camera frames: {max_needed_frames}") |
| |
| CONDITION_FRAMES = initial_condition_frames |
| STAGE_1 = new_frames//2 |
| STAGE_2 = new_frames - STAGE_1 |
| |
| if direction=="forward": |
| print("--------------- FORWARD MODE ---------------") |
| relative_poses = [] |
| for i in range(max_needed_frames): |
| if i < CONDITION_FRAMES: |
| |
| pose = np.eye(4, dtype=np.float32) |
| elif i < CONDITION_FRAMES+STAGE_1+STAGE_2: |
| |
| forward_speed = 0.03 |
|
|
| pose = np.eye(4, dtype=np.float32) |
| pose[2, 3] = -forward_speed |
| else: |
| |
| pose = np.eye(4, dtype=np.float32) |
| |
| relative_pose = pose[:3, :] |
| relative_poses.append(torch.as_tensor(relative_pose)) |
| |
| elif direction=="left": |
| print("--------------- LEFT TURNING MODE ---------------") |
| relative_poses = [] |
| for i in range(max_needed_frames): |
| if i < CONDITION_FRAMES: |
| |
| pose = np.eye(4, dtype=np.float32) |
| elif i < CONDITION_FRAMES+STAGE_1+STAGE_2: |
| |
| yaw_per_frame = 0.03 |
|
|
| |
| cos_yaw = np.cos(yaw_per_frame) |
| sin_yaw = np.sin(yaw_per_frame) |
| |
| |
| forward_speed = 0.00 |
|
|
| pose = np.eye(4, dtype=np.float32) |
| |
| pose[0, 0] = cos_yaw |
| pose[0, 2] = sin_yaw |
| pose[2, 0] = -sin_yaw |
| pose[2, 2] = cos_yaw |
| pose[2, 3] = -forward_speed |
| else: |
| |
| pose = np.eye(4, dtype=np.float32) |
| |
| relative_pose = pose[:3, :] |
| relative_poses.append(torch.as_tensor(relative_pose)) |
| |
| elif direction=="right": |
| print("--------------- RIGHT TURNING MODE ---------------") |
| relative_poses = [] |
| for i in range(max_needed_frames): |
| if i < CONDITION_FRAMES: |
| |
| pose = np.eye(4, dtype=np.float32) |
| elif i < CONDITION_FRAMES+STAGE_1+STAGE_2: |
| |
| yaw_per_frame = -0.03 |
|
|
| |
| cos_yaw = np.cos(yaw_per_frame) |
| sin_yaw = np.sin(yaw_per_frame) |
| |
| |
| forward_speed = 0.00 |
|
|
| pose = np.eye(4, dtype=np.float32) |
| |
| pose[0, 0] = cos_yaw |
| pose[0, 2] = sin_yaw |
| pose[2, 0] = -sin_yaw |
| pose[2, 2] = cos_yaw |
| pose[2, 3] = -forward_speed |
| else: |
| |
| pose = np.eye(4, dtype=np.float32) |
| |
| relative_pose = pose[:3, :] |
| relative_poses.append(torch.as_tensor(relative_pose)) |
| |
| elif direction=="forward_left": |
| print("--------------- FORWARD LEFT MODE ---------------") |
| relative_poses = [] |
| for i in range(max_needed_frames): |
| if i < CONDITION_FRAMES: |
| |
| pose = np.eye(4, dtype=np.float32) |
| elif i < CONDITION_FRAMES+STAGE_1+STAGE_2: |
| |
| yaw_per_frame = 0.03 |
|
|
| |
| cos_yaw = np.cos(yaw_per_frame) |
| sin_yaw = np.sin(yaw_per_frame) |
| |
| |
| forward_speed = 0.03 |
|
|
| pose = np.eye(4, dtype=np.float32) |
| |
| pose[0, 0] = cos_yaw |
| pose[0, 2] = sin_yaw |
| pose[2, 0] = -sin_yaw |
| pose[2, 2] = cos_yaw |
| pose[2, 3] = -forward_speed |
| |
| else: |
| |
| pose = np.eye(4, dtype=np.float32) |
| |
| relative_pose = pose[:3, :] |
| relative_poses.append(torch.as_tensor(relative_pose)) |
| |
| elif direction=="forward_right": |
| print("--------------- FORWARD RIGHT MODE ---------------") |
| relative_poses = [] |
| for i in range(max_needed_frames): |
| if i < CONDITION_FRAMES: |
| |
| pose = np.eye(4, dtype=np.float32) |
| elif i < CONDITION_FRAMES+STAGE_1+STAGE_2: |
| |
| yaw_per_frame = -0.03 |
|
|
| |
| cos_yaw = np.cos(yaw_per_frame) |
| sin_yaw = np.sin(yaw_per_frame) |
| |
| |
| forward_speed = 0.03 |
|
|
| pose = np.eye(4, dtype=np.float32) |
| |
| pose[0, 0] = cos_yaw |
| pose[0, 2] = sin_yaw |
| pose[2, 0] = -sin_yaw |
| pose[2, 2] = cos_yaw |
| pose[2, 3] = -forward_speed |
| |
| else: |
| |
| pose = np.eye(4, dtype=np.float32) |
| |
| relative_pose = pose[:3, :] |
| relative_poses.append(torch.as_tensor(relative_pose)) |
| |
| elif direction=="s_curve": |
| print("--------------- S CURVE MODE ---------------") |
| relative_poses = [] |
| for i in range(max_needed_frames): |
| if i < CONDITION_FRAMES: |
| |
| pose = np.eye(4, dtype=np.float32) |
| elif i < CONDITION_FRAMES+STAGE_1: |
| |
| yaw_per_frame = 0.03 |
|
|
| |
| cos_yaw = np.cos(yaw_per_frame) |
| sin_yaw = np.sin(yaw_per_frame) |
| |
| |
| forward_speed = 0.03 |
|
|
| pose = np.eye(4, dtype=np.float32) |
| |
| pose[0, 0] = cos_yaw |
| pose[0, 2] = sin_yaw |
| pose[2, 0] = -sin_yaw |
| pose[2, 2] = cos_yaw |
| pose[2, 3] = -forward_speed |
| |
| elif i < CONDITION_FRAMES+STAGE_1+STAGE_2: |
| |
| yaw_per_frame = -0.03 |
|
|
| |
| cos_yaw = np.cos(yaw_per_frame) |
| sin_yaw = np.sin(yaw_per_frame) |
| |
| |
| forward_speed = 0.03 |
| |
| if i < CONDITION_FRAMES+STAGE_1+STAGE_2//3: |
| radius_shift = -0.01 |
| else: |
| radius_shift = 0.00 |
|
|
| pose = np.eye(4, dtype=np.float32) |
| |
| pose[0, 0] = cos_yaw |
| pose[0, 2] = sin_yaw |
| pose[2, 0] = -sin_yaw |
| pose[2, 2] = cos_yaw |
| pose[2, 3] = -forward_speed |
| pose[0, 3] = radius_shift |
| |
| else: |
| |
| pose = np.eye(4, dtype=np.float32) |
| |
| relative_pose = pose[:3, :] |
| relative_poses.append(torch.as_tensor(relative_pose)) |
| |
| elif direction=="left_right": |
| print("--------------- LEFT RIGHT MODE ---------------") |
| relative_poses = [] |
| for i in range(max_needed_frames): |
| if i < CONDITION_FRAMES: |
| |
| pose = np.eye(4, dtype=np.float32) |
| elif i < CONDITION_FRAMES+STAGE_1: |
| |
| yaw_per_frame = 0.03 |
|
|
| |
| cos_yaw = np.cos(yaw_per_frame) |
| sin_yaw = np.sin(yaw_per_frame) |
| |
| |
| forward_speed = 0.00 |
|
|
| pose = np.eye(4, dtype=np.float32) |
| |
| pose[0, 0] = cos_yaw |
| pose[0, 2] = sin_yaw |
| pose[2, 0] = -sin_yaw |
| pose[2, 2] = cos_yaw |
| pose[2, 3] = -forward_speed |
| |
| elif i < CONDITION_FRAMES+STAGE_1+STAGE_2: |
| |
| yaw_per_frame = -0.03 |
|
|
| |
| cos_yaw = np.cos(yaw_per_frame) |
| sin_yaw = np.sin(yaw_per_frame) |
| |
| |
| forward_speed = 0.00 |
|
|
| pose = np.eye(4, dtype=np.float32) |
| |
| pose[0, 0] = cos_yaw |
| pose[0, 2] = sin_yaw |
| pose[2, 0] = -sin_yaw |
| pose[2, 2] = cos_yaw |
| pose[2, 3] = -forward_speed |
| |
| else: |
| |
| pose = np.eye(4, dtype=np.float32) |
| |
| relative_pose = pose[:3, :] |
| relative_poses.append(torch.as_tensor(relative_pose)) |
| |
| else: |
| raise ValueError(f"Not Defined Direction: {direction}") |
| |
| pose_embedding = torch.stack(relative_poses, dim=0) |
| pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)') |
| |
| |
| mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32) |
| condition_end = min(start_frame + initial_condition_frames + 1, max_needed_frames) |
| mask[start_frame:condition_end] = 1.0 |
| |
| camera_embedding = torch.cat([pose_embedding, mask], dim=1) |
| print(f"🔧 Sekai synthetic camera embedding shape: {camera_embedding.shape}") |
| return camera_embedding.to(torch.bfloat16) |
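|
| # Layout note: each row of the returned Sekai embedding holds 13 values, a flattened 3x4 |
| # relative pose (9 rotation + 3 translation entries) followed by a 0/1 condition mask, |
| # matching the 13-dim "sekai" input dimension used in add_moe_components. |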
|
|
|
|
| def generate_openx_camera_embeddings_sliding( |
| encoded_data, start_frame, initial_condition_frames, new_frames, use_real_poses): |
| """Generate camera embeddings for OpenX dataset - sliding window version""" |
| time_compression_ratio = 4 |
| |
| |
| framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames |
| |
| if use_real_poses and encoded_data is not None and 'cam_emb' in encoded_data and 'extrinsic' in encoded_data['cam_emb']: |
| print("🔧 Using OpenX real camera data") |
| cam_extrinsic = encoded_data['cam_emb']['extrinsic'] |
| |
| |
| max_needed_frames = max( |
| start_frame + initial_condition_frames + new_frames, |
| framepack_needed_frames, |
| 30 |
| ) |
| |
| print(f"🔧 Calculating OpenX camera sequence length:") |
| print(f" - Basic requirement: {start_frame + initial_condition_frames + new_frames}") |
| print(f" - FramePack requirement: {framepack_needed_frames}") |
| print(f" - Final generation: {max_needed_frames}") |
| |
| relative_poses = [] |
| for i in range(max_needed_frames): |
| |
| frame_idx = i * time_compression_ratio |
| next_frame_idx = frame_idx + time_compression_ratio |
| |
| if next_frame_idx < len(cam_extrinsic): |
| cam_prev = cam_extrinsic[frame_idx] |
| cam_next = cam_extrinsic[next_frame_idx] |
| relative_pose = compute_relative_pose(cam_prev, cam_next) |
| relative_poses.append(torch.as_tensor(relative_pose[:3, :])) |
| else: |
| |
| print(f"⚠️ Frame {frame_idx} exceeds OpenX camera data range, using zero motion") |
| relative_poses.append(torch.zeros(3, 4)) |
| |
| pose_embedding = torch.stack(relative_poses, dim=0) |
| pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)') |
| |
| |
| mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32) |
| |
| condition_end = min(start_frame + initial_condition_frames, max_needed_frames) |
| mask[start_frame:condition_end] = 1.0 |
| |
| camera_embedding = torch.cat([pose_embedding, mask], dim=1) |
| print(f"🔧 OpenX real camera embedding shape: {camera_embedding.shape}") |
| return camera_embedding.to(torch.bfloat16) |
| |
| else: |
| print("🔧 Using OpenX synthetic camera data") |
| |
| max_needed_frames = max( |
| start_frame + initial_condition_frames + new_frames, |
| framepack_needed_frames, |
| 30 |
| ) |
| |
| print(f"🔧 Generating OpenX synthetic camera frames: {max_needed_frames}") |
| relative_poses = [] |
| for i in range(max_needed_frames): |
| |
| |
| roll_per_frame = 0.02 |
| pitch_per_frame = 0.01 |
| yaw_per_frame = 0.015 |
| forward_speed = 0.003 |
| |
| pose = np.eye(4, dtype=np.float32) |
| |
| |
| |
| cos_roll = np.cos(roll_per_frame) |
| sin_roll = np.sin(roll_per_frame) |
| |
| cos_pitch = np.cos(pitch_per_frame) |
| sin_pitch = np.sin(pitch_per_frame) |
| |
| cos_yaw = np.cos(yaw_per_frame) |
| sin_yaw = np.sin(yaw_per_frame) |
| |
| |
| pose[0, 0] = cos_yaw * cos_pitch |
| pose[0, 1] = cos_yaw * sin_pitch * sin_roll - sin_yaw * cos_roll |
| pose[0, 2] = cos_yaw * sin_pitch * cos_roll + sin_yaw * sin_roll |
| pose[1, 0] = sin_yaw * cos_pitch |
| pose[1, 1] = sin_yaw * sin_pitch * sin_roll + cos_yaw * cos_roll |
| pose[1, 2] = sin_yaw * sin_pitch * cos_roll - cos_yaw * sin_roll |
| pose[2, 0] = -sin_pitch |
| pose[2, 1] = cos_pitch * sin_roll |
| pose[2, 2] = cos_pitch * cos_roll |
| |
| |
| pose[0, 3] = forward_speed * 0.5 |
| pose[1, 3] = forward_speed * 0.3 |
| pose[2, 3] = -forward_speed |
| |
| relative_pose = pose[:3, :] |
| relative_poses.append(torch.as_tensor(relative_pose)) |
| |
| pose_embedding = torch.stack(relative_poses, dim=0) |
| pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)') |
| |
| |
| mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32) |
| condition_end = min(start_frame + initial_condition_frames, max_needed_frames) |
| mask[start_frame:condition_end] = 1.0 |
| |
| camera_embedding = torch.cat([pose_embedding, mask], dim=1) |
| print(f"🔧 OpenX synthetic camera embedding shape: {camera_embedding.shape}") |
| return camera_embedding.to(torch.bfloat16) |
|
|
|
|
| def generate_nuscenes_camera_embeddings_sliding( |
| scene_info, start_frame, initial_condition_frames, new_frames): |
| """ |
| Generate camera embeddings for NuScenes dataset - sliding window version |
| |
| Pose construction follows the same conventions as train_moe.py. |
| """ |
| time_compression_ratio = 4 |
| |
| |
| framepack_needed_frames = 1 + 16 + 2 + 1 + new_frames |
| |
| if scene_info is not None and 'keyframe_poses' in scene_info: |
| print("🔧 Using NuScenes real pose data") |
| keyframe_poses = scene_info['keyframe_poses'] |
| |
| if len(keyframe_poses) == 0: |
| print("⚠️ NuScenes keyframe_poses is empty, using zero pose") |
| max_needed_frames = max(framepack_needed_frames, 30) |
| |
| pose_sequence = torch.zeros(max_needed_frames, 7, dtype=torch.float32) |
| |
| mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32) |
| condition_end = min(start_frame + initial_condition_frames, max_needed_frames) |
| mask[start_frame:condition_end] = 1.0 |
| |
| camera_embedding = torch.cat([pose_sequence, mask], dim=1) |
| print(f"🔧 NuScenes zero pose embedding shape: {camera_embedding.shape}") |
| return camera_embedding.to(torch.bfloat16) |
| |
| |
| reference_pose = keyframe_poses[0] |
| |
| max_needed_frames = max(framepack_needed_frames, 30) |
| |
| pose_vecs = [] |
| for i in range(max_needed_frames): |
| if i < len(keyframe_poses): |
| current_pose = keyframe_poses[i] |
| |
| |
| translation = torch.tensor( |
| np.array(current_pose['translation']) - np.array(reference_pose['translation']), |
| dtype=torch.float32 |
| ) |
| |
| |
| rotation = torch.tensor(current_pose['rotation'], dtype=torch.float32) |
| |
| pose_vec = torch.cat([translation, rotation], dim=0) |
| else: |
| |
| pose_vec = torch.cat([ |
| torch.zeros(3, dtype=torch.float32), |
| torch.tensor([1.0, 0.0, 0.0, 0.0], dtype=torch.float32) |
| ], dim=0) |
| |
| pose_vecs.append(pose_vec) |
| |
| pose_sequence = torch.stack(pose_vecs, dim=0) |
| |
| |
| mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32) |
| condition_end = min(start_frame + initial_condition_frames, max_needed_frames) |
| mask[start_frame:condition_end] = 1.0 |
| |
| camera_embedding = torch.cat([pose_sequence, mask], dim=1) |
| print(f"🔧 NuScenes real pose embedding shape: {camera_embedding.shape}") |
| return camera_embedding.to(torch.bfloat16) |
| |
| else: |
| print("🔧 Using NuScenes synthetic pose data") |
| max_needed_frames = max(framepack_needed_frames, 30) |
| |
| |
| pose_vecs = [] |
| for i in range(max_needed_frames): |
| |
| angle = i * 0.04 |
| radius = 15.0 |
| |
| |
| x = radius * np.sin(angle) |
| y = 0.0 |
| z = radius * (1 - np.cos(angle)) |
| |
| translation = torch.tensor([x, y, z], dtype=torch.float32) |
| |
| |
| yaw = angle + np.pi/2 |
| |
| rotation = torch.tensor([ |
| np.cos(yaw/2), |
| 0.0, |
| 0.0, |
| np.sin(yaw/2) |
| ], dtype=torch.float32) |
| |
| pose_vec = torch.cat([translation, rotation], dim=0) |
| pose_vecs.append(pose_vec) |
| |
| pose_sequence = torch.stack(pose_vecs, dim=0) |
| |
| |
| mask = torch.zeros(max_needed_frames, 1, dtype=torch.float32) |
| condition_end = min(start_frame + initial_condition_frames, max_needed_frames) |
| mask[start_frame:condition_end] = 1.0 |
| |
| camera_embedding = torch.cat([pose_sequence, mask], dim=1) |
| print(f"🔧 NuScenes synthetic left turn pose embedding shape: {camera_embedding.shape}") |
| return camera_embedding.to(torch.bfloat16) |
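|
| # Layout note: each NuScenes row holds 8 values, a 7-dim pose (xyz translation relative to |
| # the first keyframe plus a w-first quaternion) followed by the 0/1 condition mask, matching |
| # the 8-dim "nuscenes" input dimension used in add_moe_components. |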
|
|
| def prepare_framepack_sliding_window_with_camera_moe( |
| history_latents, |
| target_frames_to_generate, |
| camera_embedding_full, |
| start_frame, |
| modality_type, |
| max_history_frames=49): |
| """FramePack sliding window mechanism - MoE version""" |
| |
| C, T, H, W = history_latents.shape |
| |
| |
| |
| total_indices_length = 1 + 16 + 2 + 1 + target_frames_to_generate |
| indices = torch.arange(0, total_indices_length) |
| split_sizes = [1, 16, 2, 1, target_frames_to_generate] |
| clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = \ |
| indices.split(split_sizes, dim=0) |
| clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=0) |
| |
| |
| if camera_embedding_full.shape[0] < total_indices_length: |
| print(f"⚠️ camera_embedding length insufficient, performing zero padding: current length {camera_embedding_full.shape[0]}, required length {total_indices_length}") |
| shortage = total_indices_length - camera_embedding_full.shape[0] |
| padding = torch.zeros(shortage, camera_embedding_full.shape[1], |
| dtype=camera_embedding_full.dtype, device=camera_embedding_full.device) |
| camera_embedding_full = torch.cat([camera_embedding_full, padding], dim=0) |
| |
| |
| combined_camera = torch.zeros( |
| total_indices_length, |
| camera_embedding_full.shape[1], |
| dtype=camera_embedding_full.dtype, |
| device=camera_embedding_full.device) |
| |
| |
| history_slice = camera_embedding_full[max(T - 19, 0):T, :].clone() |
| combined_camera[19 - history_slice.shape[0]:19, :] = history_slice |
| |
| |
| target_slice = camera_embedding_full[T:T + target_frames_to_generate, :].clone() |
| combined_camera[19:19 + target_slice.shape[0], :] = target_slice |
| |
| |
| combined_camera[:, -1] = 0.0 |
| |
| |
| if T > 0: |
| available_frames = min(T, 19) |
| start_pos = 19 - available_frames |
| combined_camera[start_pos:19, -1] = 1.0 |
| |
| print(f"🔧 MoE Camera mask update:") |
| print(f" - History frames: {T}") |
| print(f" - Valid condition frames: {available_frames if T > 0 else 0}") |
| print(f" - Modality type: {modality_type}") |
| |
| |
| clean_latents_combined = torch.zeros(C, 19, H, W, dtype=history_latents.dtype, device=history_latents.device) |
| |
| if T > 0: |
| available_frames = min(T, 19) |
| start_pos = 19 - available_frames |
| clean_latents_combined[:, start_pos:, :, :] = history_latents[:, -available_frames:, :, :] |
| |
| clean_latents_4x = clean_latents_combined[:, 0:16, :, :] |
| clean_latents_2x = clean_latents_combined[:, 16:18, :, :] |
| clean_latents_1x = clean_latents_combined[:, 18:19, :, :] |
| |
| if T > 0: |
| start_latent = history_latents[:, 0:1, :, :] |
| else: |
| start_latent = torch.zeros(C, 1, H, W, dtype=history_latents.dtype, device=history_latents.device) |
| |
| clean_latents = torch.cat([start_latent, clean_latents_1x], dim=1) |
| |
| return { |
| 'latent_indices': latent_indices, |
| 'clean_latents': clean_latents, |
| 'clean_latents_2x': clean_latents_2x, |
| 'clean_latents_4x': clean_latents_4x, |
| 'clean_latent_indices': clean_latent_indices, |
| 'clean_latent_2x_indices': clean_latent_2x_indices, |
| 'clean_latent_4x_indices': clean_latent_4x_indices, |
| 'camera_embedding': combined_camera, |
| 'modality_type': modality_type, |
| 'current_length': T, |
| 'next_length': T + target_frames_to_generate |
| } |
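|
| # Index layout sketch for one window (e.g. target_frames_to_generate = 4): |
| #   position:  0      1..16      17..18     19        20..23 |
| #   role:      start  clean 4x   clean 2x   clean 1x  frames being denoised |
| # clean_latent_indices concatenates position 0 with position 19, mirroring how clean_latents |
| # stacks the first history frame with the most recent one. |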
|
|
| def overlay_controls(frame_img, pose_vec, icons): |
| """ |
| Overlay control icons (WASD and arrow keys) on the frame based on the camera pose. |
| pose_vec: 12 elements (flattened 3x4 matrix) + mask |
| """ |
| if pose_vec is None or np.all(pose_vec[:12] == 0): |
| return frame_img |
| |
| |
| |
| tx = pose_vec[3] |
| |
| tz = pose_vec[11] |
| |
| |
| |
| r00 = pose_vec[0] |
| r02 = pose_vec[2] |
| yaw = np.arctan2(r02, r00) |
| |
| |
| r12 = pose_vec[6] |
| r22 = pose_vec[10] |
| pitch = np.arctan2(-r12, r22) |
| |
| |
| TRANS_THRESH = 0.01 |
| ROT_THRESH = 0.005 |
| |
| |
| |
| |
| is_forward = tz < -TRANS_THRESH |
| is_backward = tz > TRANS_THRESH |
| is_left = tx < -TRANS_THRESH |
| is_right = tx > TRANS_THRESH |
| |
| |
| |
| is_turn_left = yaw > ROT_THRESH |
| is_turn_right = yaw < -ROT_THRESH |
| |
| |
| is_turn_up = pitch < -ROT_THRESH |
| is_turn_down = pitch > ROT_THRESH |
| |
| W, H = frame_img.size |
| spacing = 60 |
| |
| def paste_icon(name_active, name_inactive, is_active, x, y): |
| name = name_active if is_active else name_inactive |
| if name in icons: |
| icon = icons[name] |
| |
| frame_img.paste(icon, (int(x), int(y)), icon) |
| |
| |
| base_x_right = 100 |
| base_y = H - 100 |
| |
| |
| paste_icon('move_forward.png', 'not_move_forward.png', is_forward, base_x_right, base_y - spacing) |
| |
| paste_icon('move_left.png', 'not_move_left.png', is_left, base_x_right - spacing, base_y) |
| |
| paste_icon('move_backward.png', 'not_move_backward.png', is_backward, base_x_right, base_y) |
| |
| paste_icon('move_right.png', 'not_move_right.png', is_right, base_x_right + spacing, base_y) |
| |
| |
| base_x_left = W - 150 |
| |
| |
| paste_icon('turn_up.png', 'not_turn_up.png', is_turn_up, base_x_left, base_y - spacing) |
| |
| paste_icon('turn_left.png', 'not_turn_left.png', is_turn_left, base_x_left - spacing, base_y) |
| |
| paste_icon('turn_down.png', 'not_turn_down.png', is_turn_down, base_x_left, base_y) |
| |
| paste_icon('turn_right.png', 'not_turn_right.png', is_turn_right, base_x_left + spacing, base_y) |
| |
| return frame_img |
|
|
|
|
| def inference_moe_framepack_sliding_window( |
| condition_pth_path=None, |
| condition_video=None, |
| condition_image=None, |
| dit_path=None, |
| wan_model_path=None, |
| output_path="../examples/output_videos/output_moe_framepack_sliding.mp4", |
| start_frame=0, |
| initial_condition_frames=8, |
| frames_per_generation=4, |
| total_frames_to_generate=32, |
| max_history_frames=49, |
| device="cuda", |
| prompt="A video of a scene shot using a pedestrian's front camera while walking", |
| modality_type="sekai", |
| use_real_poses=True, |
| scene_info_path=None, |
| |
| use_camera_cfg=True, |
| camera_guidance_scale=2.0, |
| text_guidance_scale=1.0, |
| |
| moe_num_experts=4, |
| moe_top_k=2, |
| moe_hidden_dim=None, |
| direction="left", |
| use_gt_prompt=True, |
| add_icons=False |
| ): |
| """ |
| MoE FramePack sliding window video generation - multi-modal support |
| """ |
| |
| dir_path = os.path.dirname(output_path) |
| os.makedirs(dir_path, exist_ok=True) |
| |
| print(f"🔧 Starting MoE FramePack sliding window generation...") |
| print(f" Modality type: {modality_type}") |
| print(f" Camera CFG: {use_camera_cfg}, Camera guidance scale: {camera_guidance_scale}") |
| print(f" Text guidance scale: {text_guidance_scale}") |
| print(f" MoE config: experts={moe_num_experts}, top_k={moe_top_k}") |
| |
| |
| replace_dit_model_in_manager() |
| |
| model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu") |
| model_manager.load_models([ |
| os.path.join(wan_model_path, "diffusion_pytorch_model.safetensors"), |
| os.path.join(wan_model_path, "models_t5_umt5-xxl-enc-bf16.pth"), |
| os.path.join(wan_model_path, "Wan2.1_VAE.pth"), |
| ]) |
| pipe = WanVideoAstraPipeline.from_model_manager(model_manager, device="cuda") |
|
|
| |
| dim = pipe.dit.blocks[0].self_attn.q.weight.shape[0] |
| for block in pipe.dit.blocks: |
| block.cam_encoder = nn.Linear(13, dim) |
| block.projector = nn.Linear(dim, dim) |
| block.cam_encoder.weight.data.zero_() |
| block.cam_encoder.bias.data.zero_() |
| block.projector.weight = nn.Parameter(torch.eye(dim)) |
| block.projector.bias = nn.Parameter(torch.zeros(dim)) |
| |
| |
| add_framepack_components(pipe.dit) |
| |
| |
| moe_config = { |
| "num_experts": moe_num_experts, |
| "top_k": moe_top_k, |
| "hidden_dim": moe_hidden_dim or dim * 2, |
| "sekai_input_dim": 13, |
| "nuscenes_input_dim": 8, |
| "openx_input_dim": 13 |
| } |
| add_moe_components(pipe.dit, moe_config) |
| |
| |
| dit_state_dict = torch.load(dit_path, map_location="cpu") |
| pipe.dit.load_state_dict(dit_state_dict, strict=False) |
| pipe = pipe.to(device) |
| model_dtype = next(pipe.dit.parameters()).dtype |
| |
| if hasattr(pipe.dit, 'clean_x_embedder'): |
| pipe.dit.clean_x_embedder = pipe.dit.clean_x_embedder.to(dtype=model_dtype) |
| |
| |
| pipe.scheduler.set_timesteps(50) |
| |
| |
| print("Loading initial condition frames...") |
| initial_latents, encoded_data = load_or_encode_condition( |
| condition_pth_path, |
| condition_video, |
| condition_image, |
| start_frame, |
| initial_condition_frames, |
| device, |
| pipe, |
| ) |
| |
| |
| target_height, target_width = 60, 104 |
| C, T, H, W = initial_latents.shape |
| |
| if H > target_height or W > target_width: |
| h_start = (H - target_height) // 2 |
| w_start = (W - target_width) // 2 |
| initial_latents = initial_latents[:, :, h_start:h_start+target_height, w_start:w_start+target_width] |
| H, W = target_height, target_width |
| |
| history_latents = initial_latents.to(device, dtype=model_dtype) |
|
|
| print(f"Initial history_latents shape: {history_latents.shape}") |
| |
| |
| if use_gt_prompt and 'prompt_emb' in encoded_data: |
| print("✅ Using pre-encoded GT prompt embedding") |
| prompt_emb_pos = encoded_data['prompt_emb'] |
| |
| if 'context' in prompt_emb_pos: |
| prompt_emb_pos['context'] = prompt_emb_pos['context'].to(device, dtype=model_dtype) |
| if 'context_mask' in prompt_emb_pos: |
| prompt_emb_pos['context_mask'] = prompt_emb_pos['context_mask'].to(device, dtype=model_dtype) |
| |
| |
| if text_guidance_scale > 1.0: |
| prompt_emb_neg = pipe.encode_prompt("") |
| print(f"Using Text CFG with GT prompt, guidance scale: {text_guidance_scale}") |
| else: |
| prompt_emb_neg = None |
| print("Not using Text CFG") |
| |
| |
| if 'prompt' in encoded_data['prompt_emb']: |
| gt_prompt_text = encoded_data['prompt_emb']['prompt'] |
| print(f"📝 GT Prompt text: {gt_prompt_text}") |
| else: |
| |
| print(f"🔄 Re-encoding prompt: {prompt}") |
| if text_guidance_scale > 1.0: |
| prompt_emb_pos = pipe.encode_prompt(prompt) |
| prompt_emb_neg = pipe.encode_prompt("") |
| print(f"Using Text CFG, guidance scale: {text_guidance_scale}") |
| else: |
| prompt_emb_pos = pipe.encode_prompt(prompt) |
| prompt_emb_neg = None |
| print("Not using Text CFG") |
| |
| |
| scene_info = None |
| if modality_type == "nuscenes" and scene_info_path and os.path.exists(scene_info_path): |
| with open(scene_info_path, 'r') as f: |
| scene_info = json.load(f) |
| print(f"Loading NuScenes scene information: {scene_info_path}") |
| |
| |
| if modality_type == "sekai": |
| camera_embedding_full = generate_sekai_camera_embeddings_sliding( |
| encoded_data.get('cam_emb', None), |
| start_frame, |
| initial_condition_frames, |
| total_frames_to_generate, |
| 0, |
| use_real_poses=use_real_poses, |
| direction=direction |
| ).to(device, dtype=model_dtype) |
| elif modality_type == "nuscenes": |
| camera_embedding_full = generate_nuscenes_camera_embeddings_sliding( |
| scene_info, |
| start_frame, |
| initial_condition_frames, |
| total_frames_to_generate |
| ).to(device, dtype=model_dtype) |
| elif modality_type == "openx": |
| camera_embedding_full = generate_openx_camera_embeddings_sliding( |
| encoded_data, |
| start_frame, |
| initial_condition_frames, |
| total_frames_to_generate, |
| use_real_poses=use_real_poses |
| ).to(device, dtype=model_dtype) |
| else: |
| raise ValueError(f"Unsupported modality type: {modality_type}") |
| |
| print(f"Complete camera sequence shape: {camera_embedding_full.shape}") |
| |
| |
| if use_camera_cfg: |
| camera_embedding_uncond = torch.zeros_like(camera_embedding_full) |
| print(f"Creating unconditional camera embedding for CFG") |
| |
| |
| total_generated = 0 |
| all_generated_frames = [] |
| |
| while total_generated < total_frames_to_generate: |
| current_generation = min(frames_per_generation, total_frames_to_generate - total_generated) |
| print(f"\nGeneration step {total_generated // frames_per_generation + 1}") |
| print(f"Current history length: {history_latents.shape[1]}, generating: {current_generation}") |
| |
| |
| framepack_data = prepare_framepack_sliding_window_with_camera_moe( |
| history_latents, |
| current_generation, |
| camera_embedding_full, |
| start_frame, |
| modality_type, |
| max_history_frames |
| ) |
| |
| |
| clean_latents = framepack_data['clean_latents'].unsqueeze(0) |
| clean_latents_2x = framepack_data['clean_latents_2x'].unsqueeze(0) |
| clean_latents_4x = framepack_data['clean_latents_4x'].unsqueeze(0) |
| camera_embedding = framepack_data['camera_embedding'].unsqueeze(0) |
| |
| |
| modality_inputs = {modality_type: camera_embedding} |
| |
| |
| if use_camera_cfg: |
| camera_embedding_uncond_batch = camera_embedding_uncond[:camera_embedding.shape[1], :].unsqueeze(0) |
| modality_inputs_uncond = {modality_type: camera_embedding_uncond_batch} |
| |
| |
| latent_indices = framepack_data['latent_indices'].unsqueeze(0).cpu() |
| clean_latent_indices = framepack_data['clean_latent_indices'].unsqueeze(0).cpu() |
| clean_latent_2x_indices = framepack_data['clean_latent_2x_indices'].unsqueeze(0).cpu() |
| clean_latent_4x_indices = framepack_data['clean_latent_4x_indices'].unsqueeze(0).cpu() |
| |
| |
| new_latents = torch.randn( |
| 1, C, current_generation, H, W, |
| device=device, dtype=model_dtype |
| ) |
| |
| extra_input = pipe.prepare_extra_input(new_latents) |
| |
| print(f"Camera embedding shape: {camera_embedding.shape}") |
| print(f"Camera mask distribution - condition: {torch.sum(camera_embedding[0, :, -1] == 1.0).item()}, target: {torch.sum(camera_embedding[0, :, -1] == 0.0).item()}") |
| |
| |
| timesteps = pipe.scheduler.timesteps |
| |
| for i, timestep in enumerate(timesteps): |
| if i % 10 == 0: |
| print(f" Denoising step {i+1}/{len(timesteps)}") |
| |
| timestep_tensor = timestep.unsqueeze(0).to(device, dtype=model_dtype) |
| |
| with torch.no_grad(): |
| |
| if use_camera_cfg and camera_guidance_scale > 1.0: |
| |
| noise_pred_cond, moe_loss = pipe.dit( |
| new_latents, |
| timestep=timestep_tensor, |
| cam_emb=camera_embedding, |
| modality_inputs=modality_inputs, |
| latent_indices=latent_indices, |
| clean_latents=clean_latents, |
| clean_latent_indices=clean_latent_indices, |
| clean_latents_2x=clean_latents_2x, |
| clean_latent_2x_indices=clean_latent_2x_indices, |
| clean_latents_4x=clean_latents_4x, |
| clean_latent_4x_indices=clean_latent_4x_indices, |
| **prompt_emb_pos, |
| **extra_input |
| ) |
| |
| |
| noise_pred_uncond, moe_loss = pipe.dit( |
| new_latents, |
| timestep=timestep_tensor, |
| cam_emb=camera_embedding_uncond_batch, |
| modality_inputs=modality_inputs_uncond, |
| latent_indices=latent_indices, |
| clean_latents=clean_latents, |
| clean_latent_indices=clean_latent_indices, |
| clean_latents_2x=clean_latents_2x, |
| clean_latent_2x_indices=clean_latent_2x_indices, |
| clean_latents_4x=clean_latents_4x, |
| clean_latent_4x_indices=clean_latent_4x_indices, |
| **(prompt_emb_neg if prompt_emb_neg else prompt_emb_pos), |
| **extra_input |
| ) |
| |
| |
| noise_pred = noise_pred_uncond + camera_guidance_scale * (noise_pred_cond - noise_pred_uncond) |
| |
| |
| if text_guidance_scale > 1.0 and prompt_emb_neg: |
| noise_pred_text_uncond, moe_loss = pipe.dit( |
| new_latents, |
| timestep=timestep_tensor, |
| cam_emb=camera_embedding, |
| modality_inputs=modality_inputs, |
| latent_indices=latent_indices, |
| clean_latents=clean_latents, |
| clean_latent_indices=clean_latent_indices, |
| clean_latents_2x=clean_latents_2x, |
| clean_latent_2x_indices=clean_latent_2x_indices, |
| clean_latents_4x=clean_latents_4x, |
| clean_latent_4x_indices=clean_latent_4x_indices, |
| **prompt_emb_neg, |
| **extra_input |
| ) |
| |
| |
| noise_pred = noise_pred_text_uncond + text_guidance_scale * (noise_pred - noise_pred_text_uncond) |
| |
| elif text_guidance_scale > 1.0 and prompt_emb_neg: |
| |
| noise_pred_cond, moe_loss = pipe.dit( |
| new_latents, |
| timestep=timestep_tensor, |
| cam_emb=camera_embedding, |
| modality_inputs=modality_inputs, |
| latent_indices=latent_indices, |
| clean_latents=clean_latents, |
| clean_latent_indices=clean_latent_indices, |
| clean_latents_2x=clean_latents_2x, |
| clean_latent_2x_indices=clean_latent_2x_indices, |
| clean_latents_4x=clean_latents_4x, |
| clean_latent_4x_indices=clean_latent_4x_indices, |
| **prompt_emb_pos, |
| **extra_input |
| ) |
| |
| noise_pred_uncond, moe_loss = pipe.dit( |
| new_latents, |
| timestep=timestep_tensor, |
| cam_emb=camera_embedding, |
| modality_inputs=modality_inputs, |
| latent_indices=latent_indices, |
| clean_latents=clean_latents, |
| clean_latent_indices=clean_latent_indices, |
| clean_latents_2x=clean_latents_2x, |
| clean_latent_2x_indices=clean_latent_2x_indices, |
| clean_latents_4x=clean_latents_4x, |
| clean_latent_4x_indices=clean_latent_4x_indices, |
| **prompt_emb_neg, |
| **extra_input |
| ) |
| |
| noise_pred = noise_pred_uncond + text_guidance_scale * (noise_pred_cond - noise_pred_uncond) |
| |
| else: |
| |
| noise_pred, moe_loss = pipe.dit( |
| new_latents, |
| timestep=timestep_tensor, |
| cam_emb=camera_embedding, |
| modality_inputs=modality_inputs, |
| latent_indices=latent_indices, |
| clean_latents=clean_latents, |
| clean_latent_indices=clean_latent_indices, |
| clean_latents_2x=clean_latents_2x, |
| clean_latent_2x_indices=clean_latent_2x_indices, |
| clean_latents_4x=clean_latents_4x, |
| clean_latent_4x_indices=clean_latent_4x_indices, |
| **prompt_emb_pos, |
| **extra_input |
| ) |
| |
| new_latents = pipe.scheduler.step(noise_pred, timestep, new_latents) |
| |
| |
| new_latents_squeezed = new_latents.squeeze(0) |
| history_latents = torch.cat([history_latents, new_latents_squeezed], dim=1) |
| |
| |
| if history_latents.shape[1] > max_history_frames: |
| first_frame = history_latents[:, 0:1, :, :] |
| recent_frames = history_latents[:, -(max_history_frames-1):, :, :] |
| history_latents = torch.cat([first_frame, recent_frames], dim=1) |
| print(f"⚠️ History window full, keeping first frame + latest {max_history_frames-1} frames") |
| |
| print(f"History_latents shape after update: {history_latents.shape}") |
| |
| all_generated_frames.append(new_latents_squeezed) |
| total_generated += current_generation |
| |
| print(f"✅ Generated {total_generated}/{total_frames_to_generate} frames") |
| |
| |
| print("\nDecoding generated video...") |
| |
| all_generated = torch.cat(all_generated_frames, dim=1) |
| final_video = torch.cat([initial_latents.to(all_generated.device), all_generated], dim=1).unsqueeze(0) |
| |
| print(f"Final video shape: {final_video.shape}") |
| |
| decoded_video = pipe.decode_video(final_video, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)) |
| |
| print(f"Saving video to {output_path} ...") |
| |
| video_np = decoded_video[0].to(torch.float32).permute(1, 2, 3, 0).cpu().numpy() |
| video_np = (video_np * 0.5 + 0.5).clip(0, 1) |
| video_np = (video_np * 255).astype(np.uint8) |
|
|
| icons = {} |
| video_camera_poses = None |
| if add_icons: |
| |
| icons_dir = os.path.join(ROOT_DIR, 'icons') |
| icon_names = ['move_forward.png', 'not_move_forward.png', |
| 'move_backward.png', 'not_move_backward.png', |
| 'move_left.png', 'not_move_left.png', |
| 'move_right.png', 'not_move_right.png', |
| 'turn_up.png', 'not_turn_up.png', |
| 'turn_down.png', 'not_turn_down.png', |
| 'turn_left.png', 'not_turn_left.png', |
| 'turn_right.png', 'not_turn_right.png'] |
| for name in icon_names: |
| path = os.path.join(icons_dir, name) |
| if os.path.exists(path): |
| try: |
| icon = Image.open(path).convert("RGBA") |
| |
| icon = icon.resize((50, 50), Image.Resampling.LANCZOS) |
| icons[name] = icon |
| except Exception as e: |
| print(f"Error loading icon {name}: {e}") |
| else: |
| print(f"⚠️ Warning: Icon {name} not found at {path}") |
|
|
| |
| time_compression_ratio = 4 |
| camera_poses = camera_embedding_full.detach().float().cpu().numpy() |
| video_camera_poses = [x for x in camera_poses for _ in range(time_compression_ratio)] |
|
|
| with imageio.get_writer(output_path, fps=20) as writer: |
| for i, frame in enumerate(video_np): |
| |
| img = Image.fromarray(frame) |
| |
| if add_icons and video_camera_poses is not None and icons: |
| |
| pose_idx = start_frame + i |
| if pose_idx < len(video_camera_poses): |
| pose_vec = video_camera_poses[pose_idx] |
| img = overlay_controls(img, pose_vec, icons) |
| |
| writer.append_data(np.array(img)) |
|
|
| print(f"✅ MoE FramePack sliding window generation completed! Saved to: {output_path}") |
| print(f" Total generated {total_generated} frames (compressed), corresponding to original {total_generated * 4} frames") |
| print(f" Using modality: {modality_type}") |
| |
|
|
| def main(): |
| parser = argparse.ArgumentParser(description="MoE FramePack sliding window video generation - supports multi-modal") |
| |
| |
| parser.add_argument("--condition_pth", |
| type=str, |
| default=None, |
| help="Path to pre-encoded condition pth file") |
| parser.add_argument("--condition_video", |
| type=str, |
| default=None, |
| help="Input video for novel view synthesis.") |
| parser.add_argument("--condition_image", |
| type=str, |
| default=None, |
| help="Input image for novel view synthesis.") |
| parser.add_argument("--start_frame", type=int, default=0) |
| parser.add_argument("--initial_condition_frames", type=int, default=1) |
| parser.add_argument("--frames_per_generation", type=int, default=8) |
| parser.add_argument("--total_frames_to_generate", type=int, default=24) |
| parser.add_argument("--max_history_frames", type=int, default=100) |
| parser.add_argument("--use_real_poses", default=False) |
| parser.add_argument("--dit_path", type=str, |
| default="../models/Astra/checkpoints/diffusion_pytorch_model.ckpt", |
| help="path to the pretrained DiT MoE model checkpoint") |
| parser.add_argument("--wan_model_path", |
| type=str, |
| default="../models/Wan-AI/Wan2.1-T2V-1.3B", |
| help="path to Wan2.1-T2V-1.3B") |
| parser.add_argument("--output_path", type=str, |
| default='../examples/output_videos/output_moe_framepack_sliding.mp4') |
| parser.add_argument("--prompt", |
| type=str, |
| default="", |
| help="text prompt for video generation") |
| parser.add_argument("--device", type=str, default="cuda") |
| parser.add_argument("--add_icons", action="store_true", default=False, |
| help="Overlay control icons on generated video") |
| |
| |
| parser.add_argument("--modality_type", type=str, choices=["sekai", "nuscenes", "openx"], |
| default="sekai", help="Modality type: sekai, nuscenes, or openx") |
| parser.add_argument("--scene_info_path", type=str, default=None, |
| help="NuScenes scene info file path (for nuscenes modality only)") |
| |
| |
| parser.add_argument("--use_camera_cfg", default=False, |
| help="Use Camera CFG") |
| parser.add_argument("--camera_guidance_scale", type=float, default=2.0, |
| help="Camera guidance scale for CFG") |
| parser.add_argument("--text_guidance_scale", type=float, default=1.0, |
| help="Text guidance scale for CFG") |
| |
| |
| parser.add_argument("--moe_num_experts", type=int, default=3, help="Number of experts") |
| parser.add_argument("--moe_top_k", type=int, default=1, help="Top-K experts") |
| parser.add_argument("--moe_hidden_dim", type=int, default=None, help="MoE hidden dimension") |
| parser.add_argument("--direction", type=str, default="left", help="Direction of video trajectory") |
| parser.add_argument("--use_gt_prompt", action="store_true", default=False, |
| help="Use ground truth prompt embedding from dataset") |
| |
| args = parser.parse_args() |
|
|
| print(f"MoE FramePack CFG generation settings:") |
| print(f"Modality type: {args.modality_type}") |
| print(f"Camera CFG: {args.use_camera_cfg}") |
| if args.use_camera_cfg: |
| print(f"Camera guidance scale: {args.camera_guidance_scale}") |
| print(f"Using GT Prompt: {args.use_gt_prompt}") |
| print(f"Text guidance scale: {args.text_guidance_scale}") |
| print(f"MoE config: experts={args.moe_num_experts}, top_k={args.moe_top_k}") |
| print(f"DiT{args.dit_path}") |
| |
| |
| if args.modality_type == "nuscenes" and not args.scene_info_path: |
| print("⚠️ Warning: Using NuScenes modality but scene_info_path not provided, will use synthetic pose data") |
| |
| if not args.use_gt_prompt and (args.prompt is None or args.prompt.strip() == ""): |
| print("⚠️ Warning: No prompt provided, will use empty string as prompt") |
| |
| if not any([args.condition_pth, args.condition_video, args.condition_image]): |
| raise ValueError("Need to provide condition_pth, condition_video, or condition_image as condition input") |
| |
| if args.condition_pth: |
| print(f"Using pre-encoded pth: {args.condition_pth}") |
| elif args.condition_video: |
| print(f"Using condition video for online encoding: {args.condition_video}") |
| elif args.condition_image: |
| print(f"Using condition image for online encoding: {args.condition_image} (repeat 10 frames)") |
| |
| inference_moe_framepack_sliding_window( |
| condition_pth_path=args.condition_pth, |
| condition_video=args.condition_video, |
| condition_image=args.condition_image, |
| dit_path=args.dit_path, |
| wan_model_path=args.wan_model_path, |
| output_path=args.output_path, |
| start_frame=args.start_frame, |
| initial_condition_frames=args.initial_condition_frames, |
| frames_per_generation=args.frames_per_generation, |
| total_frames_to_generate=args.total_frames_to_generate, |
| max_history_frames=args.max_history_frames, |
| device=args.device, |
| prompt=args.prompt, |
| modality_type=args.modality_type, |
| use_real_poses=args.use_real_poses, |
| scene_info_path=args.scene_info_path, |
| |
| use_camera_cfg=args.use_camera_cfg, |
| camera_guidance_scale=args.camera_guidance_scale, |
| text_guidance_scale=args.text_guidance_scale, |
| |
| moe_num_experts=args.moe_num_experts, |
| moe_top_k=args.moe_top_k, |
| moe_hidden_dim=args.moe_hidden_dim, |
| direction=args.direction, |
| use_gt_prompt=args.use_gt_prompt, |
| add_icons=args.add_icons |
| ) |
|
|
|
|
| if __name__ == "__main__": |
| main() |
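|
| # Example invocation (script name and paths are placeholders; adjust to your checkout): |
| #   python inference_moe_framepack_sliding.py \ |
| #       --condition_image ../examples/condition.png \ |
| #       --dit_path ../models/Astra/checkpoints/diffusion_pytorch_model.ckpt \ |
| #       --wan_model_path ../models/Wan-AI/Wan2.1-T2V-1.3B \ |
| #       --modality_type sekai --direction left --use_camera_cfg --add_icons |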