"""Dry-run the full training pipeline with a tiny synthetic dataset.

Builds (or reuses) a dummy video plus a metadata JSON, constructs the
multi-shot dataset and the Wan training module, then performs a single
forward pass and prints the resulting loss.
"""

import argparse
import json
import os
import sys
from types import SimpleNamespace


def ensure_dummy_dataset(base_dir: str, num_entries: int = 20) -> str:
    """Create a tiny synthetic video and dataset JSON under *base_dir*.

    The video (90 frames of cycling solid colors, 16 fps) is rendered only
    when it does not already exist; the JSON metadata is (re)written on every
    call. Returns the path to the dataset JSON.

    Raises:
        RuntimeError: if numpy or imageio cannot be imported.
    """
    try:
        import numpy as np
        import imageio.v2 as imageio
    except Exception as exc:
        raise RuntimeError("imageio and numpy are required to build the dummy dataset.") from exc

    os.makedirs(base_dir, exist_ok=True)
    video_path = os.path.join(base_dir, "dummy.mp4")
    json_path = os.path.join(base_dir, "dummy_dataset.json")

    if not os.path.exists(video_path):
        writer = imageio.get_writer(video_path, fps=16)
        try:
            # Close the writer even if a frame append fails mid-loop
            # (the original leaked the handle on exception).
            for i in range(90):
                frame = np.zeros((128, 128, 3), dtype=np.uint8)
                frame[:, :, 0] = (i * 3) % 255
                frame[:, :, 1] = (i * 7) % 255
                frame[:, :, 2] = (i * 13) % 255
                writer.append_data(frame)
        finally:
            writer.close()

    # Three synthetic single-face detections at yaw 0 / +60 / -60 degrees,
    # one per sampled frame index below.
    facedetect_v1 = [
        [{"angle": {"yaw": 0, "pitch": 0, "roll": 0},
          "detect": {"top": 10, "height": 100, "width": 100, "left": 10, "prob": 1.0}}],
        [{"angle": {"yaw": 60, "pitch": 0, "roll": 0},
          "detect": {"top": 10, "height": 100, "width": 100, "left": 10, "prob": 1.0}}],
        [{"angle": {"yaw": -60, "pitch": 0, "roll": 0},
          "detect": {"top": 10, "height": 100, "width": 100, "left": 10, "prob": 1.0}}],
    ]
    facedetect_v1_frame_index = [0, 20, 40]

    context = {
        "disk_path": video_path,
        "text": "dummy caption",
        "facedetect_v1": facedetect_v1,
        "facedetect_v1_frame_index": facedetect_v1_frame_index,
    }
    # All entries intentionally share the same context; json.dump serializes
    # each key independently, so the aliasing is harmless.
    meta = {f"video_{i}": context for i in range(num_entries)}

    # Explicit encoding: the original relied on the platform default.
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(meta, f)
    return json_path


def build_args() -> SimpleNamespace:
    """Return the minimal runtime-flag namespace expected by the forward pass.

    All flags are disabled so the dry run exercises only the default path.
    """
    return SimpleNamespace(
        zero_face_ratio=0.0,
        shot_rope=False,
        split_rope=False,
        split1=False,
        split2=False,
        split3=False,
    )


def main() -> int:
    """Run one forward pass of the training pipeline and print the loss.

    Returns 0 on success (used as the process exit code).
    """
    # Pin to a single GPU unless the caller already chose one.
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")

    parser = argparse.ArgumentParser(description="Dry-run the full training pipeline (single forward).")
    parser.add_argument("--json_path", type=str, default="", help="Existing dataset JSON path.")
    parser.add_argument("--resolution", type=int, nargs=2, default=[64, 64], help="Resolution as H W.")
    parser.add_argument("--ref_num", type=int, default=3, help="Reference image count.")
    parser.add_argument("--base_dir", type=str, default="/data/rczhang/PencilFolder/multi-shot/tmp_dryrun")
    parser.add_argument("--model_root", type=str, default="/data/rczhang/PencilFolder/DiffSynth-Studio/models")
    parser.add_argument("--model_id", type=str, default="Wan-AI/Wan2.2-TI2V-5B")
    args = parser.parse_args()

    # Fall back to the generated dummy dataset when no JSON is supplied.
    if args.json_path:
        json_path = args.json_path
    else:
        json_path = ensure_dummy_dataset(args.base_dir)

    # Project imports live behind a hard-coded repo path; deferred so argparse
    # --help works without the repo present.
    sys.path.insert(0, "/data/rczhang/PencilFolder/multi-shot")
    from multi_view.datasets.videodataset import MulltiShot_MultiView_Dataset
    from multi_view.train import WanTrainingModule

    dataset = MulltiShot_MultiView_Dataset(
        dataset_base_path=json_path,
        resolution=tuple(args.resolution),
        ref_num=args.ref_num,
        training=True,
    )
    # Single-sample batch: one forward pass is all the dry run needs.
    batch = [dataset[0]]

    # "model_id:glob" triples telling the loader which weight files to pull.
    model_id_with_origin_paths = ",".join([
        f"{args.model_id}:diffusion_pytorch_model*.safetensors",
        f"{args.model_id}:models_t5_umt5-xxl-enc-bf16.pth",
        f"{args.model_id}:Wan2.2_VAE.pth",
    ])
    model = WanTrainingModule(
        model_id_with_origin_paths=model_id_with_origin_paths,
        trainable_models="dit",
        local_model_path=args.model_root,
    )

    import torch
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    # NOTE(review): pipe.device/torch_dtype are set directly rather than via a
    # setter — presumably what WanTrainingModule expects; confirm upstream.
    model.pipe.device = device
    model.pipe.torch_dtype = torch.bfloat16
    print("Dry-run device:", device)

    runtime_args = build_args()
    inputs = model.forward_preprocess(batch)
    loss = model.forward(batch, runtime_args, inputs=inputs)
    print("Dry-run loss:", float(loss.detach().cpu().item()))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())