| import argparse |
| import json |
| import os |
| import sys |
| from types import SimpleNamespace |
|
|
|
|
def ensure_dummy_dataset(base_dir: str, num_entries: int = 20) -> str:
    """Create (idempotently) a tiny synthetic video dataset for dry runs.

    Writes a 90-frame 128x128 color-ramp MP4 (skipped if it already exists)
    and a JSON metadata file whose ``num_entries`` records all point at that
    one video, each carrying a dummy caption plus three fabricated face
    detections (frontal, yaw +60, yaw -60) with their frame indices.

    Args:
        base_dir: Directory for the video and JSON (created if missing).
        num_entries: Number of identical metadata records to emit.

    Returns:
        Path to the written dataset JSON file.

    Raises:
        RuntimeError: If ``numpy`` or ``imageio`` cannot be imported.
    """
    try:
        import numpy as np
        import imageio.v2 as imageio
    except Exception as exc:
        raise RuntimeError("imageio and numpy are required to build the dummy dataset.") from exc

    os.makedirs(base_dir, exist_ok=True)
    video_path = os.path.join(base_dir, "dummy.mp4")
    json_path = os.path.join(base_dir, "dummy_dataset.json")

    if not os.path.exists(video_path):
        writer = imageio.get_writer(video_path, fps=16)
        try:
            # Per-channel ramps with co-prime steps make every frame visually distinct.
            for i in range(90):
                frame = np.zeros((128, 128, 3), dtype=np.uint8)
                frame[:, :, 0] = (i * 3) % 255
                frame[:, :, 1] = (i * 7) % 255
                frame[:, :, 2] = (i * 13) % 255
                writer.append_data(frame)
        finally:
            # BUGFIX: close the writer even when frame encoding raises, so a
            # partially written / still-open file handle is not leaked.
            writer.close()

    # Three fake detections: frontal, hard-left, hard-right head poses.
    facedetect_v1 = [
        [{"angle": {"yaw": 0, "pitch": 0, "roll": 0}, "detect": {"top": 10, "height": 100, "width": 100, "left": 10, "prob": 1.0}}],
        [{"angle": {"yaw": 60, "pitch": 0, "roll": 0}, "detect": {"top": 10, "height": 100, "width": 100, "left": 10, "prob": 1.0}}],
        [{"angle": {"yaw": -60, "pitch": 0, "roll": 0}, "detect": {"top": 10, "height": 100, "width": 100, "left": 10, "prob": 1.0}}],
    ]
    facedetect_v1_frame_index = [0, 20, 40]

    context = {
        "disk_path": video_path,
        "text": "dummy caption",
        "facedetect_v1": facedetect_v1,
        "facedetect_v1_frame_index": facedetect_v1_frame_index,
    }
    # Every record shares the same payload; JSON serialization copies it anyway.
    meta = {f"video_{i}": context for i in range(num_entries)}

    with open(json_path, "w") as f:
        json.dump(meta, f)

    return json_path
|
|
|
|
def build_args() -> SimpleNamespace:
    """Return the minimal runtime flags the training forward pass expects.

    All boolean switches are off and the zero-face ratio is 0.0, i.e. the
    plainest possible configuration for a smoke test.
    """
    flags = {name: False for name in ("shot_rope", "split_rope", "split1", "split2", "split3")}
    return SimpleNamespace(zero_face_ratio=0.0, **flags)
|
|
|
|
def main() -> int:
    """Dry-run the full training pipeline: build a dataset, load the model,
    and execute a single forward pass, printing the resulting loss.

    Returns 0 on success (used as the process exit code).
    """
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")

    parser = argparse.ArgumentParser(description="Dry-run the full training pipeline (single forward).")
    parser.add_argument("--json_path", type=str, default="", help="Existing dataset JSON path.")
    parser.add_argument("--resolution", type=int, nargs=2, default=[64, 64], help="Resolution as H W.")
    parser.add_argument("--ref_num", type=int, default=3, help="Reference image count.")
    parser.add_argument("--base_dir", type=str, default="/data/rczhang/PencilFolder/multi-shot/tmp_dryrun")
    parser.add_argument("--model_root", type=str, default="/data/rczhang/PencilFolder/DiffSynth-Studio/models")
    parser.add_argument("--model_id", type=str, default="Wan-AI/Wan2.2-TI2V-5B")
    args = parser.parse_args()

    # Fall back to a synthetic dataset when no JSON was supplied.
    json_path = args.json_path if args.json_path else ensure_dummy_dataset(args.base_dir)

    # Project modules live outside the installed path; make them importable first.
    sys.path.insert(0, "/data/rczhang/PencilFolder/multi-shot")
    from multi_view.datasets.videodataset import MulltiShot_MultiView_Dataset
    from multi_view.train import WanTrainingModule

    dataset = MulltiShot_MultiView_Dataset(
        dataset_base_path=json_path,
        resolution=tuple(args.resolution),
        ref_num=args.ref_num,
        training=True,
    )
    # A batch of one sample is enough for a smoke test.
    batch = [dataset[0]]

    origin_paths = (
        "diffusion_pytorch_model*.safetensors",
        "models_t5_umt5-xxl-enc-bf16.pth",
        "Wan2.2_VAE.pth",
    )
    model_id_with_origin_paths = ",".join(f"{args.model_id}:{p}" for p in origin_paths)

    model = WanTrainingModule(
        model_id_with_origin_paths=model_id_with_origin_paths,
        trainable_models="dit",
        local_model_path=args.model_root,
    )

    import torch
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.pipe.device = device
    model.pipe.torch_dtype = torch.bfloat16
    print("Dry-run device:", device)

    runtime_args = build_args()
    inputs = model.forward_preprocess(batch)
    loss = model.forward(batch, runtime_args, inputs=inputs)
    print("Dry-run loss:", float(loss.detach().cpu().item()))
    return 0
|
|
|
|
| if __name__ == "__main__": |
| raise SystemExit(main()) |
|
|