| import argparse |
| import json |
| import os |
| import sys |
| from types import SimpleNamespace |
|
|
|
|
def ensure_dummy_dataset(base_dir: str, num_entries: int = 20) -> str:
    """Create (idempotently) a tiny synthetic video dataset for dry runs.

    Writes a 90-frame 128x128 color-ramp MP4 (skipped if it already exists)
    and a JSON metadata file whose ``num_entries`` records all point at that
    one video, each carrying a dummy caption plus three fabricated face
    detections (frontal, yaw +60, yaw -60) with their frame indices.

    Args:
        base_dir: Directory for the video and JSON (created if missing).
        num_entries: Number of identical metadata records to emit.

    Returns:
        Path to the written dataset JSON file.

    Raises:
        RuntimeError: If ``numpy`` or ``imageio`` cannot be imported.
    """
    try:
        import numpy as np
        import imageio.v2 as imageio
    except Exception as exc:
        raise RuntimeError("imageio and numpy are required to build the dummy dataset.") from exc

    os.makedirs(base_dir, exist_ok=True)
    video_path = os.path.join(base_dir, "dummy.mp4")
    json_path = os.path.join(base_dir, "dummy_dataset.json")

    if not os.path.exists(video_path):
        writer = imageio.get_writer(video_path, fps=16)
        try:
            # Per-channel ramps with co-prime steps make every frame visually distinct.
            for i in range(90):
                frame = np.zeros((128, 128, 3), dtype=np.uint8)
                frame[:, :, 0] = (i * 3) % 255
                frame[:, :, 1] = (i * 7) % 255
                frame[:, :, 2] = (i * 13) % 255
                writer.append_data(frame)
        finally:
            # BUGFIX: close the writer even when frame encoding raises, so a
            # partially written / still-open file handle is not leaked.
            writer.close()

    # Three fake detections: frontal, hard-left, hard-right head poses.
    facedetect_v1 = [
        [{"angle": {"yaw": 0, "pitch": 0, "roll": 0}, "detect": {"top": 10, "height": 100, "width": 100, "left": 10, "prob": 1.0}}],
        [{"angle": {"yaw": 60, "pitch": 0, "roll": 0}, "detect": {"top": 10, "height": 100, "width": 100, "left": 10, "prob": 1.0}}],
        [{"angle": {"yaw": -60, "pitch": 0, "roll": 0}, "detect": {"top": 10, "height": 100, "width": 100, "left": 10, "prob": 1.0}}],
    ]
    facedetect_v1_frame_index = [0, 20, 40]

    context = {
        "disk_path": video_path,
        "text": "dummy caption",
        "facedetect_v1": facedetect_v1,
        "facedetect_v1_frame_index": facedetect_v1_frame_index,
    }
    # Every record shares the same payload; JSON serialization copies it anyway.
    meta = {f"video_{i}": context for i in range(num_entries)}

    with open(json_path, "w") as f:
        json.dump(meta, f)

    return json_path
|
|
|
|
def build_args() -> SimpleNamespace:
    """Return the minimal runtime flags the training forward pass expects.

    All boolean switches are off and the zero-face ratio is 0.0, i.e. the
    plainest possible configuration for a smoke test.
    """
    flags = {name: False for name in ("shot_rope", "split_rope", "split1", "split2", "split3")}
    return SimpleNamespace(zero_face_ratio=0.0, **flags)
|
|
|
|
def main() -> int:
    """Dry-run the full training pipeline: build a dataset, load the model,
    and execute a single forward pass, printing the resulting loss.

    Returns 0 on success (used as the process exit code).
    """
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")

    parser = argparse.ArgumentParser(description="Dry-run the full training pipeline (single forward).")
    parser.add_argument("--json_path", type=str, default="", help="Existing dataset JSON path.")
    parser.add_argument("--resolution", type=int, nargs=2, default=[64, 64], help="Resolution as H W.")
    parser.add_argument("--ref_num", type=int, default=3, help="Reference image count.")
    parser.add_argument("--base_dir", type=str, default="/data/rczhang/PencilFolder/multi-shot/tmp_dryrun")
    parser.add_argument("--model_root", type=str, default="/data/rczhang/PencilFolder/DiffSynth-Studio/models")
    parser.add_argument("--model_id", type=str, default="Wan-AI/Wan2.2-TI2V-5B")
    args = parser.parse_args()

    # Fall back to a synthetic dataset when no JSON was supplied.
    json_path = args.json_path if args.json_path else ensure_dummy_dataset(args.base_dir)

    # Project modules live outside the installed path; make them importable first.
    sys.path.insert(0, "/data/rczhang/PencilFolder/multi-shot")
    from multi_view.datasets.videodataset import MulltiShot_MultiView_Dataset
    from multi_view.train import WanTrainingModule

    dataset = MulltiShot_MultiView_Dataset(
        dataset_base_path=json_path,
        resolution=tuple(args.resolution),
        ref_num=args.ref_num,
        training=True,
    )
    # A batch of one sample is enough for a smoke test.
    batch = [dataset[0]]

    origin_paths = (
        "diffusion_pytorch_model*.safetensors",
        "models_t5_umt5-xxl-enc-bf16.pth",
        "Wan2.2_VAE.pth",
    )
    model_id_with_origin_paths = ",".join(f"{args.model_id}:{p}" for p in origin_paths)

    model = WanTrainingModule(
        model_id_with_origin_paths=model_id_with_origin_paths,
        trainable_models="dit",
        local_model_path=args.model_root,
    )

    import torch
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.pipe.device = device
    model.pipe.torch_dtype = torch.bfloat16
    print("Dry-run device:", device)

    runtime_args = build_args()
    inputs = model.forward_preprocess(batch)
    loss = model.forward(batch, runtime_args, inputs=inputs)
    print("Dry-run loss:", float(loss.detach().cpu().item()))
    return 0
|
|
|
|
| if __name__ == "__main__": |
| raise SystemExit(main()) |
|
|