| from argparse import ArgumentParser |
| import time |
| import os |
| import torch |
| import torchvision.transforms as transforms |
| from contextlib import nullcontext |
| import json |
| from models import get_model |
|
|
|
|
# CLI definition. NOTE: the original description said "Train an EBC model."
# but this script only benchmarks inference — fixed to match actual behavior.
parser = ArgumentParser(description="Benchmark an EBC model.")
parser.add_argument("--model_info_path", type=str, required=True, help="Path to the model information file.")

# Input geometry.
parser.add_argument("--batch_size", type=int, default=1, help="Batch size for the model.")
parser.add_argument("--height", type=int, default=768, help="Height of the input image.")
parser.add_argument("--width", type=int, default=1024, help="Width of the input image.")

# Measurement schedule: warmup iterations are run but not timed.
parser.add_argument("--num_iterations", type=int, default=200, help="Number of iterations to run the model.")
parser.add_argument("--num_warmup", type=int, default=20, help="Dispose of the first N iterations.")

# Execution options.
parser.add_argument("--device", type=str, choices=["cpu", "cuda", "mps"], help="Device to run the model on. Options are 'cpu', 'cuda', or 'mps'.")
parser.add_argument("--amp", action="store_true", help="Enable autocast mixed precision (fp16/bf16).")
parser.add_argument("--half", action="store_true", help="Use half precision for the model.")
parser.add_argument("--channels_last", action="store_true", help="Use NHWC memory format (recommended for CUDA).")
parser.add_argument("--compile", action="store_true", help="Enable torch.compile if available.")
parser.add_argument("--threads", type=int, default=None, help="torch.set_num_threads(threads) for CPU")
parser.add_argument("--sleep_time", type=float, default=0.0, help="Seconds to sleep after *each* iteration (cool-down).")

# ImageNet mean/std normalization applied to the synthetic input batch.
_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
|
|
|
|
def _dummy_input(bs, h, w, device, half, channels_last):
    """Build a random, ImageNet-normalized image batch of shape (bs, 3, h, w)."""
    batch = _normalize(torch.rand(bs, 3, h, w, device=device))
    if half:
        batch = batch.half()
    if channels_last:
        batch = batch.to(memory_format=torch.channels_last)
    return batch
|
|
|
|
| def _maybe_sync(dev): |
| if dev.type == "cuda": |
| torch.cuda.synchronize() |
|
|
|
|
@torch.inference_mode()
def benchmark(
    model: torch.nn.Module,
    inp: torch.Tensor,
    warmup: int,
    steps: int,
    amp: bool,
    sleep_time: float = 0.0
):
    """Time *steps* forward passes of *model* on *inp*.

    Args:
        model: module to benchmark (already on inp's device, in eval mode).
        inp: input batch fed to the model on every iteration.
        warmup: untimed iterations discarded before measurement.
        steps: timed iterations.
        amp: wrap each forward pass in torch.autocast for inp's device type.
        sleep_time: cool-down sleep after each timed pass; excluded from
            the measurement.

    Returns:
        (fps, mean latency in seconds).
    """
    cm = torch.autocast(device_type=inp.device.type) if amp else nullcontext()

    # Warm-up: triggers lazy initialization (cudnn autotune, torch.compile
    # graph capture, allocator growth) so it does not pollute the timing.
    for _ in range(warmup):
        with cm:
            _ = model(inp)
    _maybe_sync(inp.device)

    total_time = 0.0
    for _ in range(steps):
        tic = time.perf_counter()
        with cm:
            _ = model(inp)
        # BUGFIX: synchronize BEFORE reading the clock. CUDA/MPS launches
        # are asynchronous; the original synced after `toc`, so it measured
        # only kernel-launch overhead instead of actual execution time.
        _maybe_sync(inp.device)
        toc = time.perf_counter()
        total_time += toc - tic

        # Optional cool-down (e.g. to avoid thermal throttling); not timed.
        if sleep_time > 0:
            time.sleep(sleep_time)

    fps = steps / total_time
    return fps, total_time / steps
|
|
|
|
def main(args):
    """Load the model described by ``args.model_info_path``, run the
    benchmark, and print the configuration plus latency/FPS results."""
    if not os.path.isfile(args.model_info_path):
        # Raise instead of assert: asserts are stripped under `python -O`.
        raise FileNotFoundError(f"{args.model_info_path} not found")

    # Thread limits must be configured before any parallel torch work runs;
    # set_num_interop_threads raises if inter-op work has already started.
    if args.threads:
        torch.set_num_threads(args.threads)
        torch.set_num_interop_threads(1)

    model = get_model(model_info_path=args.model_info_path)
    model.eval()

    if args.channels_last:
        model = model.to(memory_format=torch.channels_last)
    if args.half:
        model = model.half()

    # --device is optional on the CLI; fall back to the best available
    # device instead of crashing on torch.device(None).
    if args.device is None:
        args.device = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(args.device)
    model = model.to(device)

    if args.compile and hasattr(torch, "compile"):
        model = torch.compile(model, mode="reduce-overhead")

    inp = _dummy_input(
        args.batch_size,
        args.height,
        args.width,
        device,
        args.half,
        args.channels_last
    )

    fps, t_avg = benchmark(
        model,
        inp,
        warmup=args.num_warmup,
        steps=args.num_iterations,
        amp=args.amp,
        sleep_time=args.sleep_time
    )

    # Echo the effective configuration (minus the path) with the results.
    cfg = vars(args)
    cfg.pop("model_info_path")
    print(json.dumps(cfg, indent=2))
    print(f"\nAverage latency: {t_avg*1000:6.2f} ms | FPS: {fps:,.2f}")
|
|
|
|
# Script entry point: parse the CLI flags defined above and run the benchmark.
if __name__ == "__main__":
    main(parser.parse_args())
|
|
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |