| """ |
| Speed Benchmark — Measure inference speed across hardware/configs. |
| |
| Reports: |
| - Latency (ms per frame) at p50/p95/p99 |
| - Throughput (FPS) |
| - GPU memory usage |
| - Comparison across input resolutions and model variants |
| """ |
|
|
import time
from dataclasses import dataclass
from typing import Dict, Optional, List

import numpy as np
import torch


@dataclass
class BenchmarkResult:
    """Single benchmark measurement."""
    model_name: str
    input_size: int
    device: str
    batch_size: int
    latency_p50_ms: float
    latency_p95_ms: float
    latency_p99_ms: float
    fps: float
    gpu_mem_mb: float
    num_params_m: float
    gflops: float


class SpeedBenchmark:
    """
    Inference speed benchmark for face detection models.

    Usage:
        bench = SpeedBenchmark(device='cuda')
        result = bench.benchmark_model(model, 'scrfd_34g', input_size=640)
        bench.print_results()
    """

    def __init__(self, device: str = 'cuda', warmup_iters: int = 50,
                 benchmark_iters: int = 200):
        self.device = device
        self.warmup_iters = warmup_iters
        self.benchmark_iters = benchmark_iters
        self.results: List[BenchmarkResult] = []

    @torch.no_grad()
    def benchmark_model(self, model: torch.nn.Module, model_name: str,
                        input_size: int = 640, batch_size: int = 1) -> BenchmarkResult:
        """
        Benchmark a model's inference speed.

        Args:
            model: PyTorch model (moved to the target device and set to eval mode here)
            model_name: Name used for reporting
            input_size: Square input resolution in pixels
            batch_size: Batch size for benchmarking

        Returns:
            BenchmarkResult with timing statistics
        """
        model = model.to(self.device).eval()
        dummy_input = torch.randn(batch_size, 3, input_size, input_size,
                                  device=self.device)

        # Parameter count in millions.
        num_params = sum(p.numel() for p in model.parameters()) / 1e6

        # Approximate compute cost in GFLOPs.
        gflops = self._estimate_flops(model, dummy_input)

        # Reset peak-memory tracking so the reported figure reflects this run only.
        if self.device == 'cuda':
            torch.cuda.reset_peak_memory_stats()
            torch.cuda.synchronize()

        # Warmup: lets cuDNN autotuning and allocator caches settle before timing.
        for _ in range(self.warmup_iters):
            _ = model(dummy_input)
        if self.device == 'cuda':
            torch.cuda.synchronize()

        # Timed runs: synchronize around each forward pass so wall-clock time
        # covers GPU execution rather than just the asynchronous kernel launch.
        latencies = []
        for _ in range(self.benchmark_iters):
            if self.device == 'cuda':
                torch.cuda.synchronize()
            t0 = time.perf_counter()
            _ = model(dummy_input)
            if self.device == 'cuda':
                torch.cuda.synchronize()
            latencies.append((time.perf_counter() - t0) * 1000)

        latencies = np.array(latencies)
        gpu_mem = 0.0
        if self.device == 'cuda':
            gpu_mem = torch.cuda.max_memory_allocated() / 1e6

        result = BenchmarkResult(
            model_name=model_name,
            input_size=input_size,
            device=self.device,
            batch_size=batch_size,
            latency_p50_ms=np.percentile(latencies, 50),
            latency_p95_ms=np.percentile(latencies, 95),
            latency_p99_ms=np.percentile(latencies, 99),
            fps=1000 / np.mean(latencies) * batch_size,
            gpu_mem_mb=gpu_mem,
            num_params_m=num_params,
            gflops=gflops,
        )

        self.results.append(result)
        return result

    def _estimate_flops(self, model: torch.nn.Module,
                        dummy_input: torch.Tensor) -> float:
        """Estimate GFLOPs (approximate); returns 0.0 if FLOP counting is unavailable."""
        try:
            from torch.utils.flop_counter import FlopCounterMode
            flop_counter = FlopCounterMode(display=False)
            with flop_counter:
                model(dummy_input)
            return flop_counter.get_total_flops() / 1e9
        except Exception:
            # Fall back gracefully on older PyTorch versions or unsupported ops.
            return 0.0

    def print_results(self):
        """Print formatted benchmark results table."""
        if not self.results:
            print("No benchmark results yet.")
            return

        header = (f"{'Model':<15} {'Size':<6} {'Device':<8} {'BS':<4} "
                  f"{'P50(ms)':<9} {'P95(ms)':<9} {'P99(ms)':<9} "
                  f"{'FPS':<8} {'Mem(MB)':<10} {'Params(M)':<10} {'GFLOPs':<8}")
        print("=" * len(header))
        print("Speed Benchmark Results")
        print("=" * len(header))
        print(header)
        print("-" * len(header))

        for r in self.results:
            print(f"{r.model_name:<15} {r.input_size:<6} {r.device:<8} {r.batch_size:<4} "
                  f"{r.latency_p50_ms:<9.2f} {r.latency_p95_ms:<9.2f} {r.latency_p99_ms:<9.2f} "
                  f"{r.fps:<8.1f} {r.gpu_mem_mb:<10.1f} {r.num_params_m:<10.2f} {r.gflops:<8.2f}")

        print("=" * len(header))

    def to_markdown(self) -> str:
        """Generate Markdown benchmark table."""
        lines = [
            "| Model | Input | Device | BS | P50 (ms) | P95 (ms) | FPS | GPU Mem (MB) | Params (M) | GFLOPs |",
            "|-------|-------|--------|----|---------:|---------:|----:|-------------:|-----------:|-------:|",
        ]
        for r in self.results:
            lines.append(
                f"| {r.model_name} | {r.input_size} | {r.device} | {r.batch_size} | "
                f"{r.latency_p50_ms:.2f} | {r.latency_p95_ms:.2f} | {r.fps:.1f} | "
                f"{r.gpu_mem_mb:.1f} | {r.num_params_m:.2f} | {r.gflops:.2f} |"
            )
        return '\n'.join(lines)
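

# Example usage: a minimal sketch for exercising the harness end to end.
# The tiny Sequential model below is a stand-in, not one of the project's face
# detectors; substitute a real detector (e.g. the 'scrfd_34g' model named in
# the class docstring) to get meaningful numbers.
if __name__ == '__main__':
    import torch.nn as nn

    # Placeholder backbone so the benchmark can run without any model weights.
    toy_model = nn.Sequential(
        nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1),
    )

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    bench = SpeedBenchmark(device=device, warmup_iters=10, benchmark_iters=50)
    bench.benchmark_model(toy_model, 'toy_conv', input_size=320, batch_size=1)
    bench.print_results()
    print(bench.to_markdown())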