""" Speed Benchmark — Measure inference speed across hardware/configs. Reports: - Latency (ms per frame) at p50/p95/p99 - Throughput (FPS) - GPU memory usage - Comparison across input resolutions and model variants """ import time import numpy as np from typing import Dict, Optional, List from dataclasses import dataclass import torch @dataclass class BenchmarkResult: """Single benchmark measurement.""" model_name: str input_size: int device: str batch_size: int latency_p50_ms: float latency_p95_ms: float latency_p99_ms: float fps: float gpu_mem_mb: float num_params_m: float gflops: float class SpeedBenchmark: """ Inference speed benchmark for face detection models. Usage: bench = SpeedBenchmark(device='cuda') result = bench.benchmark_model(model, 'scrfd_34g', input_size=640) bench.print_results() """ def __init__(self, device: str = 'cuda', warmup_iters: int = 50, benchmark_iters: int = 200): self.device = device self.warmup_iters = warmup_iters self.benchmark_iters = benchmark_iters self.results: List[BenchmarkResult] = [] @torch.no_grad() def benchmark_model(self, model: torch.nn.Module, model_name: str, input_size: int = 640, batch_size: int = 1) -> BenchmarkResult: """ Benchmark a model's inference speed. Args: model: PyTorch model in eval mode model_name: Name for reporting input_size: Input image resolution batch_size: Batch size for benchmarking Returns: BenchmarkResult with timing statistics """ model = model.to(self.device).eval() dummy_input = torch.randn(batch_size, 3, input_size, input_size, device=self.device) # Count parameters num_params = sum(p.numel() for p in model.parameters()) / 1e6 # Estimate GFLOPs (using torch profiler if available) gflops = self._estimate_flops(model, dummy_input) # GPU memory before if self.device == 'cuda': torch.cuda.reset_peak_memory_stats() torch.cuda.synchronize() # Warmup for _ in range(self.warmup_iters): _ = model(dummy_input) if self.device == 'cuda': torch.cuda.synchronize() # Benchmark latencies = [] for _ in range(self.benchmark_iters): if self.device == 'cuda': torch.cuda.synchronize() t0 = time.perf_counter() _ = model(dummy_input) if self.device == 'cuda': torch.cuda.synchronize() latencies.append((time.perf_counter() - t0) * 1000) # ms latencies = np.array(latencies) gpu_mem = 0 if self.device == 'cuda': gpu_mem = torch.cuda.max_memory_allocated() / 1e6 result = BenchmarkResult( model_name=model_name, input_size=input_size, device=self.device, batch_size=batch_size, latency_p50_ms=np.percentile(latencies, 50), latency_p95_ms=np.percentile(latencies, 95), latency_p99_ms=np.percentile(latencies, 99), fps=1000 / np.mean(latencies) * batch_size, gpu_mem_mb=gpu_mem, num_params_m=num_params, gflops=gflops, ) self.results.append(result) return result def _estimate_flops(self, model: torch.nn.Module, dummy_input: torch.Tensor) -> float: """Estimate GFLOPs (approximate).""" try: from torch.utils.flop_counter import FlopCounterMode flop_counter = FlopCounterMode(display=False) with flop_counter: model(dummy_input) return flop_counter.get_total_flops() / 1e9 except (ImportError, Exception): return 0.0 def print_results(self): """Print formatted benchmark results table.""" if not self.results: print("No benchmark results yet.") return header = (f"{'Model':<15} {'Size':<6} {'Device':<8} {'BS':<4} " f"{'P50(ms)':<9} {'P95(ms)':<9} {'P99(ms)':<9} " f"{'FPS':<8} {'Mem(MB)':<10} {'Params(M)':<10} {'GFLOPs':<8}") print("=" * len(header)) print("Speed Benchmark Results") print("=" * len(header)) print(header) print("-" * len(header)) for r in 
            print(f"{r.model_name:<15} {r.input_size:<6} {r.device:<8} {r.batch_size:<4} "
                  f"{r.latency_p50_ms:<9.2f} {r.latency_p95_ms:<9.2f} {r.latency_p99_ms:<9.2f} "
                  f"{r.fps:<8.1f} {r.gpu_mem_mb:<10.1f} {r.num_params_m:<10.2f} {r.gflops:<8.2f}")
        print("=" * len(header))

    def to_markdown(self) -> str:
        """Generate Markdown benchmark table."""
        lines = [
            "| Model | Input | Device | BS | P50 (ms) | P95 (ms) | FPS | GPU Mem (MB) | Params (M) | GFLOPs |",
            "|-------|-------|--------|----|---------:|---------:|----:|-------------:|-----------:|-------:|",
        ]
        for r in self.results:
            lines.append(
                f"| {r.model_name} | {r.input_size} | {r.device} | {r.batch_size} | "
                f"{r.latency_p50_ms:.2f} | {r.latency_p95_ms:.2f} | {r.fps:.1f} | "
                f"{r.gpu_mem_mb:.1f} | {r.num_params_m:.2f} | {r.gflops:.2f} |"
            )
        return '\n'.join(lines)
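

# ---------------------------------------------------------------------------
# Minimal usage sketch (assumptions flagged): the tiny Sequential CNN below is
# a stand-in so this runs anywhere without weights; swap in real detector
# variants (e.g. the SCRFD models named in the class docstring) for meaningful
# numbers. Model names, widths, and iteration counts here are illustrative.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    def make_dummy_model(width: int) -> torch.nn.Module:
        # NOT a real face detector; just enough layers to exercise the timer.
        return torch.nn.Sequential(
            torch.nn.Conv2d(3, width, kernel_size=3, stride=2, padding=1),
            torch.nn.ReLU(inplace=True),
            torch.nn.Conv2d(width, width, kernel_size=3, stride=2, padding=1),
            torch.nn.ReLU(inplace=True),
            torch.nn.AdaptiveAvgPool2d(1),
        )

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Fewer iterations than the defaults, to keep the demo quick
    bench = SpeedBenchmark(device=device, warmup_iters=10, benchmark_iters=50)

    # Sweep model variants and input resolutions, mirroring the intended use
    for name, width in [('dummy_small', 32), ('dummy_large', 128)]:
        for size in (320, 640):
            bench.benchmark_model(make_dummy_model(width), name, input_size=size)

    bench.print_results()
    print(bench.to_markdown())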