| """ |
| Speed Benchmark — Measure inference speed across hardware/configs. |
| |
| Reports: |
| - Latency (ms per frame) at p50/p95/p99 |
| - Throughput (FPS) |
| - GPU memory usage |
| - Comparison across input resolutions and model variants |
| """ |
|
|
import time
from dataclasses import dataclass
from typing import Dict, Optional, List

import numpy as np
import torch


@dataclass
class BenchmarkResult:
    """Single benchmark measurement."""
    model_name: str
    input_size: int
    device: str
    batch_size: int
    latency_p50_ms: float
    latency_p95_ms: float
    latency_p99_ms: float
    fps: float
    gpu_mem_mb: float
    num_params_m: float
    gflops: float


class SpeedBenchmark:
    """
    Inference speed benchmark for face detection models.

    Usage:
        bench = SpeedBenchmark(device='cuda')
        result = bench.benchmark_model(model, 'scrfd_34g', input_size=640)
        bench.print_results()
    """

    def __init__(self, device: str = 'cuda', warmup_iters: int = 50,
                 benchmark_iters: int = 200):
        self.device = device
        self.warmup_iters = warmup_iters
        self.benchmark_iters = benchmark_iters
        self.results: List[BenchmarkResult] = []

    @torch.no_grad()
    def benchmark_model(self, model: torch.nn.Module, model_name: str,
                        input_size: int = 640, batch_size: int = 1) -> BenchmarkResult:
        """
        Benchmark a model's inference speed.

        Args:
            model: PyTorch model (moved to the target device and set to eval mode here)
            model_name: Name used for reporting
            input_size: Square input resolution in pixels
            batch_size: Batch size for benchmarking

        Returns:
            BenchmarkResult with timing statistics
        """
        model = model.to(self.device).eval()
        dummy_input = torch.randn(batch_size, 3, input_size, input_size,
                                  device=self.device)

        # Parameter count in millions.
        num_params = sum(p.numel() for p in model.parameters()) / 1e6

        # Approximate compute cost in GFLOPs.
        gflops = self._estimate_flops(model, dummy_input)

        # Reset peak-memory tracking so the reported figure reflects this run only.
        if self.device == 'cuda':
            torch.cuda.reset_peak_memory_stats()
            torch.cuda.synchronize()

        # Warmup: lets cuDNN autotuning and allocator caches settle before timing.
        for _ in range(self.warmup_iters):
            _ = model(dummy_input)
        if self.device == 'cuda':
            torch.cuda.synchronize()

        # Timed runs: synchronize around each forward pass so wall-clock time
        # covers GPU execution rather than just the asynchronous kernel launch.
        latencies = []
        for _ in range(self.benchmark_iters):
            if self.device == 'cuda':
                torch.cuda.synchronize()
            t0 = time.perf_counter()
            _ = model(dummy_input)
            if self.device == 'cuda':
                torch.cuda.synchronize()
            latencies.append((time.perf_counter() - t0) * 1000)

        latencies = np.array(latencies)
        gpu_mem = 0.0
        if self.device == 'cuda':
            gpu_mem = torch.cuda.max_memory_allocated() / 1e6

        result = BenchmarkResult(
            model_name=model_name,
            input_size=input_size,
            device=self.device,
            batch_size=batch_size,
            latency_p50_ms=np.percentile(latencies, 50),
            latency_p95_ms=np.percentile(latencies, 95),
            latency_p99_ms=np.percentile(latencies, 99),
            fps=1000 / np.mean(latencies) * batch_size,
            gpu_mem_mb=gpu_mem,
            num_params_m=num_params,
            gflops=gflops,
        )

        self.results.append(result)
        return result

    def _estimate_flops(self, model: torch.nn.Module,
                        dummy_input: torch.Tensor) -> float:
        """Estimate GFLOPs (approximate); returns 0.0 if FLOP counting is unavailable."""
        try:
            from torch.utils.flop_counter import FlopCounterMode
            flop_counter = FlopCounterMode(display=False)
            with flop_counter:
                model(dummy_input)
            return flop_counter.get_total_flops() / 1e9
        except Exception:
            # Fall back gracefully on older PyTorch versions or unsupported ops.
            return 0.0

    def print_results(self):
        """Print formatted benchmark results table."""
        if not self.results:
            print("No benchmark results yet.")
            return

        header = (f"{'Model':<15} {'Size':<6} {'Device':<8} {'BS':<4} "
                  f"{'P50(ms)':<9} {'P95(ms)':<9} {'P99(ms)':<9} "
                  f"{'FPS':<8} {'Mem(MB)':<10} {'Params(M)':<10} {'GFLOPs':<8}")
        print("=" * len(header))
        print("Speed Benchmark Results")
        print("=" * len(header))
        print(header)
        print("-" * len(header))

        for r in self.results:
            print(f"{r.model_name:<15} {r.input_size:<6} {r.device:<8} {r.batch_size:<4} "
                  f"{r.latency_p50_ms:<9.2f} {r.latency_p95_ms:<9.2f} {r.latency_p99_ms:<9.2f} "
                  f"{r.fps:<8.1f} {r.gpu_mem_mb:<10.1f} {r.num_params_m:<10.2f} {r.gflops:<8.2f}")

        print("=" * len(header))

    def to_markdown(self) -> str:
        """Generate Markdown benchmark table."""
        lines = [
            "| Model | Input | Device | BS | P50 (ms) | P95 (ms) | FPS | GPU Mem (MB) | Params (M) | GFLOPs |",
            "|-------|-------|--------|----|---------:|---------:|----:|-------------:|-----------:|-------:|",
        ]
        for r in self.results:
            lines.append(
                f"| {r.model_name} | {r.input_size} | {r.device} | {r.batch_size} | "
                f"{r.latency_p50_ms:.2f} | {r.latency_p95_ms:.2f} | {r.fps:.1f} | "
                f"{r.gpu_mem_mb:.1f} | {r.num_params_m:.2f} | {r.gflops:.2f} |"
            )
        return '\n'.join(lines)
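

# Example usage: a minimal sketch for exercising the harness end to end.
# The tiny Sequential model below is a stand-in, not one of the project's face
# detectors; substitute a real detector (e.g. the 'scrfd_34g' model named in
# the class docstring) to get meaningful numbers.
if __name__ == '__main__':
    import torch.nn as nn

    # Placeholder backbone so the benchmark can run without any model weights.
    toy_model = nn.Sequential(
        nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1),
    )

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    bench = SpeedBenchmark(device=device, warmup_iters=10, benchmark_iters=50)
    bench.benchmark_model(toy_model, 'toy_conv', input_size=320, batch_size=1)
    bench.print_results()
    print(bench.to_markdown())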