"""
Speed Benchmark — Measure inference speed across hardware/configs.
Reports:
- Latency (ms per frame) at p50/p95/p99
- Throughput (FPS)
- GPU memory usage
- Comparison across input resolutions and model variants
"""
import time
import numpy as np
from typing import List
from dataclasses import dataclass
import torch
@dataclass
class BenchmarkResult:
"""Single benchmark measurement."""
model_name: str
input_size: int
device: str
batch_size: int
latency_p50_ms: float
latency_p95_ms: float
latency_p99_ms: float
fps: float
gpu_mem_mb: float
num_params_m: float
gflops: float
class SpeedBenchmark:
"""
Inference speed benchmark for face detection models.
Usage:
bench = SpeedBenchmark(device='cuda')
result = bench.benchmark_model(model, 'scrfd_34g', input_size=640)
bench.print_results()
"""
def __init__(self, device: str = 'cuda', warmup_iters: int = 50,
benchmark_iters: int = 200):
self.device = device
self.warmup_iters = warmup_iters
self.benchmark_iters = benchmark_iters
self.results: List[BenchmarkResult] = []
@torch.no_grad()
def benchmark_model(self, model: torch.nn.Module, model_name: str,
input_size: int = 640, batch_size: int = 1) -> BenchmarkResult:
"""
Benchmark a model's inference speed.
Args:
model: PyTorch model in eval mode
model_name: Name for reporting
            input_size: Input image resolution (square side, in pixels)
batch_size: Batch size for benchmarking
Returns:
BenchmarkResult with timing statistics
"""
model = model.to(self.device).eval()
dummy_input = torch.randn(batch_size, 3, input_size, input_size,
device=self.device)
# Count parameters
num_params = sum(p.numel() for p in model.parameters()) / 1e6
        # Estimate GFLOPs (using torch's FLOP counter if available)
gflops = self._estimate_flops(model, dummy_input)
# GPU memory before
if self.device == 'cuda':
torch.cuda.reset_peak_memory_stats()
torch.cuda.synchronize()
# Warmup
for _ in range(self.warmup_iters):
_ = model(dummy_input)
if self.device == 'cuda':
torch.cuda.synchronize()
# Benchmark
latencies = []
for _ in range(self.benchmark_iters):
if self.device == 'cuda':
torch.cuda.synchronize()
t0 = time.perf_counter()
_ = model(dummy_input)
if self.device == 'cuda':
torch.cuda.synchronize()
latencies.append((time.perf_counter() - t0) * 1000) # ms
latencies = np.array(latencies)
        gpu_mem = 0.0
if self.device == 'cuda':
gpu_mem = torch.cuda.max_memory_allocated() / 1e6
result = BenchmarkResult(
model_name=model_name,
input_size=input_size,
device=self.device,
batch_size=batch_size,
latency_p50_ms=np.percentile(latencies, 50),
latency_p95_ms=np.percentile(latencies, 95),
latency_p99_ms=np.percentile(latencies, 99),
fps=1000 / np.mean(latencies) * batch_size,
gpu_mem_mb=gpu_mem,
num_params_m=num_params,
gflops=gflops,
)
self.results.append(result)
return result
def _estimate_flops(self, model: torch.nn.Module,
dummy_input: torch.Tensor) -> float:
"""Estimate GFLOPs (approximate)."""
try:
from torch.utils.flop_counter import FlopCounterMode
flop_counter = FlopCounterMode(display=False)
with flop_counter:
model(dummy_input)
return flop_counter.get_total_flops() / 1e9
        except Exception:  # FLOP counter unavailable or counting failed
return 0.0
def print_results(self):
"""Print formatted benchmark results table."""
if not self.results:
print("No benchmark results yet.")
return
header = (f"{'Model':<15} {'Size':<6} {'Device':<8} {'BS':<4} "
f"{'P50(ms)':<9} {'P95(ms)':<9} {'P99(ms)':<9} "
f"{'FPS':<8} {'Mem(MB)':<10} {'Params(M)':<10} {'GFLOPs':<8}")
print("=" * len(header))
print("Speed Benchmark Results")
print("=" * len(header))
print(header)
print("-" * len(header))
for r in self.results:
print(f"{r.model_name:<15} {r.input_size:<6} {r.device:<8} {r.batch_size:<4} "
f"{r.latency_p50_ms:<9.2f} {r.latency_p95_ms:<9.2f} {r.latency_p99_ms:<9.2f} "
f"{r.fps:<8.1f} {r.gpu_mem_mb:<10.1f} {r.num_params_m:<10.2f} {r.gflops:<8.2f}")
print("=" * len(header))
def to_markdown(self) -> str:
"""Generate Markdown benchmark table."""
lines = [
"| Model | Input | Device | BS | P50 (ms) | P95 (ms) | FPS | GPU Mem (MB) | Params (M) | GFLOPs |",
"|-------|-------|--------|----|---------:|---------:|----:|-------------:|-----------:|-------:|",
]
for r in self.results:
lines.append(
f"| {r.model_name} | {r.input_size} | {r.device} | {r.batch_size} | "
f"{r.latency_p50_ms:.2f} | {r.latency_p95_ms:.2f} | {r.fps:.1f} | "
f"{r.gpu_mem_mb:.1f} | {r.num_params_m:.2f} | {r.gflops:.2f} |"
)
return '\n'.join(lines)
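

if __name__ == '__main__':
    # Minimal driver sketch showing the comparison across input resolutions
    # promised in the module docstring. The tiny Sequential model below is a
    # placeholder assumption, not a real face detector; swap in your own
    # detector (e.g. a SCRFD variant) and its name when collecting real numbers.
    def build_placeholder_model() -> torch.nn.Module:
        return torch.nn.Sequential(
            torch.nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1),
        )

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    bench = SpeedBenchmark(device=device, warmup_iters=10, benchmark_iters=50)
    model = build_placeholder_model()
    for size in (320, 640, 1280):
        bench.benchmark_model(model, 'placeholder', input_size=size)
    bench.print_results()
    print(bench.to_markdown())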