cledouxluma committed
Commit f7417f1 · verified · 1 Parent(s): 11246bf

Upload evaluation/speed_benchmark.py with huggingface_hub

Files changed (1)
  1. evaluation/speed_benchmark.py +167 -0
evaluation/speed_benchmark.py ADDED
@@ -0,0 +1,167 @@
+ """
+ Speed Benchmark — Measure inference speed across hardware/configs.
+
+ Reports:
+ - Latency (ms per frame) at p50/p95/p99
+ - Throughput (FPS)
+ - GPU memory usage
+ - Comparison across input resolutions and model variants
+ """
+
+ import time
+ import numpy as np
+ from typing import List
+ from dataclasses import dataclass
+
+ import torch
+
+
+ @dataclass
+ class BenchmarkResult:
+     """Single benchmark measurement."""
+     model_name: str
+     input_size: int
+     device: str
+     batch_size: int
+     latency_p50_ms: float
+     latency_p95_ms: float
+     latency_p99_ms: float
+     fps: float
+     gpu_mem_mb: float
+     num_params_m: float
+     gflops: float
+
+
+ class SpeedBenchmark:
+     """
+     Inference speed benchmark for face detection models.
+
+     Usage:
+         bench = SpeedBenchmark(device='cuda')
+         result = bench.benchmark_model(model, 'scrfd_34g', input_size=640)
+         bench.print_results()
+     """
+
+     def __init__(self, device: str = 'cuda', warmup_iters: int = 50,
+                  benchmark_iters: int = 200):
+         self.device = device
+         self.warmup_iters = warmup_iters
+         self.benchmark_iters = benchmark_iters
+         self.results: List[BenchmarkResult] = []
+
+     @torch.no_grad()
+     def benchmark_model(self, model: torch.nn.Module, model_name: str,
+                         input_size: int = 640, batch_size: int = 1) -> BenchmarkResult:
+         """
+         Benchmark a model's inference speed.
+
+         Args:
+             model: PyTorch model in eval mode
+             model_name: Name for reporting
+             input_size: Input image resolution
+             batch_size: Batch size for benchmarking
+
+         Returns:
+             BenchmarkResult with timing statistics
+         """
+         model = model.to(self.device).eval()
+         dummy_input = torch.randn(batch_size, 3, input_size, input_size,
+                                   device=self.device)
+
+         # Count parameters
+         num_params = sum(p.numel() for p in model.parameters()) / 1e6
+
+         # Estimate GFLOPs (via torch's FLOP counter, if available)
+         gflops = self._estimate_flops(model, dummy_input)
+
+         # Reset peak GPU memory stats before measuring
+         on_cuda = self.device.startswith('cuda')
+         if on_cuda:
+             torch.cuda.reset_peak_memory_stats()
+             torch.cuda.synchronize()
+
+         # Warmup
+         for _ in range(self.warmup_iters):
+             _ = model(dummy_input)
+         if on_cuda:
+             torch.cuda.synchronize()
+
+         # Benchmark
+         latencies = []
+         for _ in range(self.benchmark_iters):
+             if on_cuda:
+                 torch.cuda.synchronize()
+             t0 = time.perf_counter()
+             _ = model(dummy_input)
+             if on_cuda:
+                 torch.cuda.synchronize()
+             latencies.append((time.perf_counter() - t0) * 1000)  # ms
+
+         latencies = np.array(latencies)
+         gpu_mem = 0.0
+         if on_cuda:
+             gpu_mem = torch.cuda.max_memory_allocated() / 1e6
+
+         result = BenchmarkResult(
+             model_name=model_name,
+             input_size=input_size,
+             device=self.device,
+             batch_size=batch_size,
+             latency_p50_ms=np.percentile(latencies, 50),
+             latency_p95_ms=np.percentile(latencies, 95),
+             latency_p99_ms=np.percentile(latencies, 99),
+             fps=1000 / np.mean(latencies) * batch_size,
+             gpu_mem_mb=gpu_mem,
+             num_params_m=num_params,
+             gflops=gflops,
+         )
+
+         self.results.append(result)
+         return result
+
+     def _estimate_flops(self, model: torch.nn.Module,
+                         dummy_input: torch.Tensor) -> float:
+         """Estimate GFLOPs (approximate)."""
+         try:
+             from torch.utils.flop_counter import FlopCounterMode
+             flop_counter = FlopCounterMode(display=False)
+             with flop_counter:
+                 model(dummy_input)
+             return flop_counter.get_total_flops() / 1e9
+         except Exception:  # FlopCounterMode missing (older torch) or unsupported ops
+             return 0.0
+
+     def print_results(self):
+         """Print formatted benchmark results table."""
+         if not self.results:
+             print("No benchmark results yet.")
+             return
+
+         header = (f"{'Model':<15} {'Size':<6} {'Device':<8} {'BS':<4} "
+                   f"{'P50(ms)':<9} {'P95(ms)':<9} {'P99(ms)':<9} "
+                   f"{'FPS':<8} {'Mem(MB)':<10} {'Params(M)':<10} {'GFLOPs':<8}")
+         print("=" * len(header))
+         print("Speed Benchmark Results")
+         print("=" * len(header))
+         print(header)
+         print("-" * len(header))
+
+         for r in self.results:
+             print(f"{r.model_name:<15} {r.input_size:<6} {r.device:<8} {r.batch_size:<4} "
+                   f"{r.latency_p50_ms:<9.2f} {r.latency_p95_ms:<9.2f} {r.latency_p99_ms:<9.2f} "
+                   f"{r.fps:<8.1f} {r.gpu_mem_mb:<10.1f} {r.num_params_m:<10.2f} {r.gflops:<8.2f}")
+
+         print("=" * len(header))
+
+     def to_markdown(self) -> str:
+         """Generate Markdown benchmark table."""
+         lines = [
+             "| Model | Input | Device | BS | P50 (ms) | P95 (ms) | FPS | GPU Mem (MB) | Params (M) | GFLOPs |",
+             "|-------|-------|--------|----|---------:|---------:|----:|-------------:|-----------:|-------:|",
+         ]
+         for r in self.results:
+             lines.append(
+                 f"| {r.model_name} | {r.input_size} | {r.device} | {r.batch_size} | "
+                 f"{r.latency_p50_ms:.2f} | {r.latency_p95_ms:.2f} | {r.fps:.1f} | "
+                 f"{r.gpu_mem_mb:.1f} | {r.num_params_m:.2f} | {r.gflops:.2f} |"
+             )
+         return '\n'.join(lines)
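
For context, a minimal usage sketch (not part of the uploaded file), assuming torchvision is installed, the repo root is on PYTHONPATH, and resnet18 stands in for the face detector under test; any torch.nn.Module that accepts an NCHW image tensor works the same way:

import torch
import torchvision

from evaluation.speed_benchmark import SpeedBenchmark

# Pick a device; fall back to CPU when no GPU is present.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# resnet18 is a stand-in; substitute the actual detector model.
model = torchvision.models.resnet18().eval()

bench = SpeedBenchmark(device=device, warmup_iters=10, benchmark_iters=50)
for size in (320, 480, 640):
    bench.benchmark_model(model, 'resnet18_standin', input_size=size)

bench.print_results()       # fixed-width console table
print(bench.to_markdown())  # Markdown table for READMEs

Note that CUDA kernel launches are asynchronous, which is why the benchmark loop brackets every forward pass with torch.cuda.synchronize(); without it, time.perf_counter() would mostly measure launch overhead rather than actual inference latency.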