Upload evaluation/speed_benchmark.py with huggingface_hub
evaluation/speed_benchmark.py +167 -0
evaluation/speed_benchmark.py
ADDED
@@ -0,0 +1,167 @@
"""
Speed benchmark: measure inference speed across hardware/configurations.

Reports:
- Latency (ms per frame) at p50/p95/p99
- Throughput (FPS)
- GPU memory usage
- Comparison across input resolutions and model variants
"""

import time
from dataclasses import dataclass
from typing import List

import numpy as np
import torch


@dataclass
class BenchmarkResult:
    """Single benchmark measurement."""
    model_name: str
    input_size: int
    device: str
    batch_size: int
    latency_p50_ms: float
    latency_p95_ms: float
    latency_p99_ms: float
    fps: float
    gpu_mem_mb: float
    num_params_m: float
    gflops: float


class SpeedBenchmark:
    """
    Inference speed benchmark for face detection models.

    Usage:
        bench = SpeedBenchmark(device='cuda')
        result = bench.benchmark_model(model, 'scrfd_34g', input_size=640)
        bench.print_results()
    """

    def __init__(self, device: str = 'cuda', warmup_iters: int = 50,
                 benchmark_iters: int = 200):
        self.device = device
        self.warmup_iters = warmup_iters
        self.benchmark_iters = benchmark_iters
        self.results: List[BenchmarkResult] = []

    @torch.no_grad()
    def benchmark_model(self, model: torch.nn.Module, model_name: str,
                        input_size: int = 640, batch_size: int = 1) -> BenchmarkResult:
        """
        Benchmark a model's inference speed.

        Args:
            model: PyTorch model in eval mode
            model_name: Name for reporting
            input_size: Input image resolution
            batch_size: Batch size for benchmarking

        Returns:
            BenchmarkResult with timing statistics
        """
        model = model.to(self.device).eval()
        dummy_input = torch.randn(batch_size, 3, input_size, input_size,
                                  device=self.device)

        # Count parameters (millions)
        num_params = sum(p.numel() for p in model.parameters()) / 1e6

        # Estimate GFLOPs (uses torch's FLOP counter if available)
        gflops = self._estimate_flops(model, dummy_input)

        # Reset peak-memory tracking so only this run is measured
        if self.device == 'cuda':
            torch.cuda.reset_peak_memory_stats()
            torch.cuda.synchronize()

        # Warmup (lets cuDNN pick algorithms, fills caches)
        for _ in range(self.warmup_iters):
            _ = model(dummy_input)
        if self.device == 'cuda':
            torch.cuda.synchronize()

        # Timed runs: synchronize around each forward pass so GPU kernels
        # are fully included in the measured interval
        latencies = []
        for _ in range(self.benchmark_iters):
            if self.device == 'cuda':
                torch.cuda.synchronize()
            t0 = time.perf_counter()
            _ = model(dummy_input)
            if self.device == 'cuda':
                torch.cuda.synchronize()
            latencies.append((time.perf_counter() - t0) * 1000)  # ms

        latencies = np.array(latencies)
        gpu_mem = 0.0
        if self.device == 'cuda':
            gpu_mem = torch.cuda.max_memory_allocated() / 1e6

        result = BenchmarkResult(
            model_name=model_name,
            input_size=input_size,
            device=self.device,
            batch_size=batch_size,
            latency_p50_ms=float(np.percentile(latencies, 50)),
            latency_p95_ms=float(np.percentile(latencies, 95)),
            latency_p99_ms=float(np.percentile(latencies, 99)),
            fps=1000 / np.mean(latencies) * batch_size,
            gpu_mem_mb=gpu_mem,
            num_params_m=num_params,
            gflops=gflops,
        )

        self.results.append(result)
        return result

    def _estimate_flops(self, model: torch.nn.Module,
                        dummy_input: torch.Tensor) -> float:
        """Estimate GFLOPs (approximate; returns 0.0 if the FLOP counter is unavailable)."""
        try:
            from torch.utils.flop_counter import FlopCounterMode
            flop_counter = FlopCounterMode(display=False)
            with flop_counter:
                model(dummy_input)
            return flop_counter.get_total_flops() / 1e9
        except Exception:
            return 0.0

    def print_results(self):
        """Print formatted benchmark results table."""
        if not self.results:
            print("No benchmark results yet.")
            return

        header = (f"{'Model':<15} {'Size':<6} {'Device':<8} {'BS':<4} "
                  f"{'P50(ms)':<9} {'P95(ms)':<9} {'P99(ms)':<9} "
                  f"{'FPS':<8} {'Mem(MB)':<10} {'Params(M)':<10} {'GFLOPs':<8}")
        print("=" * len(header))
        print("Speed Benchmark Results")
        print("=" * len(header))
        print(header)
        print("-" * len(header))

        for r in self.results:
            print(f"{r.model_name:<15} {r.input_size:<6} {r.device:<8} {r.batch_size:<4} "
                  f"{r.latency_p50_ms:<9.2f} {r.latency_p95_ms:<9.2f} {r.latency_p99_ms:<9.2f} "
                  f"{r.fps:<8.1f} {r.gpu_mem_mb:<10.1f} {r.num_params_m:<10.2f} {r.gflops:<8.2f}")

        print("=" * len(header))

    def to_markdown(self) -> str:
        """Generate a Markdown benchmark table."""
        lines = [
            "| Model | Input | Device | BS | P50 (ms) | P95 (ms) | FPS | GPU Mem (MB) | Params (M) | GFLOPs |",
            "|-------|-------|--------|----|---------:|---------:|----:|-------------:|-----------:|-------:|",
        ]
        for r in self.results:
            lines.append(
                f"| {r.model_name} | {r.input_size} | {r.device} | {r.batch_size} | "
                f"{r.latency_p50_ms:.2f} | {r.latency_p95_ms:.2f} | {r.fps:.1f} | "
                f"{r.gpu_mem_mb:.1f} | {r.num_params_m:.2f} | {r.gflops:.2f} |"
            )
        return '\n'.join(lines)
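For reference, a minimal driver sketch (not part of this upload) shows how the class is meant to be used end to end. It assumes the repository root is on PYTHONPATH so the file imports as evaluation.speed_benchmark, and it uses torchvision's resnet18 purely as a stand-in model; the scrfd_34g name in the docstring is illustrative and no face detector is constructed here.

# run_speed_benchmark.py (hypothetical helper, not included in this commit)
import torch
from torchvision.models import resnet18

from evaluation.speed_benchmark import SpeedBenchmark

device = 'cuda' if torch.cuda.is_available() else 'cpu'
bench = SpeedBenchmark(device=device, warmup_iters=10, benchmark_iters=50)

model = resnet18()  # stand-in backbone; swap in an actual detector
for size in (320, 640):  # compare two input resolutions
    bench.benchmark_model(model, 'resnet18_stub', input_size=size)

bench.print_results()        # console table
print(bench.to_markdown())   # Markdown table, e.g. for a model card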