"""
Post-export optimization: quantization, TensorRT, and deployment helpers.
"""

import torch
import torch.nn as nn


def quantize_model(model: nn.Module, calibration_data=None,
                   method: str = 'dynamic') -> nn.Module:
    """
    Quantize model for faster CPU inference.

    Args:
        model: PyTorch model
        calibration_data: DataLoader for static quantization calibration
        method: 'dynamic' (no calibration needed) or 'static'

    Returns:
        Quantized model
    """
    model.eval().cpu()

    if method == 'dynamic':
        # Dynamic quantization in PyTorch only covers nn.Linear (plus RNN and
        # embedding layers); conv layers stay in fp32, so on a conv-heavy
        # detector the speedup comes mainly from linear layers in the head.
        quantized = torch.quantization.quantize_dynamic(
            model,
            {nn.Linear},
            dtype=torch.qint8,
        )
        print("Dynamic quantization complete")
        return quantized

    elif method == 'static':
        if calibration_data is None:
            raise ValueError("Static quantization requires calibration_data")

        # Eager-mode static quantization expects the model to contain
        # QuantStub/DeQuantStub pairs around the quantized region; without
        # them the converted model will not accept float inputs at runtime.
        model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
        prepared = torch.quantization.prepare(model, inplace=False)

        # Calibrate
        with torch.no_grad():
            for images, _ in calibration_data:
                prepared(images)

        quantized = torch.quantization.convert(prepared, inplace=False)
        print("Static quantization complete")
        return quantized

    else:
        raise ValueError(f"Unknown quantization method: {method}")

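
# Illustrative usage sketch for quantize_model. The torchvision ResNet below
# is a stand-in assumption for the exported detector; any eval-mode nn.Module
# is handled the same way.
def _demo_quantize_dynamic():
    import torchvision  # assumed available; used only in this demo

    model = torchvision.models.resnet18(weights=None)
    quantized = quantize_model(model, method='dynamic')

    # Only nn.Linear layers are swapped for int8 dynamic counterparts;
    # the forward-pass API is unchanged.
    with torch.no_grad():
        out = quantized(torch.randn(1, 3, 224, 224))
    print(out.shape)  # torch.Size([1, 1000])
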

def benchmark_deployment(onnx_path: str, input_size: int = 640,
                         num_runs: int = 100) -> dict:
    """
    Benchmark ONNX model inference speed.

    Returns dict with latency and throughput stats.
    """
    import time
    import numpy as np

    try:
        import onnxruntime as ort
    except ImportError:
        return {"error": "onnxruntime not installed"}

    # Prefer CUDA when the GPU build of onnxruntime is installed,
    # otherwise fall back to CPU.
    available = ort.get_available_providers()
    providers = [p for p in ('CUDAExecutionProvider', 'CPUExecutionProvider')
                 if p in available]
    session = ort.InferenceSession(onnx_path, providers=providers)

    dummy = np.random.randn(1, 3, input_size, input_size).astype(np.float32)
    input_name = session.get_inputs()[0].name

    # Warmup
    for _ in range(20):
        session.run(None, {input_name: dummy})

    # Benchmark
    latencies = []
    for _ in range(num_runs):
        t0 = time.perf_counter()
        session.run(None, {input_name: dummy})
        latencies.append((time.perf_counter() - t0) * 1000)

    latencies = np.array(latencies)

    # Cast numpy scalars to plain floats so the stats serialize cleanly.
    return {
        'onnx_path': onnx_path,
        'input_size': input_size,
        'latency_p50_ms': float(np.percentile(latencies, 50)),
        'latency_p95_ms': float(np.percentile(latencies, 95)),
        'fps': float(1000 / np.mean(latencies)),
        'provider': session.get_providers()[0],
    }


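# Sketch of an in-process PyTorch counterpart to benchmark_deployment, useful
# for comparing fp32 vs dynamically quantized CPU latency. The helper name and
# run counts are illustrative, not part of this repo's API.
def _cpu_latency_ms(model: nn.Module, input_size: int = 640,
                    num_runs: int = 20) -> float:
    import time

    model.eval().cpu()
    dummy = torch.randn(1, 3, input_size, input_size)
    with torch.no_grad():
        for _ in range(5):  # warmup
            model(dummy)
        t0 = time.perf_counter()
        for _ in range(num_runs):
            model(dummy)
    return (time.perf_counter() - t0) * 1000 / num_runs

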
# ──────────────────────── TensorRT Guide ────────────────────────

TENSORRT_GUIDE = """
# TensorRT Optimization Guide for SCRFD

## Prerequisites
pip install tensorrt  # or use an NVIDIA TensorRT container
Note: on TensorRT >= 8.4, --workspace is deprecated; use
--memPoolSize=workspace:4096 (MiB) in the commands below instead.

## Convert ONNX to TensorRT Engine

### FP16 (recommended for production GPU deployment)
trtexec --onnx=scrfd_34g.onnx \\
        --saveEngine=scrfd_34g_fp16.engine \\
        --fp16 \\
        --workspace=4096 \\
        --minShapes=input:1x3x640x640 \\
        --optShapes=input:1x3x640x640 \\
        --maxShapes=input:8x3x640x640

### INT8 (fastest, requires calibration)
trtexec --onnx=scrfd_34g.onnx \\
        --saveEngine=scrfd_34g_int8.engine \\
        --int8 \\
        --calib=calibration_cache.bin \\
        --workspace=4096

### Dynamic batch size
trtexec --onnx=scrfd_34g.onnx \\
        --saveEngine=scrfd_34g_dynamic.engine \\
        --fp16 \\
        --minShapes=input:1x3x640x640 \\
        --optShapes=input:4x3x640x640 \\
        --maxShapes=input:16x3x640x640

## Expected Speedups (V100)
| Model      | PyTorch FP32 | ONNX Runtime | TensorRT FP16 | TensorRT INT8 |
|------------|--------------|--------------|---------------|---------------|
| SCRFD-34G  | ~80 FPS      | ~100 FPS     | ~200 FPS      | ~350 FPS      |
| SCRFD-2.5G | ~400 FPS     | ~500 FPS     | ~800 FPS      | ~1200 FPS     |
| SCRFD-0.5G | ~1000 FPS    | ~1200 FPS    | ~2000 FPS     | ~3000 FPS     |

## INT8 Calibration
Use the calibration script:
  python scripts/tensorrt_calibrate.py \\
      --onnx scrfd_34g.onnx \\
      --data-root data/wider_face \\
      --num-images 500

## Deployment with Triton Inference Server
See configs/triton/ for model repository configuration.
"""