"""
Post-export optimization: quantization, TensorRT, and deployment helpers.
"""

import os
from typing import Optional

import torch
import torch.nn as nn


def quantize_model(model: nn.Module, calibration_data=None,
                   method: str = 'dynamic') -> nn.Module:
    """
    Quantize model for faster CPU inference.

    Args:
        model: PyTorch model
        calibration_data: DataLoader for static quantization calibration
        method: 'dynamic' (no calibration needed) or 'static'

    Returns:
        Quantized model
    """
    model.eval().cpu()

    if method == 'dynamic':
        # Dynamic quantization only converts nn.Linear (and RNN) modules;
        # nn.Conv2d entries in the spec are ignored by quantize_dynamic.
        quantized = torch.quantization.quantize_dynamic(
            model,
            {nn.Linear, nn.Conv2d},
            dtype=torch.qint8,
        )
        print("Dynamic quantization complete")
        return quantized
    elif method == 'static':
        if calibration_data is None:
            raise ValueError("Static quantization requires calibration_data")
        # Eager-mode static quantization assumes the model wraps its forward
        # pass in QuantStub/DeQuantStub so activations can be observed.
        model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
        prepared = torch.quantization.prepare(model, inplace=False)
        # Calibrate observers on representative data
        with torch.no_grad():
            for images, _ in calibration_data:
                prepared(images)
        quantized = torch.quantization.convert(prepared, inplace=False)
        print("Static quantization complete")
        return quantized
    else:
        raise ValueError(f"Unknown quantization method: {method}")
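

# Usage sketch (illustrative, not part of the original pipeline): how
# quantize_model is typically called for the dynamic path. The toy two-layer
# head and its shapes below are hypothetical.
def _example_dynamic_quantization() -> nn.Module:
    head = nn.Sequential(
        nn.Flatten(),
        nn.Linear(256, 128),
        nn.ReLU(),
        nn.Linear(128, 10),
    )
    quantized_head = quantize_model(head, method='dynamic')
    # Smoke-test the int8 linear kernels on CPU.
    with torch.no_grad():
        _ = quantized_head(torch.randn(1, 256))
    return quantized_head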


def benchmark_deployment(onnx_path: str, input_size: int = 640,
                         num_runs: int = 100) -> dict:
    """
    Benchmark ONNX model inference speed.

    Returns dict with latency and throughput stats.
    """
    import time
    import numpy as np

    try:
        import onnxruntime as ort
    except ImportError:
        return {"error": "onnxruntime not installed"}

    providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
    session = ort.InferenceSession(onnx_path, providers=providers)

    dummy = np.random.randn(1, 3, input_size, input_size).astype(np.float32)
    input_name = session.get_inputs()[0].name

    # Warmup
    for _ in range(20):
        session.run(None, {input_name: dummy})

    # Benchmark
    latencies = []
    for _ in range(num_runs):
        t0 = time.perf_counter()
        session.run(None, {input_name: dummy})
        latencies.append((time.perf_counter() - t0) * 1000)

    latencies = np.array(latencies)
    return {
        'onnx_path': onnx_path,
        'input_size': input_size,
        'latency_p50_ms': np.percentile(latencies, 50),
        'latency_p95_ms': np.percentile(latencies, 95),
        'fps': 1000 / np.mean(latencies),
        'provider': session.get_providers()[0],
    }
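

# Usage sketch (illustrative): wiring the benchmark helper into a CLI or
# notebook report. The ONNX path below is a placeholder, not a shipped file.
def _example_benchmark_report(onnx_path: str = 'scrfd_34g.onnx') -> None:
    stats = benchmark_deployment(onnx_path, input_size=640, num_runs=200)
    if 'error' in stats:
        print(stats['error'])
        return
    print(f"{stats['provider']}: "
          f"p50={stats['latency_p50_ms']:.2f} ms, "
          f"p95={stats['latency_p95_ms']:.2f} ms, "
          f"{stats['fps']:.1f} FPS")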


# ──────────────────────── TensorRT Guide ────────────────────────
TENSORRT_GUIDE = """
# TensorRT Optimization Guide for SCRFD

## Prerequisites

    pip install tensorrt  # or use NVIDIA TensorRT container

## Convert ONNX to TensorRT Engine

### FP16 (recommended for production GPU deployment)

    trtexec --onnx=scrfd_34g.onnx \\
        --saveEngine=scrfd_34g_fp16.engine \\
        --fp16 \\
        --workspace=4096 \\
        --minShapes=input:1x3x640x640 \\
        --optShapes=input:1x3x640x640 \\
        --maxShapes=input:8x3x640x640

### INT8 (fastest, requires calibration)

    trtexec --onnx=scrfd_34g.onnx \\
        --saveEngine=scrfd_34g_int8.engine \\
        --int8 \\
        --calib=calibration_cache.bin \\
        --workspace=4096

### Dynamic batch size

    trtexec --onnx=scrfd_34g.onnx \\
        --saveEngine=scrfd_34g_dynamic.engine \\
        --fp16 \\
        --minShapes=input:1x3x640x640 \\
        --optShapes=input:4x3x640x640 \\
        --maxShapes=input:16x3x640x640

## Expected Speedups (V100)

| Model      | PyTorch FP32 | ONNX Runtime | TensorRT FP16 | TensorRT INT8 |
|------------|--------------|--------------|---------------|---------------|
| SCRFD-34G  | ~80 FPS      | ~100 FPS     | ~200 FPS      | ~350 FPS      |
| SCRFD-2.5G | ~400 FPS     | ~500 FPS     | ~800 FPS      | ~1200 FPS     |
| SCRFD-0.5G | ~1000 FPS    | ~1200 FPS    | ~2000 FPS     | ~3000 FPS     |

## INT8 Calibration

Use the calibration script:

    python scripts/tensorrt_calibrate.py \\
        --onnx scrfd_34g.onnx \\
        --data-root data/wider_face \\
        --num-images 500

## Deployment with Triton Inference Server

See configs/triton/ for model repository configuration.
"""
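

# Hedged sketch: this helper is an assumption, not part of the original module.
# It assumes an onnxruntime-gpu build that ships the TensorRT execution
# provider; instead of building a standalone engine with trtexec, the exported
# ONNX file is delegated to TensorRT at session-creation time, with CUDA/CPU
# fallback. The 'trt_fp16_enable' option mirrors the --fp16 flag in the guide.
def create_tensorrt_session(onnx_path: str):
    import onnxruntime as ort
    providers = [
        ('TensorrtExecutionProvider', {'trt_fp16_enable': True}),
        'CUDAExecutionProvider',
        'CPUExecutionProvider',
    ]
    return ort.InferenceSession(onnx_path, providers=providers)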