Upload deploy/optimize.py with huggingface_hub
Browse files- deploy/optimize.py +150 -0
deploy/optimize.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Post-export optimization: quantization, TensorRT, and deployment helpers.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def quantize_model(model: nn.Module, calibration_data=None,
                   method: str = 'dynamic') -> nn.Module:
    """
    Quantize model for faster CPU inference.

    The input model is switched to eval mode and moved to the CPU in place;
    the quantized network is returned as a separate module. Arguments are
    validated before the model is touched, so a rejected call leaves the
    caller's model exactly as it was.

    Args:
        model: PyTorch model
        calibration_data: Iterable of calibration batches, required when
            ``method='static'``. Each batch is either the images themselves
            or a tuple/list whose first element is the images
            (e.g. ``(images, targets)``).
        method: 'dynamic' (no calibration needed) or 'static'

    Returns:
        Quantized model

    Raises:
        ValueError: If ``method`` is not 'dynamic' or 'static', or if
            ``method='static'`` is requested without ``calibration_data``.
    """
    import copy

    # Validate first: a bad call must not flip the caller's model to
    # eval mode or move it between devices.
    if method not in ('dynamic', 'static'):
        raise ValueError(f"Unknown quantization method: {method}")
    if method == 'static' and calibration_data is None:
        raise ValueError("Static quantization requires calibration_data")

    model.eval().cpu()

    if method == 'dynamic':
        # NOTE(review): PyTorch's dynamic quantization is documented mainly
        # for Linear/recurrent layers; confirm the Conv2d entry has any effect.
        quantized = torch.quantization.quantize_dynamic(
            model,
            {nn.Linear, nn.Conv2d},
            dtype=torch.qint8,
        )
        print("Dynamic quantization complete")
        return quantized

    # Static quantization. Work on a private copy so the qconfig attribute
    # (and the observers inserted by prepare) never leak onto the caller's
    # model.
    working = copy.deepcopy(model)
    working.qconfig = torch.quantization.get_default_qconfig('fbgemm')
    prepared = torch.quantization.prepare(working, inplace=True)

    # Calibrate: run representative batches through the observed model.
    with torch.no_grad():
        for batch in calibration_data:
            # Accept both (images, ...) batches and bare image tensors;
            # unpacking a bare tensor would split it along dim 0.
            images = batch[0] if isinstance(batch, (tuple, list)) else batch
            if torch.is_tensor(images):
                # The model was moved to the CPU above; keep inputs with it.
                images = images.cpu()
            prepared(images)

    quantized = torch.quantization.convert(prepared, inplace=False)
    print("Static quantization complete")
    return quantized
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def benchmark_deployment(onnx_path: str, input_size: int = 640,
                         num_runs: int = 100, warmup_runs: int = 20) -> dict:
    """
    Benchmark ONNX model inference speed.

    A single random float32 input of shape (1, 3, input_size, input_size)
    is fed to the model's first input. After ``warmup_runs`` untimed runs,
    ``num_runs`` runs are timed individually.

    Args:
        onnx_path: Path to the ONNX model file.
        input_size: Height and width of the square dummy input.
        num_runs: Number of timed inference runs (must be >= 1).
        warmup_runs: Number of untimed runs executed before timing
            (must be >= 0).

    Returns:
        Dict with latency and throughput stats: 'onnx_path', 'input_size',
        'latency_p50_ms', 'latency_p95_ms', 'fps' (1000 / mean latency in
        ms) and 'provider' (first provider reported by the session).
        If onnxruntime cannot be imported, returns
        ``{"error": "onnxruntime not installed"}`` instead.

    Raises:
        ValueError: If ``input_size`` or ``num_runs`` is below 1, or
            ``warmup_runs`` is negative.
    """
    import time
    import numpy as np

    # Reject degenerate settings up front: with zero timed runs the
    # statistics below would be computed over an empty array.
    if input_size < 1:
        raise ValueError(f"input_size must be >= 1, got {input_size}")
    if num_runs < 1:
        raise ValueError(f"num_runs must be >= 1, got {num_runs}")
    if warmup_runs < 0:
        raise ValueError(f"warmup_runs must be >= 0, got {warmup_runs}")

    try:
        import onnxruntime as ort
    except ImportError:
        return {"error": "onnxruntime not installed"}

    # Providers are listed in order of preference.
    providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
    session = ort.InferenceSession(onnx_path, providers=providers)

    dummy = np.random.randn(1, 3, input_size, input_size).astype(np.float32)
    input_name = session.get_inputs()[0].name
    feed = {input_name: dummy}

    # Warmup (not timed)
    for _ in range(warmup_runs):
        session.run(None, feed)

    # Benchmark: per-run wall-clock latency in milliseconds
    latencies = []
    for _ in range(num_runs):
        t0 = time.perf_counter()
        session.run(None, feed)
        latencies.append((time.perf_counter() - t0) * 1000)

    latencies = np.array(latencies)

    return {
        'onnx_path': onnx_path,
        'input_size': input_size,
        'latency_p50_ms': np.percentile(latencies, 50),
        'latency_p95_ms': np.percentile(latencies, 95),
        'fps': 1000 / np.mean(latencies),
        'provider': session.get_providers()[0],
    }
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
# ──────────────────────── TensorRT Guide ────────────────────────
|
| 101 |
+
|
| 102 |
+
TENSORRT_GUIDE = """
|
| 103 |
+
# TensorRT Optimization Guide for SCRFD
|
| 104 |
+
|
| 105 |
+
## Prerequisites
|
| 106 |
+
pip install tensorrt # or use NVIDIA TensorRT container
|
| 107 |
+
|
| 108 |
+
## Convert ONNX to TensorRT Engine
|
| 109 |
+
|
| 110 |
+
### FP16 (recommended for production GPU deployment)
|
| 111 |
+
trtexec --onnx=scrfd_34g.onnx \\
|
| 112 |
+
--saveEngine=scrfd_34g_fp16.engine \\
|
| 113 |
+
--fp16 \\
|
| 114 |
+
--workspace=4096 \\
|
| 115 |
+
--minShapes=input:1x3x640x640 \\
|
| 116 |
+
--optShapes=input:1x3x640x640 \\
|
| 117 |
+
--maxShapes=input:8x3x640x640
|
| 118 |
+
|
| 119 |
+
### INT8 (fastest, requires calibration)
|
| 120 |
+
trtexec --onnx=scrfd_34g.onnx \\
|
| 121 |
+
--saveEngine=scrfd_34g_int8.engine \\
|
| 122 |
+
--int8 \\
|
| 123 |
+
--calib=calibration_cache.bin \\
|
| 124 |
+
--workspace=4096
|
| 125 |
+
|
| 126 |
+
### Dynamic batch size
|
| 127 |
+
trtexec --onnx=scrfd_34g.onnx \\
|
| 128 |
+
--saveEngine=scrfd_34g_dynamic.engine \\
|
| 129 |
+
--fp16 \\
|
| 130 |
+
--minShapes=input:1x3x640x640 \\
|
| 131 |
+
--optShapes=input:4x3x640x640 \\
|
| 132 |
+
--maxShapes=input:16x3x640x640
|
| 133 |
+
|
| 134 |
+
## Expected Speedups (V100)
|
| 135 |
+
| Model | PyTorch FP32 | ONNX Runtime | TensorRT FP16 | TensorRT INT8 |
|
| 136 |
+
|------------|-------------|-------------|----------------|---------------|
|
| 137 |
+
| SCRFD-34G | ~80 FPS | ~100 FPS | ~200 FPS | ~350 FPS |
|
| 138 |
+
| SCRFD-2.5G | ~400 FPS | ~500 FPS | ~800 FPS | ~1200 FPS |
|
| 139 |
+
| SCRFD-0.5G | ~1000 FPS | ~1200 FPS | ~2000 FPS | ~3000 FPS |
|
| 140 |
+
|
| 141 |
+
## INT8 Calibration
|
| 142 |
+
Use the calibration script:
|
| 143 |
+
python scripts/tensorrt_calibrate.py \\
|
| 144 |
+
--onnx scrfd_34g.onnx \\
|
| 145 |
+
--data-root data/wider_face \\
|
| 146 |
+
--num-images 500
|
| 147 |
+
|
| 148 |
+
## Deployment with Triton Inference Server
|
| 149 |
+
See configs/triton/ for model repository configuration.
|
| 150 |
+
"""
|