cledouxluma committed on
Commit
20cc740
·
verified ·
1 Parent(s): 3be24fc

Upload deploy/optimize.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. deploy/optimize.py +150 -0
deploy/optimize.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Post-export optimization: quantization, TensorRT, and deployment helpers.
3
+ """
4
+
5
+ import os
6
+ from typing import Optional
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+
12
def quantize_model(model: nn.Module, calibration_data=None,
                   method: str = 'dynamic') -> nn.Module:
    """
    Quantize model for faster CPU inference.

    Arguments are validated before the model is touched, so an invalid call
    leaves the caller's model unchanged. On a valid call the model is switched
    to eval mode and moved to the CPU in place before quantization.

    Args:
        model: PyTorch model
        calibration_data: Iterable of batches for static quantization
            calibration. Each batch is either the input itself or a
            tuple/list whose first element is the input
            (e.g. ``(images, targets)``).
        method: 'dynamic' (no calibration needed) or 'static'

    Returns:
        Quantized model

    Raises:
        ValueError: If ``method`` is unknown, if 'static' is requested without
            ``calibration_data``, or if ``calibration_data`` yields no batches.
    """
    # Fail fast: reject bad arguments before mutating the caller's model.
    if method not in ('dynamic', 'static'):
        raise ValueError(f"Unknown quantization method: {method}")
    if method == 'static' and calibration_data is None:
        raise ValueError("Static quantization requires calibration_data")

    model.eval().cpu()

    if method == 'dynamic':
        quantized = torch.quantization.quantize_dynamic(
            model,
            {nn.Linear, nn.Conv2d},
            dtype=torch.qint8,
        )
        print("Dynamic quantization complete")
        return quantized

    # Static post-training quantization: attach observers, calibrate, convert.
    model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
    prepared = torch.quantization.prepare(model, inplace=False)

    # Calibrate
    num_batches = 0
    with torch.no_grad():
        for batch in calibration_data:
            # Accept (images, ...) batches of any length as well as bare inputs.
            images = batch[0] if isinstance(batch, (tuple, list)) else batch
            if isinstance(images, torch.Tensor):
                # The model was moved to the CPU above; keep inputs with it.
                images = images.cpu()
            prepared(images)
            num_batches += 1

    if num_batches == 0:
        # Converting with observers that never saw data gives meaningless
        # quantization parameters, so refuse instead of returning that model.
        raise ValueError(
            "Static quantization requires non-empty calibration_data"
        )

    quantized = torch.quantization.convert(prepared, inplace=False)
    print("Static quantization complete")
    return quantized
54
+
55
+
56
def benchmark_deployment(onnx_path: str, input_size: int = 640,
                         num_runs: int = 100, warmup_runs: int = 20) -> dict:
    """
    Benchmark ONNX model inference speed.

    Feeds a random float32 tensor of shape ``(1, 3, input_size, input_size)``
    to the model's first input and times ``num_runs`` forward passes after
    ``warmup_runs`` untimed ones.

    Args:
        onnx_path: Path to the ONNX model file.
        input_size: Height and width of the square dummy input.
        num_runs: Number of timed inference runs (must be >= 1).
        warmup_runs: Number of untimed runs executed before timing starts.

    Returns:
        Dict with 'onnx_path', 'input_size', 'latency_p50_ms',
        'latency_p95_ms', 'fps' and 'provider', or ``{"error": ...}`` when
        onnxruntime is not installed.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    # Statistics over zero samples are undefined; reject before doing any work.
    if num_runs < 1:
        raise ValueError(f"num_runs must be >= 1, got {num_runs}")

    import time
    import numpy as np

    try:
        import onnxruntime as ort
    except ImportError:
        return {"error": "onnxruntime not installed"}

    # Request only providers this onnxruntime build actually offers, keeping
    # the CUDA-before-CPU preference order.
    preferred = ['CUDAExecutionProvider', 'CPUExecutionProvider']
    available = set(ort.get_available_providers())
    providers = [p for p in preferred if p in available] or preferred
    session = ort.InferenceSession(onnx_path, providers=providers)

    dummy = np.random.randn(1, 3, input_size, input_size).astype(np.float32)
    input_name = session.get_inputs()[0].name
    feed = {input_name: dummy}

    # Warmup
    for _ in range(max(warmup_runs, 0)):
        session.run(None, feed)

    # Benchmark
    latencies = []
    for _ in range(num_runs):
        t0 = time.perf_counter()
        session.run(None, feed)
        latencies.append((time.perf_counter() - t0) * 1000)

    latencies = np.array(latencies)

    # Cast to built-in floats so the result holds only plain Python values.
    return {
        'onnx_path': onnx_path,
        'input_size': input_size,
        'latency_p50_ms': float(np.percentile(latencies, 50)),
        'latency_p95_ms': float(np.percentile(latencies, 95)),
        'fps': float(1000 / np.mean(latencies)),
        'provider': session.get_providers()[0],
    }
98
+
99
+
100
+ # ──────────────────────── TensorRT Guide ────────────────────────
101
+
102
# Markdown-formatted TensorRT conversion notes for SCRFD ONNX exports, kept as
# a module-level string constant. Nothing in the visible part of this module
# reads it. Backslashes are doubled so that each shell line continuation ends
# up as a single "\" in the resulting string.
TENSORRT_GUIDE = """
# TensorRT Optimization Guide for SCRFD

## Prerequisites
pip install tensorrt # or use NVIDIA TensorRT container

## Convert ONNX to TensorRT Engine

### FP16 (recommended for production GPU deployment)
trtexec --onnx=scrfd_34g.onnx \\
--saveEngine=scrfd_34g_fp16.engine \\
--fp16 \\
--workspace=4096 \\
--minShapes=input:1x3x640x640 \\
--optShapes=input:1x3x640x640 \\
--maxShapes=input:8x3x640x640

### INT8 (fastest, requires calibration)
trtexec --onnx=scrfd_34g.onnx \\
--saveEngine=scrfd_34g_int8.engine \\
--int8 \\
--calib=calibration_cache.bin \\
--workspace=4096

### Dynamic batch size
trtexec --onnx=scrfd_34g.onnx \\
--saveEngine=scrfd_34g_dynamic.engine \\
--fp16 \\
--minShapes=input:1x3x640x640 \\
--optShapes=input:4x3x640x640 \\
--maxShapes=input:16x3x640x640

## Expected Speedups (V100)
| Model | PyTorch FP32 | ONNX Runtime | TensorRT FP16 | TensorRT INT8 |
|------------|-------------|-------------|----------------|---------------|
| SCRFD-34G | ~80 FPS | ~100 FPS | ~200 FPS | ~350 FPS |
| SCRFD-2.5G | ~400 FPS | ~500 FPS | ~800 FPS | ~1200 FPS |
| SCRFD-0.5G | ~1000 FPS | ~1200 FPS | ~2000 FPS | ~3000 FPS |

## INT8 Calibration
Use the calibration script:
python scripts/tensorrt_calibrate.py \\
--onnx scrfd_34g.onnx \\
--data-root data/wider_face \\
--num-images 500

## Deployment with Triton Inference Server
See configs/triton/ for model repository configuration.
"""