aitask1024 commited on
Commit
cf23a82
·
verified ·
1 Parent(s): 049b44d

Upload miner.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. miner.py +314 -0
miner.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import math
3
+
4
+ import cv2
5
+ import numpy as np
6
+ import onnxruntime as ort
7
+ from numpy import ndarray
8
+ from pydantic import BaseModel
9
+
10
+
11
class BoundingBox(BaseModel):
    """One axis-aligned detection box in original-image pixel coordinates."""

    x1: int  # left edge
    y1: int  # top edge
    x2: int  # right edge
    y2: int  # bottom edge
    cls_id: int  # class index (0 == person; see Miner.class_names)
    conf: float  # detection confidence score
18
+
19
+
20
class TVFrameResult(BaseModel):
    """Per-frame prediction payload: detected boxes plus keypoint placeholders."""

    frame_id: int  # frame index (offset + position within the batch)
    boxes: list[BoundingBox]  # person detections for this frame
    keypoints: list[tuple[int, int]]  # (x, y) pairs; filled with (0, 0) stubs here
24
+
25
+
26
+ class Miner:
27
+ def __init__(self, path_hf_repo: Path) -> None:
28
+ model_path = path_hf_repo / "weights.onnx"
29
+ self.class_names = ["person"]
30
+
31
+ sess_options = ort.SessionOptions()
32
+ sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
33
+
34
+ try:
35
+ self.session = ort.InferenceSession(
36
+ str(model_path),
37
+ sess_options=sess_options,
38
+ providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
39
+ )
40
+ except Exception:
41
+ self.session = ort.InferenceSession(
42
+ str(model_path),
43
+ sess_options=sess_options,
44
+ providers=["CPUExecutionProvider"],
45
+ )
46
+
47
+ self.input_name = self.session.get_inputs()[0].name
48
+ self.output_names = [o.name for o in self.session.get_outputs()]
49
+ self.input_shape = self.session.get_inputs()[0].shape
50
+ self.input_height = self._safe_dim(self.input_shape[2], 1280)
51
+ self.input_width = self._safe_dim(self.input_shape[3], 1280)
52
+
53
+ # Tuned for MAP50 (65%) + FALSE_POSITIVE (35%) scoring
54
+ # Lower conf = more recall = higher MAP50, but more FP
55
+ # Balance: slightly aggressive recall since MAP50 weight > FP weight
56
+ self.conf_thres = 0.40
57
+ self.conf_high = 0.55
58
+ self.iou_thres = 0.50
59
+ self.tta_match_iou = 0.45
60
+ self.max_det = 200
61
+ self.use_tta = True
62
+
63
+ # Box sanity filters
64
+ self.min_box_area = 12 * 12
65
+ self.min_w = 6
66
+ self.min_h = 6
67
+ self.max_aspect_ratio = 7.0
68
+ self.max_box_area_ratio = 0.85
69
+
70
+ print(f"Model loaded: {model_path}, providers={self.session.get_providers()}")
71
+
72
+ def __repr__(self) -> str:
73
+ return f"ONNXRuntime(providers={self.session.get_providers()})"
74
+
75
+ @staticmethod
76
+ def _safe_dim(value, default: int) -> int:
77
+ return value if isinstance(value, int) and value > 0 else default
78
+
79
+ def _letterbox(self, image: ndarray, new_shape: tuple[int, int],
80
+ color=(114, 114, 114)) -> tuple[ndarray, float, tuple[float, float]]:
81
+ h, w = image.shape[:2]
82
+ new_w, new_h = new_shape
83
+ ratio = min(new_w / w, new_h / h)
84
+ rw, rh = int(round(w * ratio)), int(round(h * ratio))
85
+ if (rw, rh) != (w, h):
86
+ interp = cv2.INTER_CUBIC if ratio > 1.0 else cv2.INTER_LINEAR
87
+ image = cv2.resize(image, (rw, rh), interpolation=interp)
88
+ dw, dh = (new_w - rw) / 2.0, (new_h - rh) / 2.0
89
+ padded = cv2.copyMakeBorder(
90
+ image, int(round(dh - 0.1)), int(round(dh + 0.1)),
91
+ int(round(dw - 0.1)), int(round(dw + 0.1)),
92
+ borderType=cv2.BORDER_CONSTANT, value=color)
93
+ return padded, ratio, (dw, dh)
94
+
95
+ def _preprocess(self, image: ndarray) -> tuple[np.ndarray, float, tuple[float, float], tuple[int, int]]:
96
+ orig_h, orig_w = image.shape[:2]
97
+ img, ratio, pad = self._letterbox(image, (self.input_width, self.input_height))
98
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
99
+ img = np.ascontiguousarray(np.transpose(img, (2, 0, 1))[None, ...], dtype=np.float32)
100
+ return img, ratio, pad, (orig_w, orig_h)
101
+
102
+ @staticmethod
103
+ def _clip_boxes(boxes: np.ndarray, size: tuple[int, int]) -> np.ndarray:
104
+ w, h = size
105
+ boxes[:, 0] = np.clip(boxes[:, 0], 0, w - 1)
106
+ boxes[:, 1] = np.clip(boxes[:, 1], 0, h - 1)
107
+ boxes[:, 2] = np.clip(boxes[:, 2], 0, w - 1)
108
+ boxes[:, 3] = np.clip(boxes[:, 3], 0, h - 1)
109
+ return boxes
110
+
111
+ @staticmethod
112
+ def _xywh_to_xyxy(b: np.ndarray) -> np.ndarray:
113
+ o = np.empty_like(b)
114
+ o[:, 0] = b[:, 0] - b[:, 2] / 2
115
+ o[:, 1] = b[:, 1] - b[:, 3] / 2
116
+ o[:, 2] = b[:, 0] + b[:, 2] / 2
117
+ o[:, 3] = b[:, 1] + b[:, 3] / 2
118
+ return o
119
+
120
+ @staticmethod
121
+ def _hard_nms(boxes: np.ndarray, scores: np.ndarray, iou_thresh: float) -> np.ndarray:
122
+ if len(boxes) == 0:
123
+ return np.array([], dtype=np.intp)
124
+ order = np.argsort(scores)[::-1]
125
+ keep = []
126
+ while len(order) > 0:
127
+ i = order[0]
128
+ keep.append(i)
129
+ if len(order) == 1:
130
+ break
131
+ rest = order[1:]
132
+ xx1 = np.maximum(boxes[i, 0], boxes[rest, 0])
133
+ yy1 = np.maximum(boxes[i, 1], boxes[rest, 1])
134
+ xx2 = np.minimum(boxes[i, 2], boxes[rest, 2])
135
+ yy2 = np.minimum(boxes[i, 3], boxes[rest, 3])
136
+ inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
137
+ area_i = max(0, (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1]))
138
+ area_r = np.maximum(0, boxes[rest, 2] - boxes[rest, 0]) * np.maximum(0, boxes[rest, 3] - boxes[rest, 1])
139
+ iou = inter / (area_i + area_r - inter + 1e-7)
140
+ order = rest[iou <= iou_thresh]
141
+ return np.array(keep, dtype=np.intp)
142
+
143
+ @staticmethod
144
+ def _box_iou_one_to_many(box: np.ndarray, boxes: np.ndarray) -> np.ndarray:
145
+ xx1 = np.maximum(box[0], boxes[:, 0])
146
+ yy1 = np.maximum(box[1], boxes[:, 1])
147
+ xx2 = np.minimum(box[2], boxes[:, 2])
148
+ yy2 = np.minimum(box[3], boxes[:, 3])
149
+ inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
150
+ a = max(0, (box[2] - box[0]) * (box[3] - box[1]))
151
+ b = np.maximum(0, boxes[:, 2] - boxes[:, 0]) * np.maximum(0, boxes[:, 3] - boxes[:, 1])
152
+ return inter / (a + b - inter + 1e-7)
153
+
154
+ def _filter_sane(self, boxes, scores, cls_ids, orig_size):
155
+ if len(boxes) == 0:
156
+ return boxes, scores, cls_ids
157
+ ow, oh = orig_size
158
+ area_img = float(ow * oh)
159
+ keep = []
160
+ for i, box in enumerate(boxes):
161
+ bw, bh = box[2] - box[0], box[3] - box[1]
162
+ if bw <= 0 or bh <= 0 or bw < self.min_w or bh < self.min_h:
163
+ continue
164
+ area = bw * bh
165
+ if area < self.min_box_area or area > self.max_box_area_ratio * area_img:
166
+ continue
167
+ if max(bw / max(bh, 1e-6), bh / max(bw, 1e-6)) > self.max_aspect_ratio:
168
+ continue
169
+ keep.append(i)
170
+ if not keep:
171
+ return np.empty((0, 4), dtype=np.float32), np.empty(0, dtype=np.float32), np.empty(0, dtype=np.int32)
172
+ k = np.array(keep, dtype=np.intp)
173
+ return boxes[k], scores[k], cls_ids[k]
174
+
175
+ def _decode_raw_yolo(self, preds, ratio, pad, orig_size):
176
+ if preds.ndim == 3 and preds.shape[0] == 1:
177
+ preds = preds[0]
178
+ if preds.shape[0] <= 16 and preds.shape[1] > preds.shape[0]:
179
+ preds = preds.T
180
+
181
+ boxes_xywh = preds[:, :4].astype(np.float32)
182
+ tail = preds[:, 4:]
183
+
184
+ if tail.shape[1] == 1:
185
+ scores = tail[:, 0]
186
+ cls_ids = np.zeros(len(scores), dtype=np.int32)
187
+ else:
188
+ cls_ids = np.argmax(tail, axis=1).astype(np.int32)
189
+ scores = tail[np.arange(len(tail)), cls_ids]
190
+
191
+ # person only (class 0)
192
+ mask = (cls_ids == 0) & (scores >= self.conf_thres)
193
+ boxes_xywh, scores, cls_ids = boxes_xywh[mask], scores[mask], cls_ids[mask]
194
+ if len(boxes_xywh) == 0:
195
+ return []
196
+
197
+ boxes = self._xywh_to_xyxy(boxes_xywh)
198
+ boxes[:, [0, 2]] -= pad[0]
199
+ boxes[:, [1, 3]] -= pad[1]
200
+ boxes /= ratio
201
+ boxes = self._clip_boxes(boxes, orig_size)
202
+ boxes, scores, cls_ids = self._filter_sane(boxes, scores, cls_ids, orig_size)
203
+ if len(boxes) == 0:
204
+ return []
205
+
206
+ keep = self._hard_nms(boxes, scores, self.iou_thres)[:self.max_det]
207
+ return [BoundingBox(
208
+ x1=int(math.floor(boxes[i, 0])), y1=int(math.floor(boxes[i, 1])),
209
+ x2=int(math.ceil(boxes[i, 2])), y2=int(math.ceil(boxes[i, 3])),
210
+ cls_id=0, conf=float(scores[i]))
211
+ for i in keep if boxes[i, 2] > boxes[i, 0] and boxes[i, 3] > boxes[i, 1]]
212
+
213
+ def _decode_final_dets(self, preds, ratio, pad, orig_size):
214
+ if preds.ndim == 3 and preds.shape[0] == 1:
215
+ preds = preds[0]
216
+ boxes = preds[:, :4].astype(np.float32)
217
+ scores = preds[:, 4].astype(np.float32)
218
+ cls_ids = preds[:, 5].astype(np.int32)
219
+
220
+ mask = (cls_ids == 0) & (scores >= self.conf_thres)
221
+ boxes, scores, cls_ids = boxes[mask], scores[mask], cls_ids[mask]
222
+ if len(boxes) == 0:
223
+ return []
224
+
225
+ boxes[:, [0, 2]] -= pad[0]
226
+ boxes[:, [1, 3]] -= pad[1]
227
+ boxes /= ratio
228
+ boxes = self._clip_boxes(boxes, orig_size)
229
+ boxes, scores, cls_ids = self._filter_sane(boxes, scores, cls_ids, orig_size)
230
+ if len(boxes) == 0:
231
+ return []
232
+
233
+ keep = self._hard_nms(boxes, scores, self.iou_thres)[:self.max_det]
234
+ return [BoundingBox(
235
+ x1=int(math.floor(boxes[i, 0])), y1=int(math.floor(boxes[i, 1])),
236
+ x2=int(math.ceil(boxes[i, 2])), y2=int(math.ceil(boxes[i, 3])),
237
+ cls_id=0, conf=float(scores[i]))
238
+ for i in keep if boxes[i, 2] > boxes[i, 0] and boxes[i, 3] > boxes[i, 1]]
239
+
240
+ def _postprocess(self, output, ratio, pad, orig_size):
241
+ if output.ndim == 2 and output.shape[1] >= 6:
242
+ return self._decode_final_dets(output, ratio, pad, orig_size)
243
+ if output.ndim == 3 and output.shape[0] == 1 and output.shape[2] >= 6:
244
+ return self._decode_final_dets(output, ratio, pad, orig_size)
245
+ return self._decode_raw_yolo(output, ratio, pad, orig_size)
246
+
247
+ def _predict_single(self, image: np.ndarray) -> list[BoundingBox]:
248
+ if image.dtype != np.uint8:
249
+ image = image.astype(np.uint8)
250
+ tensor, ratio, pad, orig_size = self._preprocess(image)
251
+ outputs = self.session.run(self.output_names, {self.input_name: tensor})
252
+ return self._postprocess(outputs[0], ratio, pad, orig_size)
253
+
254
+ def _merge_tta(self, boxes_orig, boxes_flip):
255
+ if not boxes_orig and not boxes_flip:
256
+ return []
257
+
258
+ co = np.array([[b.x1, b.y1, b.x2, b.y2] for b in boxes_orig], dtype=np.float32) if boxes_orig else np.empty((0, 4), dtype=np.float32)
259
+ so = np.array([b.conf for b in boxes_orig], dtype=np.float32) if boxes_orig else np.empty(0, dtype=np.float32)
260
+ cf = np.array([[b.x1, b.y1, b.x2, b.y2] for b in boxes_flip], dtype=np.float32) if boxes_flip else np.empty((0, 4), dtype=np.float32)
261
+ sf = np.array([b.conf for b in boxes_flip], dtype=np.float32) if boxes_flip else np.empty(0, dtype=np.float32)
262
+
263
+ acc_b, acc_s = [], []
264
+
265
+ for i in range(len(co)):
266
+ if so[i] >= self.conf_high:
267
+ acc_b.append(co[i]); acc_s.append(so[i])
268
+ elif len(cf) > 0:
269
+ ious = self._box_iou_one_to_many(co[i], cf)
270
+ j = int(np.argmax(ious))
271
+ if ious[j] >= self.tta_match_iou:
272
+ acc_b.append(co[i]); acc_s.append(max(so[i], sf[j]))
273
+
274
+ for i in range(len(cf)):
275
+ if sf[i] < self.conf_high:
276
+ continue
277
+ if len(co) == 0:
278
+ acc_b.append(cf[i]); acc_s.append(sf[i]); continue
279
+ if np.max(self._box_iou_one_to_many(cf[i], co)) < self.tta_match_iou:
280
+ acc_b.append(cf[i]); acc_s.append(sf[i])
281
+
282
+ if not acc_b:
283
+ return []
284
+
285
+ boxes = np.array(acc_b, dtype=np.float32)
286
+ scores = np.array(acc_s, dtype=np.float32)
287
+ keep = self._hard_nms(boxes, scores, self.iou_thres)[:self.max_det]
288
+
289
+ return [BoundingBox(
290
+ x1=int(math.floor(boxes[i, 0])), y1=int(math.floor(boxes[i, 1])),
291
+ x2=int(math.ceil(boxes[i, 2])), y2=int(math.ceil(boxes[i, 3])),
292
+ cls_id=0, conf=float(scores[i])) for i in keep]
293
+
294
+ def _predict_tta(self, image: np.ndarray) -> list[BoundingBox]:
295
+ boxes_orig = self._predict_single(image)
296
+ flipped = cv2.flip(image, 1)
297
+ boxes_flip_raw = self._predict_single(flipped)
298
+ w = image.shape[1]
299
+ boxes_flip = [BoundingBox(x1=w - b.x2, y1=b.y1, x2=w - b.x1, y2=b.y2,
300
+ cls_id=b.cls_id, conf=b.conf) for b in boxes_flip_raw]
301
+ return self._merge_tta(boxes_orig, boxes_flip)
302
+
303
+ def predict_batch(self, batch_images: list[ndarray], offset: int, n_keypoints: int) -> list[TVFrameResult]:
304
+ results = []
305
+ for i, image in enumerate(batch_images):
306
+ try:
307
+ boxes = self._predict_tta(image) if self.use_tta else self._predict_single(image)
308
+ except Exception as e:
309
+ print(f"Inference failed frame {offset + i}: {e}")
310
+ boxes = []
311
+ results.append(TVFrameResult(
312
+ frame_id=offset + i, boxes=boxes,
313
+ keypoints=[(0, 0) for _ in range(max(0, int(n_keypoints)))]))
314
+ return results